diff --git a/.asf.yaml b/.asf.yaml index 16cdf8bfed322..ae5e99cf230d8 100644 --- a/.asf.yaml +++ b/.asf.yaml @@ -31,3 +31,8 @@ github: merge: false squash: true rebase: true + +notifications: + pullrequests: reviews@spark.apache.org + issues: reviews@spark.apache.org + commits: commits@spark.apache.org diff --git a/.github/labeler.yml b/.github/labeler.yml index bd61902925e33..afaeeecda51a2 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -84,12 +84,12 @@ SPARK SHELL: - "repl/**/*" - "bin/spark-shell*" SQL: -#- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming.py", "!python/pyspark/sql/tests/test_streaming.py"] +#- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming/**/*", "!python/pyspark/sql/tests/streaming/test_streaming.py"] - "**/sql/**/*" - "common/unsafe/**/*" #- "!python/pyspark/sql/avro/**/*" - #- "!python/pyspark/sql/streaming.py" - #- "!python/pyspark/sql/tests/test_streaming.py" + #- "!python/pyspark/sql/streaming/**/*" + #- "!python/pyspark/sql/tests/streaming/test_streaming.py" - "bin/spark-sql*" - "bin/beeline*" - "sbin/*thriftserver*.sh" @@ -103,7 +103,7 @@ SQL: - "**/*schema.R" - "**/*types.R" AVRO: - - "external/avro/**/*" + - "connector/avro/**/*" - "python/pyspark/sql/avro/**/*" DSTREAM: - "streaming/**/*" @@ -123,13 +123,15 @@ MLLIB: - "python/pyspark/mllib/**/*" STRUCTURED STREAMING: - "**/sql/**/streaming/**/*" - - "external/kafka-0-10-sql/**/*" - - "python/pyspark/sql/streaming.py" - - "python/pyspark/sql/tests/test_streaming.py" + - "connector/kafka-0-10-sql/**/*" + - "python/pyspark/sql/streaming/**/*" + - "python/pyspark/sql/tests/streaming/test_streaming.py" - "**/*streaming.R" PYTHON: - "bin/pyspark*" - "**/python/**/*" +PANDAS API ON SPARK: + - "python/pyspark/pandas/**/*" R: - "**/r/**/*" - "**/R/**/*" @@ -149,4 +151,10 @@ WEB UI: - "**/*UI.scala" DEPLOY: - "sbin/**/*" - +CONNECT: + - "connector/connect/**/*" + - "**/sql/sparkconnect/**/*" + - "python/pyspark/sql/**/connect/**/*" +PROTOBUF: + - "connector/protobuf/**/*" + - "python/pyspark/sql/protobuf/**/*" diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml index 91e168210fb30..8671cff054bb8 100644 --- a/.github/workflows/benchmark.yml +++ b/.github/workflows/benchmark.yml @@ -30,6 +30,10 @@ on: description: 'JDK version: 8, 11 or 17' required: true default: '8' + scala: + description: 'Scala version: 2.12 or 2.13' + required: true + default: '2.12' failfast: description: 'Failfast: true or false' required: true @@ -50,11 +54,69 @@ jobs: steps: - name: Generate matrix id: set-matrix - run: echo "::set-output name=matrix::["`seq -s, 1 $SPARK_BENCHMARK_NUM_SPLITS`"]" + run: echo "matrix=["`seq -s, 1 $SPARK_BENCHMARK_NUM_SPLITS`"]" >> $GITHUB_OUTPUT + + # Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well + tpcds-1g-gen: + name: "Generate an input dataset for TPCDSQueryBenchmark with SF=1" + if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*') + runs-on: ubuntu-20.04 + env: + SPARK_LOCAL_IP: localhost + steps: + - name: Checkout Spark repository + uses: actions/checkout@v3 + # In order to get diff files + with: + fetch-depth: 0 + - name: Cache Scala, SBT and Maven + uses: actions/cache@v3 + with: + path: | + build/apache-maven-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 
'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v3 + with: + path: ~/.cache/coursier + key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + benchmark-coursier-${{ github.event.inputs.jdk }} + - name: Cache TPC-DS generated data + id: cache-tpcds-sf-1 + uses: actions/cache@v3 + with: + path: ./tpcds-sf-1 + key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} + - name: Checkout tpcds-kit repository + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + uses: actions/checkout@v3 + with: + repository: databricks/tpcds-kit + ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 + path: ./tpcds-kit + - name: Build tpcds-kit + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + run: cd tpcds-kit/tools && make OS=LINUX + - name: Install Java ${{ github.event.inputs.jdk }} + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: ${{ github.event.inputs.jdk }} + - name: Generate TPC-DS (SF=1) table data + if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' + run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" benchmark: - name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)" - needs: matrix-gen + name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, Scala ${{ github.event.inputs.scala }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)" + if: always() + needs: [matrix-gen, tpcds-1g-gen] # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04. runs-on: ubuntu-20.04 strategy: @@ -69,14 +131,15 @@ jobs: SPARK_LOCAL_IP: localhost # To prevent spark.test.home not being set. See more detail in SPARK-36007. 
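
A minimal sketch of the generate-and-consume pattern used by matrix-gen and the benchmark job above: the split list is written to $GITHUB_OUTPUT (replacing the deprecated ::set-output command) and picked up through needs. The benchmark job's matrix wiring sits outside the hunks shown here, so the fromJson expression below is an assumption and the values are illustrative.

    jobs:
      matrix-gen:
        runs-on: ubuntu-20.04
        outputs:
          matrix: ${{ steps.set-matrix.outputs.matrix }}
        steps:
          - name: Generate matrix
            id: set-matrix
            # "name=value" appended to $GITHUB_OUTPUT replaces `echo "::set-output name=matrix::..."`.
            run: echo "matrix=[1,2,3]" >> "$GITHUB_OUTPUT"
      benchmark:
        needs: matrix-gen
        runs-on: ubuntu-20.04
        strategy:
          matrix:
            split: ${{ fromJson(needs.matrix-gen.outputs.matrix) }}  # assumed wiring
        steps:
          - run: echo "running split ${{ matrix.split }}"
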
SPARK_HOME: ${{ github.workspace }} + SPARK_TPCDS_DATA: ${{ github.workspace }}/tpcds-sf-1 steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 # In order to get diff files with: fetch-depth: 0 - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -87,19 +150,28 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | benchmark-coursier-${{ github.event.inputs.jdk }} - name: Install Java ${{ github.event.inputs.jdk }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: ${{ github.event.inputs.jdk }} + - name: Cache TPC-DS generated data + if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*') + id: cache-tpcds-sf-1 + uses: actions/cache@v3 + with: + path: ./tpcds-sf-1 + key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} - name: Run benchmarks run: | - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl test:package + dev/change-scala-version.sh ${{ github.event.inputs.scala }} + ./build/sbt -Pscala-${{ github.event.inputs.scala }} -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package # Make less noisy cp conf/log4j2.properties.template conf/log4j2.properties sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties @@ -109,13 +181,15 @@ jobs: --jars "`find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -`" \ "`find . 
-name 'spark-core*-SNAPSHOT-tests.jar'`" \ "${{ github.event.inputs.class }}" + # Revert to default Scala version to clean up unnecessary git diff + dev/change-scala-version.sh 2.12 # To keep the directory structure and file permissions, tar them # See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files echo "Preparing the benchmark results:" - tar -cvf benchmark-results-${{ github.event.inputs.jdk }}.tar `git diff --name-only` `git ls-files --others --exclude-standard` + tar -cvf benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude-standard` - name: Upload benchmark results - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: benchmark-results-${{ github.event.inputs.jdk }}-${{ matrix.split }} - path: benchmark-results-${{ github.event.inputs.jdk }}.tar + name: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}-${{ matrix.split }} + path: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar diff --git a/.github/workflows/build_and_test.yml b/.github/workflows/build_and_test.yml index a392f940df99d..29a9a58de08a8 100644 --- a/.github/workflows/build_and_test.yml +++ b/.github/workflows/build_and_test.yml @@ -20,74 +20,35 @@ name: Build and test on: - push: - branches: - - '**' workflow_call: inputs: - ansi_enabled: + java: required: false - type: boolean - default: false - + type: string + default: 8 + branch: + description: Branch to run the build against + required: false + type: string + default: branch-3.4 + hadoop: + description: Hadoop version to run with. HADOOP_PROFILE environment variable should accept it. + required: false + type: string + default: hadoop3 + envs: + description: Additional environment variables to set when running the tests. Should be in JSON format. + required: false + type: string + default: '{}' + jobs: + description: >- + Jobs to run, and should be in JSON format. The values should be matched with the job's key defined + in this file, e.g., build. See precondition job below. 
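
With build_and_test.yml converted into a reusable workflow, the values previously computed per run now arrive as the inputs declared above; they replace the configure-jobs outputs removed just below. A trimmed, illustrative step (not part of the patch) showing how the jobs in this file consume them:

    - name: Show call parameters
      # `inputs` is filled from the caller's `with:` block, falling back to the defaults
      # under `on.workflow_call.inputs`; the JSON "envs" input expands into step env vars.
      env: ${{ fromJSON(inputs.envs) }}
      run: echo "JDK ${{ inputs.java }}, branch ${{ inputs.branch }}, profile ${{ inputs.hadoop }}"
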
+ required: false + type: string + default: '' jobs: - configure-jobs: - name: Configure jobs - runs-on: ubuntu-20.04 - outputs: - java: ${{ steps.set-outputs.outputs.java }} - branch: ${{ steps.set-outputs.outputs.branch }} - hadoop: ${{ steps.set-outputs.outputs.hadoop }} - type: ${{ steps.set-outputs.outputs.type }} - envs: ${{ steps.set-outputs.outputs.envs }} - steps: - - name: Configure branch and additional environment variables - id: set-outputs - run: | - if [ "${{ github.event.schedule }}" = "0 1 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{}' - echo '::set-output name=hadoop::hadoop2' - elif [ "${{ github.event.schedule }}" = "0 4 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}' - echo '::set-output name=hadoop::hadoop3' - elif [ "${{ github.event.schedule }}" = "0 7 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::branch-3.2' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SCALA_PROFILE": "scala2.13"}' - echo '::set-output name=hadoop::hadoop3.2' - elif [ "${{ github.event.schedule }}" = "0 10 * * *" ]; then - echo '::set-output name=java::8' - echo '::set-output name=branch::master' - echo '::set-output name=type::pyspark-coverage-scheduled' - echo '::set-output name=envs::{"PYSPARK_CODECOV": "true"}' - echo '::set-output name=hadoop::hadoop3' - elif [ "${{ github.event.schedule }}" = "0 13 * * *" ]; then - echo '::set-output name=java::11' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}' - echo '::set-output name=hadoop::hadoop3' - elif [ "${{ github.event.schedule }}" = "0 16 * * *" ]; then - echo '::set-output name=java::17' - echo '::set-output name=branch::master' - echo '::set-output name=type::scheduled' - echo '::set-output name=envs::{"SKIP_MIMA": "true", "SKIP_UNIDOC": "true"}' - echo '::set-output name=hadoop::hadoop3' - else - echo '::set-output name=java::8' - echo '::set-output name=branch::branch-3.3' # Default branch to run on. CHANGE here when a branch is cut out. 
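
The configure-jobs block being removed here dispatched on github.event.schedule cron strings inside a single workflow; the scheduled workflows added later in this patch (build_ansi.yml, build_branch32.yml, build_branch33.yml, build_coverage.yml, build_hadoop2.yml) each carry their own cron and simply call the reusable workflow. A condensed sketch of that caller shape, with illustrative values:

    name: "Build (example nightly)"
    on:
      schedule:
        - cron: '0 1 * * *'
    jobs:
      run-build:
        permissions:
          packages: write   # lets the called workflow's infra-image job push to ghcr.io
        if: github.repository == 'apache/spark'
        uses: ./.github/workflows/build_and_test.yml
        with:
          java: 8
          branch: master
          hadoop: hadoop3
          jobs: '{"build": "true", "pyspark": "true"}'
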
- echo '::set-output name=type::regular' - echo '::set-output name=envs::{"SPARK_ANSI_SQL_MODE": "${{ inputs.ansi_enabled }}"}' - echo '::set-output name=hadoop::hadoop3' - fi - precondition: name: Check changes runs-on: ubuntu-20.04 @@ -95,50 +56,86 @@ jobs: GITHUB_PREV_SHA: ${{ github.event.before }} outputs: required: ${{ steps.set-outputs.outputs.required }} + image_url: >- + ${{ + (inputs.branch == 'branch-3.4' && steps.infra-image-outputs.outputs.image_url) + || 'dongjoon/apache-spark-github-action-image:20220207' + }} steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Check all modules id: set-outputs run: | - build=`./dev/is-changed.py -m avro,build,catalyst,core,docker-integration-tests,examples,graphx,hadoop-cloud,hive,hive-thriftserver,kubernetes,kvstore,launcher,mesos,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,spark-ganglia-lgpl,sparkr,sql,sql-kafka-0-10,streaming,streaming-kafka-0-10,streaming-kinesis-asl,tags,unsafe,yarn` - pyspark=`./dev/is-changed.py -m avro,build,catalyst,core,graphx,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,pyspark-core,pyspark-ml,pyspark-mllib,pyspark-pandas,pyspark-pandas-slow,pyspark-resource,pyspark-sql,pyspark-streaming,repl,sketch,sql,tags,unsafe` - sparkr=`./dev/is-changed.py -m avro,build,catalyst,core,hive,kvstore,launcher,mllib,mllib-local,network-common,network-shuffle,repl,sketch,sparkr,sql,tags,unsafe` - tpcds=`./dev/is-changed.py -m build,catalyst,core,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe` - docker=`./dev/is-changed.py -m build,catalyst,core,docker-integration-tests,hive,kvstore,launcher,network-common,network-shuffle,repl,sketch,sql,tags,unsafe` - echo "{\"build\": \"$build\", \"pyspark\": \"$pyspark\", \"sparkr\": \"$sparkr\", \"tpcds\": \"$tpcds\", \"docker\": \"$docker\"}" > required.json - cat required.json - echo "::set-output name=required::$(cat required.json)" + if [ -z "${{ inputs.jobs }}" ]; then + # is-changed.py is missing in branch-3.2, and it might run in scheduled build, see also SPARK-39517 + pyspark=true; sparkr=true; tpcds=true; docker=true; + if [ -f "./dev/is-changed.py" ]; then + pyspark_modules=`cd dev && python -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` + pyspark=`./dev/is-changed.py -m $pyspark_modules` + sparkr=`./dev/is-changed.py -m sparkr` + tpcds=`./dev/is-changed.py -m sql` + docker=`./dev/is-changed.py -m docker-integration-tests` + fi + # 'build', 'scala-213', and 'java-11-17' are always true for now. 
+ # It does not save significant time and most of PRs trigger the build. + precondition=" + { + \"build\": \"true\", + \"pyspark\": \"$pyspark\", + \"sparkr\": \"$sparkr\", + \"tpcds-1g\": \"$tpcds\", + \"docker-integration-tests\": \"$docker\", + \"scala-213\": \"true\", + \"java-11-17\": \"true\", + \"lint\" : \"true\", + \"k8s-integration-tests\" : \"true\", + }" + echo $precondition # For debugging + # Remove `\n` to avoid "Invalid format" error + precondition="${precondition//$'\n'/}}" + echo "required=$precondition" >> $GITHUB_OUTPUT + else + # This is usually set by scheduled jobs. + precondition='${{ inputs.jobs }}' + echo $precondition # For debugging + precondition="${precondition//$'\n'/}" + echo "required=$precondition" >> $GITHUB_OUTPUT + fi + - name: Generate infra image URL + id: infra-image-outputs + run: | + # Convert to lowercase to meet Docker repo name requirement + REPO_OWNER=$(echo "${{ github.repository_owner }}" | tr '[:upper:]' '[:lower:]') + IMG_NAME="apache-spark-ci-image:${{ inputs.branch }}-${{ github.run_id }}" + IMG_URL="ghcr.io/$REPO_OWNER/$IMG_NAME" + echo "image_url=$IMG_URL" >> $GITHUB_OUTPUT # Build: build Spark and run the tests for specified modules. build: - name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }} ${{ matrix.comment }} (JDK ${{ matrix.java }}, ${{ matrix.hadoop }}, ${{ matrix.hive }})" - needs: [configure-jobs, precondition] - # Run scheduled jobs for Apache Spark only - # Run regular jobs for commit in both Apache Spark and forked repository - if: >- - (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled') - || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true') + name: "Build modules: ${{ matrix.modules }} ${{ matrix.comment }}" + needs: precondition + if: fromJson(needs.precondition.outputs.required).build == 'true' # Ubuntu 20.04 is the latest LTS. The next LTS is 22.04. runs-on: ubuntu-20.04 strategy: fail-fast: false matrix: java: - - ${{ needs.configure-jobs.outputs.java }} + - ${{ inputs.java }} hadoop: - - ${{ needs.configure-jobs.outputs.hadoop }} + - ${{ inputs.hadoop }} hive: - hive2.3 # TODO(SPARK-32246): We don't test 'streaming-kinesis-asl' for now. @@ -154,7 +151,8 @@ jobs: - >- streaming, sql-kafka-0-10, streaming-kafka-0-10, mllib-local, mllib, - yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl + yarn, mesos, kubernetes, hadoop-cloud, spark-ganglia-lgpl, + connect, protobuf # Here, we split Hive and SQL tests into some of slow ones and the rest of them. 
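
The precondition job above now emits one JSON object describing which downstream jobs are required, and each job gates itself on a single field of it via fromJson. A stripped-down producer/consumer sketch with illustrative field values:

    jobs:
      precondition:
        runs-on: ubuntu-20.04
        outputs:
          required: ${{ steps.set-outputs.outputs.required }}
        steps:
          - id: set-outputs
            # The JSON has to land on one line; embedded newlines would break the output record,
            # which is why the real script strips them before writing to $GITHUB_OUTPUT.
            run: echo 'required={"build": "true", "pyspark": "false"}' >> "$GITHUB_OUTPUT"
      build:
        needs: precondition
        if: fromJson(needs.precondition.outputs.required).build == 'true'
        runs-on: ubuntu-20.04
        steps:
          - run: echo "build requested"
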
included-tags: [""] excluded-tags: [""] @@ -162,27 +160,27 @@ jobs: include: # Hive tests - modules: hive - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} + java: ${{ inputs.java }} + hadoop: ${{ inputs.hadoop }} hive: hive2.3 included-tags: org.apache.spark.tags.SlowHiveTest comment: "- slow tests" - modules: hive - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} + java: ${{ inputs.java }} + hadoop: ${{ inputs.hadoop }} hive: hive2.3 excluded-tags: org.apache.spark.tags.SlowHiveTest comment: "- other tests" # SQL tests - modules: sql - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} + java: ${{ inputs.java }} + hadoop: ${{ inputs.hadoop }} hive: hive2.3 included-tags: org.apache.spark.tags.ExtendedSQLTest comment: "- slow tests" - modules: sql - java: ${{ needs.configure-jobs.outputs.java }} - hadoop: ${{ needs.configure-jobs.outputs.hadoop }} + java: ${{ inputs.java }} + hadoop: ${{ inputs.hadoop }} hive: hive2.3 excluded-tags: org.apache.spark.tags.ExtendedSQLTest comment: "- other tests" @@ -196,22 +194,22 @@ jobs: SPARK_LOCAL_IP: localhost steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark - ref: ${{ needs.configure-jobs.outputs.branch }} + ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -222,18 +220,19 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: ${{ matrix.java }}-${{ matrix.hadoop }}-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | ${{ matrix.java }}-${{ matrix.hadoop }}-coursier- - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: ${{ matrix.java }} - name: Install Python 3.8 - uses: actions/setup-python@v2 + uses: actions/setup-python@v4 # We should install one Python that is higher then 3+ for SQL and Yarn because: # - SQL component also has Python related tests, for example, IntegratedUDFTestUtils. # - Yarn has a Python specific test too, for example, YarnClusterSuite. @@ -244,11 +243,11 @@ jobs: - name: Install Python packages (Python 3.8) if: (contains(matrix.modules, 'sql') && !contains(matrix.modules, 'sql-')) run: | - python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy xmlrunner + python3.8 -m pip install 'numpy>=1.20.0' pyarrow pandas scipy unittest-xml-reporting 'grpcio==1.48.1' 'protobuf==3.19.5' python3.8 -m pip list # Run the tests. 
- name: Run tests - env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }} + env: ${{ fromJSON(inputs.envs) }} run: | # Hive "other tests" test needs larger metaspace size based on experiment. if [[ "$MODULES_TO_TEST" == "hive" ]] && [[ "$EXCLUDED_TAGS" == "org.apache.spark.tags.SlowHiveTest" ]]; then export METASPACE_SIZE=2g; fi @@ -256,35 +255,78 @@ jobs: ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" --included-tags "$INCLUDED_TAGS" --excluded-tags "$EXCLUDED_TAGS" - name: Upload test results to report if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: test-results-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: name: unit-tests-log-${{ matrix.modules }}-${{ matrix.comment }}-${{ matrix.java }}-${{ matrix.hadoop }}-${{ matrix.hive }} path: "**/target/unit-tests.log" - pyspark: - needs: [configure-jobs, precondition] - # Run PySpark coverage scheduled jobs for Apache Spark only - # Run scheduled jobs with JDK 17 in Apache Spark - # Run regular jobs for commit in both Apache Spark and forked repository + infra-image: + name: "Base image build" + needs: precondition + # Currently, only enable docker build from cache for `master` branch jobs if: >- - (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled') - || (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17') - || (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).pyspark == 'true') - name: "Build modules (${{ format('{0}, {1} job', needs.configure-jobs.outputs.branch, needs.configure-jobs.outputs.type) }}): ${{ matrix.modules }}" + (fromJson(needs.precondition.outputs.required).pyspark == 'true' || + fromJson(needs.precondition.outputs.required).lint == 'true' || + fromJson(needs.precondition.outputs.required).sparkr == 'true') && + inputs.branch == 'branch-3.4' + runs-on: ubuntu-latest + permissions: + packages: write + steps: + - name: Login to GitHub Container Registry + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Checkout Spark repository + uses: actions/checkout@v3 + # In order to fetch changed files + with: + fetch-depth: 0 + repository: apache/spark + ref: ${{ inputs.branch }} + - name: Sync the current branch with the latest in Apache Spark + if: github.repository != 'apache/spark' + run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Build and push + id: docker_build + uses: docker/build-push-action@v3 + with: + context: ./dev/infra/ + push: true + tags: | + ${{ needs.precondition.outputs.image_url }} + # Use the infra image cache to speed up + cache-from: 
type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ inputs.branch }} + + pyspark: + needs: [precondition, infra-image] + # always run if pyspark == 'true', even infra-image is skip (such as non-master job) + if: always() && fromJson(needs.precondition.outputs.required).pyspark == 'true' + name: "Build modules: ${{ matrix.modules }}" runs-on: ubuntu-20.04 container: - image: dongjoon/apache-spark-github-action-image:20220207 + image: ${{ needs.precondition.outputs.image_url }} strategy: fail-fast: false matrix: java: - - ${{ needs.configure-jobs.outputs.java }} + - ${{ inputs.java }} modules: - >- pyspark-sql, pyspark-mllib, pyspark-resource @@ -294,34 +336,38 @@ jobs: pyspark-pandas - >- pyspark-pandas-slow + - >- + pyspark-connect, pyspark-errors env: MODULES_TO_TEST: ${{ matrix.modules }} - HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }} + HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_UNIDOC: true SKIP_MIMA: true METASPACE_SIZE: 1g - SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }} steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} + - name: Add GITHUB_WORKSPACE to git trust safe.directory + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 2G limit. - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -332,15 +378,16 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: pyspark-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | pyspark-coursier- - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: ${{ matrix.java }} - name: List Python packages (Python 3.9, PyPy3) run: | @@ -352,12 +399,12 @@ jobs: bash miniconda.sh -b -p $HOME/miniconda # Run the tests. 
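
The pyspark job above and the sparkr and lint jobs that follow depend on infra-image but must still run when that job is skipped (it is conditioned on inputs.branch == 'branch-3.4'), hence the always() guard combined with the per-branch image_url output computed in precondition. A reduced sketch of the pattern, with an illustrative step body:

    pyspark:
      needs: [precondition, infra-image]
      # always() keeps the job eligible even when a needed job was skipped;
      # the fromJson check still decides whether it actually runs.
      if: always() && fromJson(needs.precondition.outputs.required).pyspark == 'true'
      runs-on: ubuntu-20.04
      container:
        image: ${{ needs.precondition.outputs.image_url }}   # ghcr.io build or the static fallback image
      steps:
        - run: python3 --version
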
- name: Run tests - env: ${{ fromJSON(needs.configure-jobs.outputs.envs) }} + env: ${{ fromJSON(inputs.envs) }} run: | export PATH=$PATH:$HOME/miniconda/bin ./dev/run-tests --parallelism 1 --modules "$MODULES_TO_TEST" - name: Upload coverage to Codecov - if: needs.configure-jobs.outputs.type == 'pyspark-coverage-scheduled' + if: fromJSON(inputs.envs).PYSPARK_CODECOV == 'true' uses: codecov/codecov-action@v2 with: files: ./python/coverage.xml @@ -365,51 +412,52 @@ jobs: name: PySpark - name: Upload test results to report if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: test-results-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: test-results-${{ matrix.modules }}--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: unit-tests-log-${{ matrix.modules }}--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: unit-tests-log-${{ matrix.modules }}--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/unit-tests.log" sparkr: - needs: [configure-jobs, precondition] - if: >- - (needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).sparkr == 'true') - || (github.repository == 'apache/spark' && needs.configure-jobs.outputs.type == 'scheduled' && needs.configure-jobs.outputs.java == '17') + needs: [precondition, infra-image] + # always run if sparkr == 'true', even infra-image is skip (such as non-master job) + if: always() && fromJson(needs.precondition.outputs.required).sparkr == 'true' name: "Build modules: sparkr" runs-on: ubuntu-20.04 container: - image: dongjoon/apache-spark-github-action-image:20220207 + image: ${{ needs.precondition.outputs.image_url }} env: - HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }} + HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost SKIP_MIMA: true - SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }} steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 # In order to fetch changed files with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} + - name: Add GITHUB_WORKSPACE to git trust safe.directory + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 2G limit. 
- name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -420,17 +468,19 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: sparkr-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | sparkr-coursier- - - name: Install Java ${{ needs.configure-jobs.outputs.java }} - uses: actions/setup-java@v1 + - name: Install Java ${{ inputs.java }} + uses: actions/setup-java@v3 with: - java-version: ${{ needs.configure-jobs.outputs.java }} + distribution: temurin + java-version: ${{ inputs.java }} - name: Run tests + env: ${{ fromJSON(inputs.envs) }} run: | # The followings are also used by `r-lib/actions/setup-r` to avoid # R issues at docker environment @@ -439,15 +489,16 @@ jobs: ./dev/run-tests --parallelism 1 --modules sparkr - name: Upload test results to report if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: test-results-sparkr--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: test-results-sparkr--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" # Static analysis, and documentation build lint: - needs: configure-jobs - if: needs.configure-jobs.outputs.type == 'regular' + needs: [precondition, infra-image] + # always run if lint == 'true', even infra-image is skip (such as non-master job) + if: always() && fromJson(needs.precondition.outputs.required).lint == 'true' name: Linters, licenses, dependencies and documentation generation runs-on: ubuntu-20.04 env: @@ -455,24 +506,29 @@ jobs: LANG: C.UTF-8 PYSPARK_DRIVER_PYTHON: python3.9 PYSPARK_PYTHON: python3.9 + GITHUB_PREV_SHA: ${{ github.event.before }} container: - image: dongjoon/apache-spark-github-action-image:20220207 + image: ${{ needs.precondition.outputs.image_url }} steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} + - name: Add GITHUB_WORKSPACE to git trust safe.directory + run: | + git config --global --add safe.directory ${GITHUB_WORKSPACE} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty # Cache local repositories. Note that GitHub Actions cache has a 2G limit. 
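
Every actions/cache step in these jobs follows the same key scheme: an exact key derived from hashFiles(...) plus a restore-keys prefix, so a close-but-stale cache is restored when the hash changes. Annotated sketch with an illustrative prefix:

    - name: Cache Coursier local repository
      uses: actions/cache@v3
      with:
        path: ~/.cache/coursier
        # Exact hit when the hashed files are unchanged; on a miss, the newest cache whose key
        # starts with the restore-keys prefix is restored, and a fresh cache is saved afterwards.
        key: example-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
        restore-keys: |
          example-coursier-
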
- name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -483,27 +539,60 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: docs-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | docs-coursier- - name: Cache Maven local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.m2/repository key: docs-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | docs-maven- + - name: Install Java 8 + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 8 + - name: License test + run: ./dev/check-license + - name: Dependencies test + run: ./dev/test-dependencies.sh + - name: Scala linter + run: ./dev/lint-scala + - name: Java linter + run: ./dev/lint-java + - name: Spark connect jvm client mima check + if: inputs.branch != 'branch-3.2' && inputs.branch != 'branch-3.3' + run: ./dev/connect-jvm-client-mima-check - name: Install Python linter dependencies run: | # TODO(SPARK-32407): Sphinx 3.1+ does not correctly index nested classes. # See also https://github.com/sphinx-doc/sphinx/issues/7551. # Jinja2 3.0.0+ causes error when building with Sphinx. # See also https://issues.apache.org/jira/browse/SPARK-35375. - python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==21.12b0' - python3.9 -m pip install 'pandas-stubs==1.2.0.53' + python3.9 -m pip install 'flake8==3.9.0' pydata_sphinx_theme 'mypy==0.920' 'pytest==7.1.3' 'pytest-mypy-plugins==1.9.3' numpydoc 'jinja2<3.0.0' 'black==22.6.0' + python3.9 -m pip install 'pandas-stubs==1.2.0.53' ipython 'grpcio==1.48.1' 'grpc-stubs==1.24.11' 'googleapis-common-protos-stubs==2.2.0' + - name: Python linter + run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python + - name: Install dependencies for Python code generation check + run: | + # See more in "Installation" https://docs.buf.build/installation#tarball + curl -LO https://github.com/bufbuild/buf/releases/download/v1.15.1/buf-Linux-x86_64.tar.gz + mkdir -p $HOME/buf + tar -xvzf buf-Linux-x86_64.tar.gz -C $HOME/buf --strip-components 1 + python3.9 -m pip install 'protobuf==3.19.5' 'mypy-protobuf==3.3.0' + - name: Python code generation check + run: if test -f ./dev/connect-check-protos.py; then PATH=$PATH:$HOME/buf/bin PYTHON_EXECUTABLE=python3.9 ./dev/connect-check-protos.py; fi + - name: Install JavaScript linter dependencies + run: | + apt update + apt-get install -y nodejs npm + - name: JS linter + run: ./dev/lint-js - name: Install R linter dependencies and SparkR run: | apt update @@ -513,10 +602,6 @@ jobs: Rscript -e "install.packages(c('devtools'), repos='https://cloud.r-project.org/')" Rscript -e "devtools::install_version('lintr', version='2.0.1', repos='https://cloud.r-project.org')" ./R/install-dev.sh - - name: Instll JavaScript linter dependencies - run: | - apt update - apt-get install -y nodejs npm - name: Install dependencies for documentation generation run: | # pandoc is required to generate PySpark APIs as well in nbsphinx. @@ -527,9 +612,9 @@ jobs: # See also https://issues.apache.org/jira/browse/SPARK-35375. # Pin the MarkupSafe to 2.0.1 to resolve the CI error. # See also https://issues.apache.org/jira/browse/SPARK-38279. 
- python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme ipython nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' + python3.9 -m pip install 'sphinx<3.1.0' mkdocs pydata_sphinx_theme nbsphinx numpydoc 'jinja2<3.0.0' 'markupsafe==2.0.1' 'pyzmq<24.0.0' python3.9 -m pip install ipython_genutils # See SPARK-38517 - python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' + python3.9 -m pip install sphinx_plotly_directive 'numpy>=1.20.0' pyarrow pandas 'plotly>=4.8' python3.9 -m pip install 'docutils<0.18.0' # See SPARK-39421 apt-get update -y apt-get install -y ruby ruby-dev @@ -539,32 +624,22 @@ jobs: gem install bundler cd docs bundle install - - name: Install Java 8 - uses: actions/setup-java@v1 - with: - java-version: 8 - - name: Scala linter - run: ./dev/lint-scala - - name: Java linter - run: ./dev/lint-java - - name: Python linter - run: PYTHON_EXECUTABLE=python3.9 ./dev/lint-python - name: R linter run: ./dev/lint-r - - name: JS linter - run: ./dev/lint-js - - name: License test - run: ./dev/check-license - - name: Dependencies test - run: ./dev/test-dependencies.sh - name: Run documentation build run: | + if [ -f "./dev/is-changed.py" ]; then + # Skip PySpark and SparkR docs while keeping Scala/Java/SQL docs + pyspark_modules=`cd dev && python3.9 -c "import sparktestsupport.modules as m; print(','.join(m.name for m in m.all_modules if m.name.startswith('pyspark')))"` + if [ `./dev/is-changed.py -m $pyspark_modules` = false ]; then export SKIP_PYTHONDOC=1; fi + if [ `./dev/is-changed.py -m sparkr` = false ]; then export SKIP_RDOC=1; fi + fi cd docs bundle exec jekyll build java-11-17: - needs: [configure-jobs, precondition] - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true' + needs: precondition + if: fromJson(needs.precondition.outputs.required).java-11-17 == 'true' name: Java ${{ matrix.java }} build with Maven strategy: fail-fast: false @@ -575,19 +650,19 @@ jobs: runs-on: ubuntu-20.04 steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -598,15 +673,16 @@ jobs: restore-keys: | build- - name: Cache Maven local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.m2/repository key: java${{ matrix.java }}-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | java${{ matrix.java }}-maven- - name: Install Java ${{ matrix.java }} - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: ${{ matrix.java }} - name: Build with Maven run: | @@ -618,25 +694,25 @@ jobs: rm -rf ~/.m2/repository/org/apache/spark scala-213: - needs: [configure-jobs, precondition] - if: 
needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).build == 'true' + needs: precondition + if: fromJson(needs.precondition.outputs.required).scala-213 == 'true' name: Scala 2.13 build with SBT runs-on: ubuntu-20.04 steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -647,44 +723,45 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: scala-213-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | scala-213-coursier- - name: Install Java 8 - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: 8 - name: Build with SBT run: | ./dev/change-scala-version.sh 2.13 - ./build/sbt -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile test:compile + ./build/sbt -Pyarn -Pmesos -Pkubernetes -Pvolcano -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pdocker-integration-tests -Pkubernetes-integration-tests -Pspark-ganglia-lgpl -Pscala-2.13 compile Test/compile + # Any TPC-DS related updates on this job need to be applied to tpcds-1g-gen job of benchmark.yml as well tpcds-1g: - needs: [configure-jobs, precondition] - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).tpcds == 'true' + needs: precondition + if: fromJson(needs.precondition.outputs.required).tpcds-1g == 'true' name: Run TPC-DS queries with SF=1 runs-on: ubuntu-20.04 env: SPARK_LOCAL_IP: localhost - SPARK_ANSI_SQL_MODE: ${{ inputs.ansi_enabled }} steps: - name: Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -695,25 +772,26 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: 
actions/cache@v3 with: path: ~/.cache/coursier key: tpcds-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | tpcds-coursier- - name: Install Java 8 - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: 8 - name: Cache TPC-DS generated data id: cache-tpcds-sf-1 - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ./tpcds-sf-1 key: tpcds-${{ hashFiles('.github/workflows/build_and_test.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }} - name: Checkout tpcds-kit repository if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: repository: databricks/tpcds-kit ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069 @@ -723,11 +801,12 @@ jobs: run: cd tpcds-kit/tools && make OS=LINUX - name: Generate TPC-DS (SF=1) table data if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true' - run: build/sbt "sql/test:runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" + run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite" - name: Run TPC-DS queries (Sort merge join) run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" env: + SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=-1 spark.sql.join.preferSortMergeJoin=true @@ -735,56 +814,58 @@ jobs: run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" env: + SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=10485760 - name: Run TPC-DS queries (Shuffled hash join) run: | SPARK_TPCDS_DATA=`pwd`/tpcds-sf-1 build/sbt "sql/testOnly org.apache.spark.sql.TPCDSQueryTestSuite" env: + SPARK_ANSI_SQL_MODE: ${{ fromJSON(inputs.envs).SPARK_ANSI_SQL_MODE }} SPARK_TPCDS_JOIN_CONF: | spark.sql.autoBroadcastJoinThreshold=-1 spark.sql.join.forceApplyShuffledHashJoin=true - name: Upload test results to report if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: test-results-tpcds--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: test-results-tpcds--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: unit-tests-log-tpcds--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: unit-tests-log-tpcds--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/unit-tests.log" docker-integration-tests: - needs: [configure-jobs, precondition] - if: needs.configure-jobs.outputs.type == 'regular' && fromJson(needs.precondition.outputs.required).docker == 'true' + needs: precondition + if: fromJson(needs.precondition.outputs.required).docker-integration-tests == 'true' name: Run Docker integration tests runs-on: ubuntu-20.04 env: - HADOOP_PROFILE: ${{ needs.configure-jobs.outputs.hadoop }} + HADOOP_PROFILE: ${{ inputs.hadoop }} HIVE_PROFILE: hive2.3 GITHUB_PREV_SHA: ${{ github.event.before }} SPARK_LOCAL_IP: localhost - ORACLE_DOCKER_IMAGE_NAME: gvenzl/oracle-xe:18.4.0 + ORACLE_DOCKER_IMAGE_NAME: gvenzl/oracle-xe:21.3.0 SKIP_MIMA: true steps: - name: 
Checkout Spark repository - uses: actions/checkout@v2 + uses: actions/checkout@v3 with: fetch-depth: 0 repository: apache/spark - ref: branch-3.3 + ref: ${{ inputs.branch }} - name: Sync the current branch with the latest in Apache Spark if: github.repository != 'apache/spark' run: | echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD - git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty - name: Cache Scala, SBT and Maven - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: | build/apache-maven-* @@ -795,28 +876,100 @@ jobs: restore-keys: | build- - name: Cache Coursier local repository - uses: actions/cache@v2 + uses: actions/cache@v3 with: path: ~/.cache/coursier key: docker-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} restore-keys: | docker-integration-coursier- - name: Install Java 8 - uses: actions/setup-java@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: 8 - name: Run tests run: | ./dev/run-tests --parallelism 1 --modules docker-integration-tests --included-tags org.apache.spark.tags.DockerTest - name: Upload test results to report if: always() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: test-results-docker-integration--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: test-results-docker-integration--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/test-reports/*.xml" - name: Upload unit tests log files if: failure() - uses: actions/upload-artifact@v2 + uses: actions/upload-artifact@v3 with: - name: unit-tests-log-docker-integration--8-${{ needs.configure-jobs.outputs.hadoop }}-hive2.3 + name: unit-tests-log-docker-integration--8-${{ inputs.hadoop }}-hive2.3 path: "**/target/unit-tests.log" + + k8s-integration-tests: + needs: precondition + if: fromJson(needs.precondition.outputs.required).k8s-integration-tests == 'true' + name: Run Spark on Kubernetes Integration test + runs-on: ubuntu-20.04 + steps: + - name: Checkout Spark repository + uses: actions/checkout@v3 + with: + fetch-depth: 0 + repository: apache/spark + ref: ${{ inputs.branch }} + - name: Sync the current branch with the latest in Apache Spark + if: github.repository != 'apache/spark' + run: | + echo "APACHE_SPARK_REF=$(git rev-parse HEAD)" >> $GITHUB_ENV + git fetch https://github.com/$GITHUB_REPOSITORY.git ${GITHUB_REF#refs/heads/} + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' merge --no-commit --progress --squash FETCH_HEAD + git -c user.name='Apache Spark Test Account' -c user.email='sparktestacc@gmail.com' commit -m "Merged commit" --allow-empty + - name: Cache Scala, SBT and Maven + uses: actions/cache@v3 + with: + path: | + build/apache-maven-* + build/scala-* + build/*.jar + ~/.sbt + key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }} + restore-keys: | + build- + - name: Cache Coursier local repository + uses: actions/cache@v3 + with: + path: ~/.cache/coursier + key: k8s-integration-coursier-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }} + restore-keys: | + 
k8s-integration-coursier- + - name: Install Java ${{ inputs.java }} + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: ${{ inputs.java }} + - name: start minikube + run: | + # See more in "Installation" https://minikube.sigs.k8s.io/docs/start/ + curl -LO https://storage.googleapis.com/minikube/releases/latest/minikube-linux-amd64 + sudo install minikube-linux-amd64 /usr/local/bin/minikube + # Github Action limit cpu:2, memory: 6947MB, limit to 2U6G for better resource statistic + minikube start --cpus 2 --memory 6144 + - name: Print K8S pods and nodes info + run: | + kubectl get pods -A + kubectl describe node + - name: Run Spark on K8S integration test (With driver cpu 0.5, executor cpu 0.2 limited) + run: | + # Prepare PV test + PVC_TMP_DIR=$(mktemp -d) + export PVC_TESTS_HOST_PATH=$PVC_TMP_DIR + export PVC_TESTS_VM_PATH=$PVC_TMP_DIR + minikube mount ${PVC_TESTS_HOST_PATH}:${PVC_TESTS_VM_PATH} --gid=0 --uid=185 & + kubectl create clusterrolebinding serviceaccounts-cluster-admin --clusterrole=cluster-admin --group=system:serviceaccounts || true + kubectl apply -f https://raw.githubusercontent.com/volcano-sh/volcano/v1.7.0/installer/volcano-development.yaml || true + eval $(minikube docker-env) + build/sbt -Psparkr -Pkubernetes -Pvolcano -Pkubernetes-integration-tests -Dspark.kubernetes.test.driverRequestCores=0.5 -Dspark.kubernetes.test.executorRequestCores=0.2 -Dspark.kubernetes.test.volcanoMaxConcurrencyJobNum=1 -Dtest.exclude.tags=local "kubernetes-integration-tests/test" + - name: Upload Spark on K8S integration tests log files + if: failure() + uses: actions/upload-artifact@v3 + with: + name: spark-on-kubernetes-it-log + path: "**/target/integration-tests.log" diff --git a/.github/workflows/build_and_test_ansi.yml b/.github/workflows/build_and_test_ansi.yml deleted file mode 100644 index 3b8e44ff80ec3..0000000000000 --- a/.github/workflows/build_and_test_ansi.yml +++ /dev/null @@ -1,34 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# - -name: "Build and test (ANSI)" - -on: - push: - branches: - - branch-3.3 - -jobs: - call-build-and-test: - name: Call main build - uses: ./.github/workflows/build_and_test.yml - if: github.repository == 'apache/spark' - with: - ansi_enabled: true - diff --git a/.github/workflows/build_ansi.yml b/.github/workflows/build_ansi.yml new file mode 100644 index 0000000000000..e67a9262fcd70 --- /dev/null +++ b/.github/workflows/build_ansi.yml @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / ANSI (master, Hadoop 3, JDK 8, Scala 2.12)" + +on: + schedule: + - cron: '0 1 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: master + hadoop: hadoop3 + envs: >- + { + "SPARK_ANSI_SQL_MODE": "true", + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true" + } diff --git a/.github/workflows/build_branch32.yml b/.github/workflows/build_branch32.yml new file mode 100644 index 0000000000000..723db45ca3755 --- /dev/null +++ b/.github/workflows/build_branch32.yml @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build (branch-3.2, Scala 2.13, Hadoop 3, JDK 8)" + +on: + schedule: + - cron: '0 4 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: branch-3.2 + hadoop: hadoop3.2 + envs: >- + { + "SCALA_PROFILE": "scala2.13" + } + # TODO(SPARK-39712): Reenable "sparkr": "true" + # TODO(SPARK-39685): Reenable "lint": "true" + # TODO(SPARK-39681): Reenable "pyspark": "true" + # TODO(SPARK-39682): Reenable "docker-integration-tests": "true" + jobs: >- + { + "build": "true", + "tpcds-1g": "true" + } diff --git a/.github/workflows/build_branch33.yml b/.github/workflows/build_branch33.yml new file mode 100644 index 0000000000000..7ceafceb7180d --- /dev/null +++ b/.github/workflows/build_branch33.yml @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build (branch-3.3, Scala 2.13, Hadoop 3, JDK 8)" + +on: + schedule: + - cron: '0 7 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: branch-3.3 + hadoop: hadoop3 + envs: >- + { + "SCALA_PROFILE": "scala2.13" + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true", + "lint" : "true" + } diff --git a/.github/workflows/build_coverage.yml b/.github/workflows/build_coverage.yml new file mode 100644 index 0000000000000..aa210f0031866 --- /dev/null +++ b/.github/workflows/build_coverage.yml @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / Coverage (master, Scala 2.12, Hadoop 3, JDK 8)" + +on: + schedule: + - cron: '0 10 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: master + hadoop: hadoop3 + envs: >- + { + "PYSPARK_CODECOV": "true" + } + jobs: >- + { + "pyspark": "true" + } diff --git a/.github/workflows/build_hadoop2.yml b/.github/workflows/build_hadoop2.yml new file mode 100644 index 0000000000000..9716d568be8e0 --- /dev/null +++ b/.github/workflows/build_hadoop2.yml @@ -0,0 +1,44 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build (master, Scala 2.12, Hadoop 2, JDK 8)" + +on: + schedule: + - cron: '0 13 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: master + hadoop: hadoop2 + # TODO(SPARK-39684): Reenable "docker-integration-tests": "true" + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true" + } diff --git a/.github/workflows/build_infra_images_cache.yml b/.github/workflows/build_infra_images_cache.yml new file mode 100644 index 0000000000000..b8aae945599de --- /dev/null +++ b/.github/workflows/build_infra_images_cache.yml @@ -0,0 +1,62 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: Build / Cache base image + +on: + # Run jobs when a commit is merged + push: + branches: + - 'master' + - 'branch-*' + paths: + - 'dev/infra/Dockerfile' + - '.github/workflows/build_infra_images_cache.yml' + # Create infra image when cutting down branches/tags + create: +jobs: + main: + if: github.repository == 'apache/spark' + runs-on: ubuntu-latest + permissions: + packages: write + steps: + - name: Checkout Spark repository + uses: actions/checkout@v3 + - name: Set up QEMU + uses: docker/setup-qemu-action@v2 + - name: Set up Docker Buildx + uses: docker/setup-buildx-action@v2 + - name: Login to DockerHub + uses: docker/login-action@v2 + with: + registry: ghcr.io + username: ${{ github.actor }} + password: ${{ secrets.GITHUB_TOKEN }} + - name: Build and push + id: docker_build + uses: docker/build-push-action@v3 + with: + context: ./dev/infra/ + push: true + tags: ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ github.ref_name }}-static + cache-from: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ github.ref_name }} + cache-to: type=registry,ref=ghcr.io/apache/spark/apache-spark-github-action-image-cache:${{ github.ref_name }},mode=max + - name: Image digest + run: echo ${{ steps.docker_build.outputs.digest }} diff --git a/.github/workflows/build_java11.yml b/.github/workflows/build_java11.yml new file mode 100644 index 0000000000000..bf7b2edb45ff3 --- /dev/null +++ b/.github/workflows/build_java11.yml @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build (master, Scala 2.12, Hadoop 3, JDK 11)" + +on: + schedule: + - cron: '0 16 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 11 + branch: master + hadoop: hadoop3 + envs: >- + { + "SKIP_MIMA": "true", + "SKIP_UNIDOC": "true" + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true" + } diff --git a/.github/workflows/build_java17.yml b/.github/workflows/build_java17.yml new file mode 100644 index 0000000000000..9465e5ea0e317 --- /dev/null +++ b/.github/workflows/build_java17.yml @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build (master, Scala 2.12, Hadoop 3, JDK 17)" + +on: + schedule: + - cron: '0 22 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 17 + branch: master + hadoop: hadoop3 + envs: >- + { + "SKIP_MIMA": "true", + "SKIP_UNIDOC": "true" + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true" + } diff --git a/.github/workflows/build_main.yml b/.github/workflows/build_main.yml new file mode 100644 index 0000000000000..1ac6c87b7d041 --- /dev/null +++ b/.github/workflows/build_main.yml @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+# + +name: "Build" + +on: + push: + branches: + - '**' + +jobs: + call-build-and-test: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml diff --git a/.github/workflows/build_rockdb_as_ui_backend.yml b/.github/workflows/build_rockdb_as_ui_backend.yml new file mode 100644 index 0000000000000..04e0e7c2e1073 --- /dev/null +++ b/.github/workflows/build_rockdb_as_ui_backend.yml @@ -0,0 +1,48 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# + +name: "Build / RocksDB as UI Backend (master, Hadoop 3, JDK 8, Scala 2.12)" + +on: + schedule: + - cron: '0 6 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: master + hadoop: hadoop3 + envs: >- + { + "LIVE_UI_LOCAL_STORE_DIR": "/tmp/kvStore", + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true" + } diff --git a/.github/workflows/build_scala213.yml b/.github/workflows/build_scala213.yml new file mode 100644 index 0000000000000..cae0981ee1e8a --- /dev/null +++ b/.github/workflows/build_scala213.yml @@ -0,0 +1,49 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+# + +name: "Build (master, Scala 2.13, Hadoop 3, JDK 8)" + +on: + schedule: + - cron: '0 19 * * *' + +jobs: + run-build: + permissions: + packages: write + name: Run + uses: ./.github/workflows/build_and_test.yml + if: github.repository == 'apache/spark' + with: + java: 8 + branch: master + hadoop: hadoop3 + envs: >- + { + "SCALA_PROFILE": "scala2.13" + } + jobs: >- + { + "build": "true", + "pyspark": "true", + "sparkr": "true", + "tpcds-1g": "true", + "docker-integration-tests": "true", + "lint" : "true" + } diff --git a/.github/workflows/cancel_duplicate_workflow_runs.yml b/.github/workflows/cancel_duplicate_workflow_runs.yml index 525c7e7972c2a..d41ca31190d94 100644 --- a/.github/workflows/cancel_duplicate_workflow_runs.yml +++ b/.github/workflows/cancel_duplicate_workflow_runs.yml @@ -21,7 +21,7 @@ name: Cancelling Duplicates on: workflow_run: workflows: - - 'Build and test' + - 'Build' types: ['requested'] jobs: diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml index 88d17bf34d504..c6b6e65bc9fec 100644 --- a/.github/workflows/labeler.yml +++ b/.github/workflows/labeler.yml @@ -30,6 +30,9 @@ jobs: label: name: Label pull requests runs-on: ubuntu-latest + permissions: + contents: read + pull-requests: write steps: # In order to get back the negated matches like in the old config, # we need the actinons/labeler concept of `all` and `any` which matches @@ -44,7 +47,7 @@ jobs: # # However, these are not in a published release and the current `main` branch # has some issues upon testing. - - uses: actions/labeler@5f867a63be70efff62b767459b009290364495eb # pin@2.2.0 + - uses: actions/labeler@v4 with: repo-token: "${{ secrets.GITHUB_TOKEN }}" sync-labels: true diff --git a/.github/workflows/notify_test_workflow.yml b/.github/workflows/notify_test_workflow.yml index eb0da84a797c3..6fb776d708346 100644 --- a/.github/workflows/notify_test_workflow.yml +++ b/.github/workflows/notify_test_workflow.yml @@ -31,9 +31,12 @@ jobs: notify: name: Notify test workflow runs-on: ubuntu-20.04 + permissions: + actions: read + checks: write steps: - name: "Notify test workflow" - uses: actions/github-script@f05a81df23035049204b043b50c3322045ce7eb3 # pin@v3 + uses: actions/github-script@v6 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -46,7 +49,7 @@ jobs: const params = { owner: context.payload.pull_request.head.repo.owner.login, repo: context.payload.pull_request.head.repo.name, - id: 'build_and_test.yml', + id: 'build_main.yml', branch: context.payload.pull_request.head.ref, } const check_run_params = { @@ -69,7 +72,7 @@ jobs: // Assume that runs were not found. } - const name = 'Build and test' + const name = 'Build' const head_sha = context.payload.pull_request.head.sha let status = 'queued' @@ -77,7 +80,7 @@ jobs: status = 'completed' const conclusion = 'action_required' - github.checks.create({ + github.rest.checks.create({ owner: context.repo.owner, repo: context.repo.repo, name: name, @@ -113,7 +116,7 @@ jobs: // Here we get check run ID to provide Check run view instead of Actions view, see also SPARK-37879. const check_runs = await github.request(check_run_endpoint, check_run_params) - const check_run_head = check_runs.data.check_runs.filter(r => r.name === "Configure jobs")[0] + const check_run_head = check_runs.data.check_runs.filter(r => r.name === "Run / Check changes")[0] if (check_run_head.head_sha != context.payload.pull_request.head.sha) { throw new Error('There was a new unsynced commit pushed. 
Please retrigger the workflow.'); @@ -129,7 +132,7 @@ jobs: + '/actions/runs/' + run_id - github.checks.create({ + github.rest.checks.create({ owner: context.repo.owner, repo: context.repo.repo, name: name, diff --git a/.github/workflows/publish_snapshot.yml b/.github/workflows/publish_snapshot.yml index bd75e26108658..f0a8ad5ef6a72 100644 --- a/.github/workflows/publish_snapshot.yml +++ b/.github/workflows/publish_snapshot.yml @@ -32,23 +32,24 @@ jobs: matrix: branch: - master + - branch-3.3 - branch-3.2 - - branch-3.1 steps: - name: Checkout Spark repository - uses: actions/checkout@61b9e3751b92087fd0b06925ba6dd6314e06f089 # pin@master + uses: actions/checkout@v3 with: ref: ${{ matrix.branch }} - name: Cache Maven local repository - uses: actions/cache@c64c572235d810460d0d6876e9c705ad5002b353 # pin@v2 + uses: actions/cache@v3 with: path: ~/.m2/repository key: snapshot-maven-${{ hashFiles('**/pom.xml') }} restore-keys: | snapshot-maven- - name: Install Java 8 - uses: actions/setup-java@d202f5dbf7256730fb690ec59f6381650114feb2 # pin@v1 + uses: actions/setup-java@v3 with: + distribution: temurin java-version: 8 - name: Publish snapshot env: diff --git a/.github/workflows/test_report.yml b/.github/workflows/test_report.yml index a3f09c06ed989..c6225e6a1abe5 100644 --- a/.github/workflows/test_report.yml +++ b/.github/workflows/test_report.yml @@ -20,12 +20,13 @@ name: Report test results on: workflow_run: - workflows: ["Build and test", "Build and test (ANSI)"] + workflows: ["Build"] types: - completed jobs: test_report: + if: github.event.workflow_run.conclusion != 'skipped' runs-on: ubuntu-latest steps: - name: Download test results to report diff --git a/.github/workflows/update_build_status.yml b/.github/workflows/update_build_status.yml index 671487adbfe05..05cf4914a25ca 100644 --- a/.github/workflows/update_build_status.yml +++ b/.github/workflows/update_build_status.yml @@ -27,9 +27,12 @@ jobs: update: name: Update build status runs-on: ubuntu-20.04 + permissions: + actions: read + checks: write steps: - name: "Update build status" - uses: actions/github-script@f05a81df23035049204b043b50c3322045ce7eb3 # pin@v3 + uses: actions/github-script@v6 with: github-token: ${{ secrets.GITHUB_TOKEN }} script: | @@ -58,7 +61,7 @@ jobs: // Iterator GitHub Checks in the PR for await (const cr of checkRuns.data.check_runs) { - if (cr.name == 'Build and test' && cr.conclusion != "action_required") { + if (cr.name == 'Build' && cr.conclusion != "action_required") { // text contains parameters to make request in JSON. const params = JSON.parse(cr.output.text) diff --git a/.gitignore b/.gitignore index 0e2f59f43f83d..11141961bf805 100644 --- a/.gitignore +++ b/.gitignore @@ -18,10 +18,7 @@ .ensime_cache/ .ensime_lucene .generated-mima* -# All the files under .idea/ are ignore. To add new files under ./idea that are not in the VCS yet, please use `git add -f` .idea/ -# SPARK-35223: Add IssueNavigationLink to make IDEA support hyperlink on JIRA Ticket and GitHub PR on Git plugin. 
-!.idea/vcs.xml .idea_modules/ .metals .project @@ -77,6 +74,7 @@ python/coverage.xml python/deps python/docs/_site/ python/docs/source/reference/**/api/ +python/docs/source/user_guide/pandas_on_spark/supported_pandas_api.rst python/test_coverage/coverage_data python/test_coverage/htmlcov python/pyspark/python diff --git a/.idea/vcs.xml b/.idea/vcs.xml deleted file mode 100644 index 28fd3fcdf10ea..0000000000000 --- a/.idea/vcs.xml +++ /dev/null @@ -1,36 +0,0 @@ - - - - - - - - - - diff --git a/LICENSE b/LICENSE index df6bed16f4471..012fdbca4c90d 100644 --- a/LICENSE +++ b/LICENSE @@ -216,7 +216,7 @@ core/src/main/resources/org/apache/spark/ui/static/bootstrap* core/src/main/resources/org/apache/spark/ui/static/jsonFormatter* core/src/main/resources/org/apache/spark/ui/static/vis* docs/js/vendor/bootstrap.js -external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java +connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java Python Software Foundation License diff --git a/LICENSE-binary b/LICENSE-binary index 40e2e389b2264..9472d28e509ac 100644 --- a/LICENSE-binary +++ b/LICENSE-binary @@ -382,6 +382,10 @@ org.eclipse.jetty:jetty-servlets org.eclipse.jetty:jetty-util org.eclipse.jetty:jetty-webapp org.eclipse.jetty:jetty-xml +org.scala-lang:scala-compiler +org.scala-lang:scala-library +org.scala-lang:scala-reflect +org.scala-lang.modules:scala-parser-combinators_2.12 org.scala-lang.modules:scala-xml_2.12 com.github.joshelser:dropwizard-metrics-hadoop-metrics2-reporter com.zaxxer.HikariCP @@ -404,6 +408,7 @@ org.datanucleus:javax.jdo com.tdunning:json org.apache.velocity:velocity org.apache.yetus:audience-annotations +com.google.cloud.bigdataoss:gcs-connector core/src/main/java/org/apache/spark/util/collection/TimSort.java core/src/main/resources/org/apache/spark/ui/static/bootstrap* @@ -426,7 +431,6 @@ javolution:javolution com.esotericsoftware:kryo-shaded com.esotericsoftware:minlog com.esotericsoftware:reflectasm -com.google.protobuf:protobuf-java org.codehaus.janino:commons-compiler org.codehaus.janino:janino jline:jline @@ -438,6 +442,7 @@ pl.edu.icm:JLargeArrays BSD 3-Clause ------------ +com.google.protobuf:protobuf-java dk.brics.automaton:automaton org.antlr:antlr-runtime org.antlr:ST4 @@ -445,10 +450,6 @@ org.antlr:stringtemplate org.antlr:antlr4-runtime antlr:antlr com.thoughtworks.paranamer:paranamer -org.scala-lang:scala-compiler -org.scala-lang:scala-library -org.scala-lang:scala-reflect -org.scala-lang.modules:scala-parser-combinators_2.12 org.fusesource.leveldbjni:leveldbjni-all net.sourceforge.f2j:arpack_combined_all xmlenc:xmlenc diff --git a/R/check-cran.sh b/R/check-cran.sh index 22c8f423cfd12..4123361f5e285 100755 --- a/R/check-cran.sh +++ b/R/check-cran.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/R/create-docs.sh b/R/create-docs.sh index 4867fd99e647c..3deaefd0659dc 100755 --- a/R/create-docs.sh +++ b/R/create-docs.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/R/create-rd.sh b/R/create-rd.sh index 72a932c175c95..1f0527458f2f0 100755 --- a/R/create-rd.sh +++ b/R/create-rd.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/R/find-r.sh b/R/find-r.sh index 690acc083af91..f1a5026911a7f 100755 --- a/R/find-r.sh +++ b/R/find-r.sh @@ -1,4 +1,4 @@ 
-#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/R/install-dev.sh b/R/install-dev.sh index 9fbc999f2e805..7df21c6c5ec9a 100755 --- a/R/install-dev.sh +++ b/R/install-dev.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/R/install-source-package.sh b/R/install-source-package.sh index 8de3569d1d482..0a2a5fe00f31f 100755 --- a/R/install-source-package.sh +++ b/R/install-source-package.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/R/pkg/DESCRIPTION b/R/pkg/DESCRIPTION index 0e449e841cf6d..fa7028630a899 100644 --- a/R/pkg/DESCRIPTION +++ b/R/pkg/DESCRIPTION @@ -1,6 +1,6 @@ Package: SparkR Type: Package -Version: 3.3.1 +Version: 3.4.1 Title: R Front End for 'Apache Spark' Description: Provides an R Front end for 'Apache Spark' . Authors@R: diff --git a/R/pkg/NAMESPACE b/R/pkg/NAMESPACE index 6e0557cff88ce..bb05e99a9d8a6 100644 --- a/R/pkg/NAMESPACE +++ b/R/pkg/NAMESPACE @@ -143,6 +143,7 @@ exportMethods("arrange", "join", "limit", "localCheckpoint", + "melt", "merge", "mutate", "na.omit", @@ -182,6 +183,7 @@ exportMethods("arrange", "unionByName", "unique", "unpersist", + "unpivot", "where", "with", "withColumn", @@ -474,9 +476,16 @@ export("as.DataFrame", "createDataFrame", "createExternalTable", "createTable", + "currentCatalog", "currentDatabase", + "databaseExists", "dropTempTable", "dropTempView", + "functionExists", + "getDatabase", + "getFunc", + "getTable", + "listCatalogs", "listColumns", "listDatabases", "listFunctions", @@ -493,6 +502,7 @@ export("as.DataFrame", "refreshByPath", "refreshTable", "setCheckpointDir", + "setCurrentCatalog", "setCurrentDatabase", "spark.lapply", "spark.addFile", @@ -500,6 +510,7 @@ export("as.DataFrame", "spark.getSparkFiles", "sql", "str", + "tableExists", "tableToDF", "tableNames", "tables", diff --git a/R/pkg/R/DataFrame.R b/R/pkg/R/DataFrame.R index e143cbd8256f9..3f9bc9cb6d053 100644 --- a/R/pkg/R/DataFrame.R +++ b/R/pkg/R/DataFrame.R @@ -3366,7 +3366,7 @@ setMethod("na.omit", setMethod("fillna", signature(x = "SparkDataFrame"), function(x, value, cols = NULL) { - if (!(class(value) %in% c("integer", "numeric", "character", "list"))) { + if (!(inherits(value, c("integer", "numeric", "character", "list")))) { stop("value should be an integer, numeric, character or named list.") } @@ -3378,7 +3378,7 @@ setMethod("fillna", } # Check each item in the named list is of valid type lapply(value, function(v) { - if (!(class(v) %in% c("integer", "numeric", "character"))) { + if (!(inherits(v, c("integer", "numeric", "character")))) { stop("Each item in value should be an integer, numeric or character.") } }) @@ -3577,41 +3577,56 @@ setMethod("str", #' This is a no-op if schema doesn't contain column name(s). #' #' @param x a SparkDataFrame. -#' @param col a character vector of column names or a Column. -#' @param ... further arguments to be passed to or from other methods. -#' @return A SparkDataFrame. +#' @param col a list of columns or single Column or name. +#' @param ... additional column(s) if only one column is specified in \code{col}. +#' If more than one column is assigned in \code{col}, \code{...} +#' should be left empty. +#' @return A new SparkDataFrame with selected columns. 
#' #' @family SparkDataFrame functions #' @rdname drop #' @name drop -#' @aliases drop,SparkDataFrame-method +#' @aliases drop,SparkDataFrame,characterOrColumn-method #' @examples -#'\dontrun{ +#' \dontrun{ #' sparkR.session() #' path <- "path/to/file.json" #' df <- read.json(path) #' drop(df, "col1") #' drop(df, c("col1", "col2")) #' drop(df, df$col1) +#' drop(df, "col1", "col2") +#' drop(df, df$name, df$age) #' } -#' @note drop since 2.0.0 +#' @note drop(SparkDataFrame, characterOrColumn, ...) since 3.4.0 setMethod("drop", - signature(x = "SparkDataFrame"), - function(x, col) { - stopifnot(class(col) == "character" || class(col) == "Column") - - if (class(col) == "Column") { - sdf <- callJMethod(x@sdf, "drop", col@jc) + signature(x = "SparkDataFrame", col = "characterOrColumn"), + function(x, col, ...) { + if (class(col) == "character" && length(col) > 1) { + if (length(list(...)) > 0) { + stop("To drop multiple columns, use a character vector or ... for character/Column") + } + cols <- as.list(col) } else { - sdf <- callJMethod(x@sdf, "drop", as.list(col)) + cols <- list(col, ...) } + + cols <- lapply(cols, function(c) { + if (class(c) == "Column") { + c@jc + } else { + col(c)@jc + } + }) + + sdf <- callJMethod(x@sdf, "drop", cols[[1]], cols[-1]) dataFrame(sdf) }) # Expose base::drop #' @name drop #' @rdname drop -#' @aliases drop,ANY-method +#' @aliases drop,ANY,ANY-method setMethod("drop", signature(x = "ANY"), function(x) { @@ -4238,3 +4253,76 @@ setMethod("withWatermark", sdf <- callJMethod(x@sdf, "withWatermark", eventTime, delayThreshold) dataFrame(sdf) }) + +#' Unpivot a DataFrame from wide format to long format. +#' +#' This is the reverse to \code{groupBy(...).pivot(...).agg(...)}, +#' except for the aggregation, which cannot be reversed. +#' +#' @param x a SparkDataFrame. +#' @param ids a character vector or a list of columns +#' @param values a character vector, a list of columns or \code{NULL}. +#' If not NULL must not be empty. If \code{NULL}, uses all columns that +#' are not set as \code{ids}. +#' @param variableColumnName character Name of the variable column. +#' @param valueColumnName character Name of the value column. +#' @return a SparkDataFrame. 
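Editor's sketch (not part of the patch): rough usage of the widened drop() signature above; the SparkR session, data frame, and column names are illustrative assumptions.
  df <- createDataFrame(data.frame(name = c("a", "b"), age = c(30L, 40L), age2 = c(1L, 2L)))
  columns(drop(df, "age"))             # single name             -> "name" "age2"
  columns(drop(df, c("age", "age2")))  # character vector        -> "name"
  columns(drop(df, df$age, df$age2))   # several Column objects  -> "name"
  columns(drop(df, df$age, "age2"))    # names and Columns mixed -> "name"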
+#' @aliases unpivot,SparkDataFrame,ANY,ANY,character,character-method +#' @family SparkDataFrame functions +#' @rdname unpivot +#' @name unpivot +#' @examples +#' \dontrun{ +#' df <- createDataFrame(data.frame( +#' id = 1:3, x = c(1, 3, 5), y = c(2, 4, 6), z = c(-1, 0, 1) +#' )) +#' +#' head(unpivot(df, "id", c("x", "y"), "var", "val")) +#' +#' head(unpivot(df, "id", NULL, "var", "val")) +#' } +#' @note unpivot since 3.4.0 +setMethod("unpivot", + signature( + x = "SparkDataFrame", ids = "ANY", values = "ANY", + variableColumnName = "character", valueColumnName = "character" + ), + function(x, ids, values, variableColumnName, valueColumnName) { + as_jcols <- function(xs) lapply( + xs, + function(x) { + if (is.character(x)) { + column(x)@jc + } else { + c@jc + } + } + ) + + sdf <- if (is.null(values)) { + callJMethod( + x@sdf, "unpivotWithSeq", as_jcols(ids), variableColumnName, valueColumnName + ) + } else { + callJMethod( + x@sdf, "unpivotWithSeq", + as_jcols(ids), as_jcols(values), + variableColumnName, valueColumnName + ) + } + dataFrame(sdf) + }) + +#' @rdname unpivot +#' @name melt +#' @aliases melt,SparkDataFrame,ANY,ANY,character,character-method +#' @note melt since 3.4.0 +setMethod("melt", + signature( + x = "SparkDataFrame", ids = "ANY", values = "ANY", + variableColumnName = "character", valueColumnName = "character" + ), + function(x, ids, values, variableColumnName, valueColumnName) { + unpivot(x, ids, values, variableColumnName, valueColumnName) + } +) diff --git a/R/pkg/R/WindowSpec.R b/R/pkg/R/WindowSpec.R index be47d0117ed7f..5c1de0beac3ca 100644 --- a/R/pkg/R/WindowSpec.R +++ b/R/pkg/R/WindowSpec.R @@ -135,7 +135,7 @@ setMethod("orderBy", #' An offset indicates the number of rows above or below the current row, the frame for the #' current row starts or ends. For instance, given a row based sliding frame with a lower bound #' offset of -1 and a upper bound offset of +2. The frame for row with index 5 would range from -#' index 4 to index 6. +#' index 4 to index 7. #' #' @param x a WindowSpec #' @param start boundary start, inclusive. diff --git a/R/pkg/R/catalog.R b/R/pkg/R/catalog.R index 275737f804bde..942af4de3c0bb 100644 --- a/R/pkg/R/catalog.R +++ b/R/pkg/R/catalog.R @@ -17,6 +17,66 @@ # catalog.R: SparkSession catalog functions +#' Returns the current default catalog +#' +#' Returns the current default catalog. +#' +#' @return name of the current default catalog. +#' @rdname currentCatalog +#' @name currentCatalog +#' @examples +#' \dontrun{ +#' sparkR.session() +#' currentCatalog() +#' } +#' @note since 3.4.0 +currentCatalog <- function() { + sparkSession <- getSparkSession() + catalog <- callJMethod(sparkSession, "catalog") + callJMethod(catalog, "currentCatalog") +} + +#' Sets the current default catalog +#' +#' Sets the current default catalog. +#' +#' @param catalogName name of the catalog +#' @rdname setCurrentCatalog +#' @name setCurrentCatalog +#' @examples +#' \dontrun{ +#' sparkR.session() +#' setCurrentCatalog("spark_catalog") +#' } +#' @note since 3.4.0 +setCurrentCatalog <- function(catalogName) { + sparkSession <- getSparkSession() + if (class(catalogName) != "character") { + stop("catalogName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + invisible(handledCallJMethod(catalog, "setCurrentCatalog", catalogName)) +} + +#' Returns a list of catalog available +#' +#' Returns a list of catalog available. +#' +#' @return a SparkDataFrame of the list of catalog. 
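Editor's sketch (not part of the patch): how the new unpivot()/melt() pair reads in practice; the data frame here is made up and an active SparkR session is assumed.
  wide <- createDataFrame(data.frame(id = 1:2, x = c(1, 3), y = c(2, 4)))
  long <- unpivot(wide, "id", c("x", "y"), "var", "val")
  printSchema(long)                           # id, var (string), val
  head(melt(wide, "id", NULL, "var", "val"))  # melt() is an alias; NULL values means all non-id columns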
+#' @rdname listCatalogs +#' @name listCatalogs +#' @examples +#' \dontrun{ +#' sparkR.session() +#' listCatalogs() +#' } +#' @note since 3.4.0 +listCatalogs <- function() { + sparkSession <- getSparkSession() + catalog <- callJMethod(sparkSession, "catalog") + dataFrame(callJMethod(callJMethod(catalog, "listCatalogs"), "toDF")) +} + #' (Deprecated) Create an external table #' #' Creates an external table based on the dataset in a data source, @@ -58,6 +118,7 @@ createExternalTable <- function(tableName, path = NULL, source = NULL, schema = #' #' @param tableName the qualified or unqualified name that designates a table. If no database #' identifier is provided, it refers to a table in the current database. +#' The table name can be fully qualified with catalog name since 3.4.0. #' @param path (optional) the path of files to load. #' @param source (optional) the name of the data source. #' @param schema (optional) the schema of the data required for some data sources. @@ -69,7 +130,7 @@ createExternalTable <- function(tableName, path = NULL, source = NULL, schema = #' sparkR.session() #' df <- createTable("myjson", path="path/to/json", source="json", schema) #' -#' createTable("people", source = "json", schema = schema) +#' createTable("spark_catalog.default.people", source = "json", schema = schema) #' insertInto(df, "people") #' } #' @name createTable @@ -100,6 +161,7 @@ createTable <- function(tableName, path = NULL, source = NULL, schema = NULL, .. #' #' @param tableName the qualified or unqualified name that designates a table. If no database #' identifier is provided, it refers to a table in the current database. +#' The table name can be fully qualified with catalog name since 3.4.0. #' @return SparkDataFrame #' @rdname cacheTable #' @examples @@ -124,6 +186,7 @@ cacheTable <- function(tableName) { #' #' @param tableName the qualified or unqualified name that designates a table. If no database #' identifier is provided, it refers to a table in the current database. +#' The table name can be fully qualified with catalog name since 3.4.0. #' @return SparkDataFrame #' @rdname uncacheTable #' @examples @@ -215,13 +278,14 @@ dropTempView <- function(viewName) { #' Returns a SparkDataFrame containing names of tables in the given database. #' #' @param databaseName (optional) name of the database +#' The database name can be qualified with catalog name since 3.4.0. #' @return a SparkDataFrame #' @rdname tables #' @seealso \link{listTables} #' @examples #'\dontrun{ #' sparkR.session() -#' tables("hive") +#' tables("spark_catalog.hive") #' } #' @name tables #' @note tables since 1.4.0 @@ -235,12 +299,13 @@ tables <- function(databaseName = NULL) { #' Returns the names of tables in the given database as an array. #' #' @param databaseName (optional) name of the database +#' The database name can be qualified with catalog name since 3.4.0. #' @return a list of table names #' @rdname tableNames #' @examples #'\dontrun{ #' sparkR.session() -#' tableNames("hive") +#' tableNames("spark_catalog.hive") #' } #' @name tableNames #' @note tableNames since 1.4.0 @@ -293,6 +358,28 @@ setCurrentDatabase <- function(databaseName) { invisible(handledCallJMethod(catalog, "setCurrentDatabase", databaseName)) } +#' Checks if the database with the specified name exists. +#' +#' Checks if the database with the specified name exists. 
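Editor's sketch (not part of the patch): a minimal walk-through of the catalog-navigation helpers added above, assuming a stock session with the built-in spark_catalog.
  sparkR.session()
  currentCatalog()                    # typically "spark_catalog"
  head(listCatalogs())                # SparkDataFrame listing the available catalogs
  setCurrentCatalog("spark_catalog")  # switch the default catalog by name; non-string input raises an error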
+#' +#' @param databaseName name of the database, allowed to be qualified with catalog name +#' @rdname databaseExists +#' @name databaseExists +#' @examples +#' \dontrun{ +#' sparkR.session() +#' databaseExists("spark_catalog.default") +#' } +#' @note since 3.4.0 +databaseExists <- function(databaseName) { + sparkSession <- getSparkSession() + if (class(databaseName) != "character") { + stop("databaseName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + callJMethod(catalog, "databaseExists", databaseName) +} + #' Returns a list of databases available #' #' Returns a list of databases available. @@ -312,12 +399,54 @@ listDatabases <- function() { dataFrame(callJMethod(callJMethod(catalog, "listDatabases"), "toDF")) } +#' Get the database with the specified name +#' +#' Get the database with the specified name +#' +#' @param databaseName name of the database, allowed to be qualified with catalog name +#' @return A named list. +#' @rdname getDatabase +#' @name getDatabase +#' @examples +#' \dontrun{ +#' sparkR.session() +#' db <- getDatabase("default") +#' } +#' @note since 3.4.0 +getDatabase <- function(databaseName) { + sparkSession <- getSparkSession() + if (class(databaseName) != "character") { + stop("databaseName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + jdb <- handledCallJMethod(catalog, "getDatabase", databaseName) + + ret <- list(name = callJMethod(jdb, "name")) + jcata <- callJMethod(jdb, "catalog") + if (is.null(jcata)) { + ret$catalog <- NA + } else { + ret$catalog <- jcata + } + + jdesc <- callJMethod(jdb, "description") + if (is.null(jdesc)) { + ret$description <- NA + } else { + ret$description <- jdesc + } + + ret$locationUri <- callJMethod(jdb, "locationUri") + ret +} + #' Returns a list of tables or views in the specified database #' #' Returns a list of tables or views in the specified database. #' This includes all temporary views. #' #' @param databaseName (optional) name of the database +#' The database name can be qualified with catalog name since 3.4.0. #' @return a SparkDataFrame of the list of tables. #' @rdname listTables #' @name listTables @@ -326,7 +455,7 @@ listDatabases <- function() { #' \dontrun{ #' sparkR.session() #' listTables() -#' listTables("default") +#' listTables("spark_catalog.default") #' } #' @note since 2.2.0 listTables <- function(databaseName = NULL) { @@ -343,6 +472,78 @@ listTables <- function(databaseName = NULL) { dataFrame(callJMethod(jdst, "toDF")) } +#' Checks if the table with the specified name exists. +#' +#' Checks if the table with the specified name exists. +#' +#' @param tableName name of the table, allowed to be qualified with catalog name +#' @rdname tableExists +#' @name tableExists +#' @examples +#' \dontrun{ +#' sparkR.session() +#' databaseExists("spark_catalog.default.myTable") +#' } +#' @note since 3.4.0 +tableExists <- function(tableName) { + sparkSession <- getSparkSession() + if (class(tableName) != "character") { + stop("tableName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + callJMethod(catalog, "tableExists", tableName) +} + +#' Get the table with the specified name +#' +#' Get the table with the specified name +#' +#' @param tableName the qualified or unqualified name that designates a table, allowed to be +#' qualified with catalog name +#' @return A named list. 
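Editor's sketch (not part of the patch): checking for and inspecting a database with the new helpers; the catalog-qualified name assumes the default spark_catalog.
  databaseExists("spark_catalog.default")  # TRUE on a stock session
  db <- getDatabase("default")
  db$name                                  # "default"
  db$locationUri                           # warehouse location reported by the catalog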
+#' @rdname getTable +#' @name getTable +#' @examples +#' \dontrun{ +#' sparkR.session() +#' tbl <- getTable("spark_catalog.default.myTable") +#' } +#' @note since 3.4.0 +getTable <- function(tableName) { + sparkSession <- getSparkSession() + if (class(tableName) != "character") { + stop("tableName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + jtbl <- handledCallJMethod(catalog, "getTable", tableName) + + ret <- list(name = callJMethod(jtbl, "name")) + jcata <- callJMethod(jtbl, "catalog") + if (is.null(jcata)) { + ret$catalog <- NA + } else { + ret$catalog <- jcata + } + + jns <- callJMethod(jtbl, "namespace") + if (is.null(jns)) { + ret$namespace <- NA + } else { + ret$namespace <- jns + } + + jdesc <- callJMethod(jtbl, "description") + if (is.null(jdesc)) { + ret$description <- NA + } else { + ret$description <- jdesc + } + + ret$tableType <- callJMethod(jtbl, "tableType") + ret$isTemporary <- callJMethod(jtbl, "isTemporary") + ret +} + #' Returns a list of columns for the given table/view in the specified database #' #' Returns a list of columns for the given table/view in the specified database. @@ -350,6 +551,8 @@ listTables <- function(databaseName = NULL) { #' @param tableName the qualified or unqualified name that designates a table/view. If no database #' identifier is provided, it refers to a table/view in the current database. #' If \code{databaseName} parameter is specified, this must be an unqualified name. +#' The table name can be qualified with catalog name since 3.4.0, when databaseName +#' is NULL. #' @param databaseName (optional) name of the database #' @return a SparkDataFrame of the list of column descriptions. #' @rdname listColumns @@ -357,7 +560,7 @@ listTables <- function(databaseName = NULL) { #' @examples #' \dontrun{ #' sparkR.session() -#' listColumns("mytable") +#' listColumns("spark_catalog.default.mytable") #' } #' @note since 2.2.0 listColumns <- function(tableName, databaseName = NULL) { @@ -380,13 +583,14 @@ listColumns <- function(tableName, databaseName = NULL) { #' This includes all temporary functions. #' #' @param databaseName (optional) name of the database +#' The database name can be qualified with catalog name since 3.4.0. #' @return a SparkDataFrame of the list of function descriptions. #' @rdname listFunctions #' @name listFunctions #' @examples #' \dontrun{ #' sparkR.session() -#' listFunctions() +#' listFunctions(spark_catalog.default) #' } #' @note since 2.2.0 listFunctions <- function(databaseName = NULL) { @@ -403,6 +607,78 @@ listFunctions <- function(databaseName = NULL) { dataFrame(callJMethod(jdst, "toDF")) } +#' Checks if the function with the specified name exists. +#' +#' Checks if the function with the specified name exists. +#' +#' @param functionName name of the function, allowed to be qualified with catalog name +#' @rdname functionExists +#' @name functionExists +#' @examples +#' \dontrun{ +#' sparkR.session() +#' functionExists("spark_catalog.default.myFunc") +#' } +#' @note since 3.4.0 +functionExists <- function(functionName) { + sparkSession <- getSparkSession() + if (class(functionName) != "character") { + stop("functionName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + callJMethod(catalog, "functionExists", functionName) +} + +#' Get the function with the specified name +#' +#' Get the function with the specified name +#' +#' @param functionName name of the function, allowed to be qualified with catalog name +#' @return A named list. 
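Editor's sketch (not part of the patch): tableExists()/getTable() usage; the table name is a made-up example and is created and dropped only for illustration.
  sql("CREATE TABLE IF NOT EXISTS default.tmp_tbl (id INT) USING parquet")
  tableExists("spark_catalog.default.tmp_tbl")  # TRUE
  tbl <- getTable("default.tmp_tbl")
  tbl$tableType                                 # e.g. "MANAGED"
  tbl$isTemporary                               # FALSE
  sql("DROP TABLE IF EXISTS default.tmp_tbl")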
+#' @rdname getFunc +#' @name getFunc +#' @examples +#' \dontrun{ +#' sparkR.session() +#' func <- getFunc("spark_catalog.default.myFunc") +#' } +#' @note since 3.4.0. Use different name with the scala/python side, to avoid the +#' signature conflict with built-in "getFunction". +getFunc <- function(functionName) { + sparkSession <- getSparkSession() + if (class(functionName) != "character") { + stop("functionName must be a string.") + } + catalog <- callJMethod(sparkSession, "catalog") + jfunc <- handledCallJMethod(catalog, "getFunction", functionName) + + ret <- list(name = callJMethod(jfunc, "name")) + jcata <- callJMethod(jfunc, "catalog") + if (is.null(jcata)) { + ret$catalog <- NA + } else { + ret$catalog <- jcata + } + + jns <- callJMethod(jfunc, "namespace") + if (is.null(jns)) { + ret$namespace <- NA + } else { + ret$namespace <- jns + } + + jdesc <- callJMethod(jfunc, "description") + if (is.null(jdesc)) { + ret$description <- NA + } else { + ret$description <- jdesc + } + + ret$className <- callJMethod(jfunc, "className") + ret$isTemporary <- callJMethod(jfunc, "isTemporary") + ret +} + #' Recovers all the partitions in the directory of a table and update the catalog #' #' Recovers all the partitions in the directory of a table and update the catalog. The name should @@ -410,12 +686,13 @@ listFunctions <- function(databaseName = NULL) { #' #' @param tableName the qualified or unqualified name that designates a table. If no database #' identifier is provided, it refers to a table in the current database. +#' The table name can be fully qualified with catalog name since 3.4.0. #' @rdname recoverPartitions #' @name recoverPartitions #' @examples #' \dontrun{ #' sparkR.session() -#' recoverPartitions("myTable") +#' recoverPartitions("spark_catalog.default.myTable") #' } #' @note since 2.2.0 recoverPartitions <- function(tableName) { @@ -436,12 +713,13 @@ recoverPartitions <- function(tableName) { #' #' @param tableName the qualified or unqualified name that designates a table. If no database #' identifier is provided, it refers to a table in the current database. +#' The table name can be fully qualified with catalog name since 3.4.0. 
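Editor's sketch (not part of the patch): functionExists()/getFunc() against a built-in function so no setup is needed; the exact metadata values depend on the Spark version.
  functionExists("abs")   # TRUE for a built-in function
  fn <- getFunc("abs")
  fn$name                 # "abs"
  fn$className            # implementing class as reported by the catalog
  fn$isTemporary          # whether the function is registered as temporary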
#' @rdname refreshTable #' @name refreshTable #' @examples #' \dontrun{ #' sparkR.session() -#' refreshTable("myTable") +#' refreshTable("spark_catalog.default.myTable") #' } #' @note since 2.2.0 refreshTable <- function(tableName) { diff --git a/R/pkg/R/column.R b/R/pkg/R/column.R index f1fd30e144bb6..e4865056f58bc 100644 --- a/R/pkg/R/column.R +++ b/R/pkg/R/column.R @@ -85,7 +85,7 @@ createOperator <- function(op) { callJMethod(e1@jc, operators[[op]]) } } else { - if (class(e2) == "Column") { + if (inherits(e2, "Column")) { e2 <- e2@jc } if (op == "^") { @@ -110,7 +110,7 @@ createColumnFunction2 <- function(name) { setMethod(name, signature(x = "Column"), function(x, data) { - if (class(data) == "Column") { + if (inherits(data, "Column")) { data <- data@jc } jc <- callJMethod(x@jc, name, data) @@ -306,7 +306,7 @@ setMethod("%in%", setMethod("otherwise", signature(x = "Column", value = "ANY"), function(x, value) { - value <- if (class(value) == "Column") { value@jc } else { value } + value <- if (inherits(value, "Column")) { value@jc } else { value } jc <- callJMethod(x@jc, "otherwise", value) column(jc) }) diff --git a/R/pkg/R/context.R b/R/pkg/R/context.R index cca6c2c817de9..eea83aa5ab527 100644 --- a/R/pkg/R/context.R +++ b/R/pkg/R/context.R @@ -170,7 +170,7 @@ parallelize <- function(sc, coll, numSlices = 1) { serializedSlices <- lapply(slices, serialize, connection = NULL) # The RPC backend cannot handle arguments larger than 2GB (INT_MAX) - # If serialized data is safely less than that threshold we send it over the PRC channel. + # If serialized data is safely less than that threshold we send it over the RPC channel. # Otherwise, we write it to a file and send the file name if (objectSize < sizeLimit) { jrdd <- callJStatic("org.apache.spark.api.r.RRDD", "createRDDFromArray", sc, serializedSlices) diff --git a/R/pkg/R/functions.R b/R/pkg/R/functions.R index 1377f0daa7360..00ce630bd18e3 100644 --- a/R/pkg/R/functions.R +++ b/R/pkg/R/functions.R @@ -258,6 +258,13 @@ NULL #' into accumulator (the first argument). #' @param finish an unary \code{function} \code{(Column) -> Column} used to #' apply final transformation on the accumulated data in \code{array_aggregate}. +#' @param comparator an optional binary (\code{(Column, Column) -> Column}) \code{function} +#' which is used to compare the elemnts of the array. +#' The comparator will take two +#' arguments representing two elements of the array. It returns a negative integer, +#' 0, or a positive integer as the first element is less than, equal to, +#' or greater than the second element. +#' If the comparator function returns null, the function will fail and raise an error. #' @param ... additional argument(s). 
#' \itemize{ #' \item \code{to_json}, \code{from_json} and \code{schema_of_json}: this contains @@ -292,6 +299,7 @@ NULL #' head(select(tmp, array_contains(tmp$v1, 21), size(tmp$v1), shuffle(tmp$v1))) #' head(select(tmp, array_max(tmp$v1), array_min(tmp$v1), array_distinct(tmp$v1))) #' head(select(tmp, array_position(tmp$v1, 21), array_repeat(df$mpg, 3), array_sort(tmp$v1))) +#' head(select(tmp, array_sort(tmp$v1, function(x, y) coalesce(cast(y - x, "integer"), lit(0L))))) #' head(select(tmp, reverse(tmp$v1), array_remove(tmp$v1, 21))) #' head(select(tmp, array_transform("v1", function(x) x * 10))) #' head(select(tmp, array_exists("v1", function(x) x > 120))) @@ -445,7 +453,7 @@ setMethod("lit", signature("ANY"), function(x) { jc <- callJStatic("org.apache.spark.sql.functions", "lit", - if (class(x) == "Column") { x@jc } else { x }) + if (inherits(x, "Column")) { x@jc } else { x }) column(jc) }) @@ -966,7 +974,7 @@ setMethod("hash", #' @details #' \code{xxhash64}: Calculates the hash code of given columns using the 64-bit #' variant of the xxHash algorithm, and returns the result as a long -#' column. +#' column. The hash computation uses an initial seed of 42. #' #' @rdname column_misc_functions #' @aliases xxhash64 xxhash64,Column-method @@ -3256,7 +3264,8 @@ setMethod("format_string", signature(format = "character", x = "Column"), #' tmp <- mutate(df, to_unix = unix_timestamp(df$time), #' to_unix2 = unix_timestamp(df$time, 'yyyy-MM-dd HH'), #' from_unix = from_unixtime(unix_timestamp(df$time)), -#' from_unix2 = from_unixtime(unix_timestamp(df$time), 'yyyy-MM-dd HH:mm')) +#' from_unix2 = from_unixtime(unix_timestamp(df$time), 'yyyy-MM-dd HH:mm'), +#' timestamp_from_unix = timestamp_seconds(unix_timestamp(df$time))) #' head(tmp)} #' @note from_unixtime since 1.5.0 setMethod("from_unixtime", signature(x = "Column"), @@ -3586,7 +3595,7 @@ setMethod("unix_timestamp", signature(x = "Column", format = "character"), setMethod("when", signature(condition = "Column", value = "ANY"), function(condition, value) { condition <- condition@jc - value <- if (class(value) == "Column") { value@jc } else { value } + value <- if (inherits(value, "Column")) { value@jc } else { value } jc <- callJStatic("org.apache.spark.sql.functions", "when", condition, value) column(jc) }) @@ -3605,8 +3614,8 @@ setMethod("ifelse", signature(test = "Column", yes = "ANY", no = "ANY"), function(test, yes, no) { test <- test@jc - yes <- if (class(yes) == "Column") { yes@jc } else { yes } - no <- if (class(no) == "Column") { no@jc } else { no } + yes <- if (inherits(yes, "Column")) { yes@jc } else { yes } + no <- if (inherits(no, "Column")) { no@jc } else { no } jc <- callJMethod(callJStatic("org.apache.spark.sql.functions", "when", test, yes), @@ -4140,9 +4149,16 @@ setMethod("array_repeat", #' @note array_sort since 2.4.0 setMethod("array_sort", signature(x = "Column"), - function(x) { - jc <- callJStatic("org.apache.spark.sql.functions", "array_sort", x@jc) - column(jc) + function(x, comparator = NULL) { + if (is.null(comparator)) { + column(callJStatic("org.apache.spark.sql.functions", "array_sort", x@jc)) + } else { + invoke_higher_order_function( + "ArraySort", + cols = list(x), + funs = list(comparator) + ) + } }) #' @details @@ -4854,7 +4870,8 @@ setMethod("current_timestamp", }) #' @details -#' \code{timestamp_seconds}: Creates timestamp from the number of seconds since UTC epoch. +#' \code{timestamp_seconds}: Converts the number of seconds from the Unix epoch +#' (1970-01-01T00:00:00Z) to a timestamp. 
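Editor's sketch (not part of the patch): array_sort() with the new optional comparator, following the contract described above (negative result puts the first element first); the data frame is illustrative and a SparkR session is assumed.
  df <- createDataFrame(data.frame(id = 1L))
  df <- withColumn(df, "v", create_array(lit(2L), lit(1L), lit(3L)))
  head(select(df, array_sort(df$v)))                                         # default ascending: 1, 2, 3
  head(select(df, array_sort(df$v, function(x, y) cast(y - x, "integer"))))  # descending: 3, 2, 1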
#' #' @rdname column_datetime_functions #' @aliases timestamp_seconds timestamp_seconds,Column-method diff --git a/R/pkg/R/generics.R b/R/pkg/R/generics.R index 5fe2ec602ecd3..328df50877b70 100644 --- a/R/pkg/R/generics.R +++ b/R/pkg/R/generics.R @@ -442,7 +442,7 @@ setGeneric("describe", function(x, col, ...) { standardGeneric("describe") }) setGeneric("distinct", function(x) { standardGeneric("distinct") }) #' @rdname drop -setGeneric("drop", function(x, ...) { standardGeneric("drop") }) +setGeneric("drop", function(x, col, ...) { standardGeneric("drop") }) #' @rdname dropDuplicates setGeneric("dropDuplicates", function(x, ...) { standardGeneric("dropDuplicates") }) @@ -670,6 +670,16 @@ setGeneric("randomSplit", function(x, weights, seed) { standardGeneric("randomSp #' @rdname broadcast setGeneric("broadcast", function(x) { standardGeneric("broadcast") }) +#' @rdname unpivot +setGeneric("unpivot", function(x, ids, values, variableColumnName, valueColumnName) { + standardGeneric("unpivot") +}) + +#' @rdname melt +setGeneric("melt", function(x, ids, values, variableColumnName, valueColumnName) { + standardGeneric("melt") +}) + ###################### Column Methods ########################## #' @rdname columnfunctions @@ -840,7 +850,7 @@ setGeneric("array_repeat", function(x, count) { standardGeneric("array_repeat") #' @rdname column_collection_functions #' @name NULL -setGeneric("array_sort", function(x) { standardGeneric("array_sort") }) +setGeneric("array_sort", function(x, ...) { standardGeneric("array_sort") }) #' @rdname column_ml_functions #' @name NULL diff --git a/R/pkg/R/install.R b/R/pkg/R/install.R index bbb9188cd083f..971de6010eb8a 100644 --- a/R/pkg/R/install.R +++ b/R/pkg/R/install.R @@ -29,19 +29,18 @@ #' \code{mirrorUrl} specifies the remote path to a Spark folder. It is followed by a subfolder #' named after the Spark version (that corresponds to SparkR), and then the tar filename. #' The filename is composed of four parts, i.e. [Spark version]-bin-[Hadoop version].tgz. -#' For example, the full path for a Spark 2.0.0 package for Hadoop 2.7 from -#' \code{http://apache.osuosl.org} has path: -#' \code{http://apache.osuosl.org/spark/spark-2.0.0/spark-2.0.0-bin-hadoop2.7.tgz}. +#' For example, the full path for a Spark 3.3.1 package from +#' \code{https://archive.apache.org} has path: +#' \code{http://archive.apache.org/dist/spark/spark-3.3.1/spark-3.3.1-bin-hadoop3.tgz}. #' For \code{hadoopVersion = "without"}, [Hadoop version] in the filename is then #' \code{without-hadoop}. #' -#' @param hadoopVersion Version of Hadoop to install. Default is \code{"2.7"}. It can take other -#' version number in the format of "x.y" where x and y are integer. +#' @param hadoopVersion Version of Hadoop to install. Default is \code{"3"}. #' If \code{hadoopVersion = "without"}, "Hadoop free" build is installed. #' See #' \href{https://spark.apache.org/docs/latest/hadoop-provided.html}{ #' "Hadoop Free" Build} for more information. -#' Other patched version names can also be used, e.g. \code{"cdh4"} +#' Other patched version names can also be used. #' @param mirrorUrl base URL of the repositories to use. The directory layout should follow #' \href{https://www.apache.org/dyn/closer.lua/spark/}{Apache mirrors}. #' @param localDir a local directory where Spark is installed. 
The directory contains @@ -65,7 +64,7 @@ #' @note install.spark since 2.1.0 #' @seealso See available Hadoop versions: #' \href{https://spark.apache.org/downloads.html}{Apache Spark} -install.spark <- function(hadoopVersion = "2.7", mirrorUrl = NULL, +install.spark <- function(hadoopVersion = "3", mirrorUrl = NULL, localDir = NULL, overwrite = FALSE) { sparkHome <- Sys.getenv("SPARK_HOME") if (isSparkRShell()) { @@ -251,7 +250,7 @@ defaultMirrorUrl <- function() { hadoopVersionName <- function(hadoopVersion) { if (hadoopVersion == "without") { "without-hadoop" - } else if (grepl("^[0-9]+\\.[0-9]+$", hadoopVersion, perl = TRUE)) { + } else if (grepl("^[0-9]+$", hadoopVersion, perl = TRUE)) { paste0("hadoop", hadoopVersion) } else { hadoopVersion diff --git a/R/pkg/R/mllib_classification.R b/R/pkg/R/mllib_classification.R index 093467ecf7d28..7204f8bb7dff4 100644 --- a/R/pkg/R/mllib_classification.R +++ b/R/pkg/R/mllib_classification.R @@ -322,7 +322,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") } if (!is.null(lowerBoundsOnCoefficients)) { - if (class(lowerBoundsOnCoefficients) != "matrix") { + if (!is.matrix(lowerBoundsOnCoefficients)) { stop("lowerBoundsOnCoefficients must be a matrix.") } row <- nrow(lowerBoundsOnCoefficients) @@ -331,7 +331,7 @@ setMethod("spark.logit", signature(data = "SparkDataFrame", formula = "formula") } if (!is.null(upperBoundsOnCoefficients)) { - if (class(upperBoundsOnCoefficients) != "matrix") { + if (!is.matrix(upperBoundsOnCoefficients)) { stop("upperBoundsOnCoefficients must be a matrix.") } diff --git a/R/pkg/R/serialize.R b/R/pkg/R/serialize.R index 7760d9be16f0b..61e174de9ac56 100644 --- a/R/pkg/R/serialize.R +++ b/R/pkg/R/serialize.R @@ -58,7 +58,12 @@ writeObject <- function(con, object, writeType = TRUE) { # Checking types is needed here, since 'is.na' only handles atomic vectors, # lists and pairlists if (type %in% c("integer", "character", "logical", "double", "numeric")) { - if (is.na(object)) { + if (is.na(object[[1]])) { + # Uses the first element for now to keep the behavior same as R before + # 4.2.0. This is wrong because we should differenciate c(NA) from a + # single NA as the former means array(null) and the latter means null + # in Spark SQL. However, it requires non-trivial comparison to distinguish + # both in R. We should ideally fix this. object <- NULL type <- "NULL" } @@ -203,7 +208,11 @@ writeEnv <- function(con, env) { } writeDate <- function(con, date) { - writeString(con, as.character(date)) + if (is.na(date)) { + writeString(con, "NA") + } else { + writeString(con, as.character(date)) + } } writeTime <- function(con, time) { @@ -226,7 +235,7 @@ writeSerializeInArrow <- function(conn, df) { # There looks no way to send each batch in streaming format via socket # connection. See ARROW-4512. # So, it writes the whole Arrow streaming-formatted binary at once for now. - writeRaw(conn, arrow::write_arrow(df, raw())) + writeRaw(conn, arrow::write_to_raw(df)) } else { stop("'arrow' package should be installed.") } diff --git a/R/pkg/R/sparkR.R b/R/pkg/R/sparkR.R index f18a6c7e25f1b..e2ab57471773c 100644 --- a/R/pkg/R/sparkR.R +++ b/R/pkg/R/sparkR.R @@ -40,8 +40,15 @@ sparkR.session.stop <- function() { env <- .sparkREnv if (exists(".sparkRCon", envir = env)) { if (exists(".sparkRjsc", envir = env)) { - sc <- get(".sparkRjsc", envir = env) - callJMethod(sc, "stop") + # Should try catch for every use of the connection in case + # the connection is timed-out, see also SPARK-42186. 
+ tryCatch({ + sc <- get(".sparkRjsc", envir = env) + callJMethod(sc, "stop") + }, + error = function(err) { + warning(err) + }) rm(".sparkRjsc", envir = env) if (exists(".sparkRsession", envir = env)) { @@ -56,20 +63,35 @@ sparkR.session.stop <- function() { } if (exists(".backendLaunched", envir = env)) { - callJStatic("SparkRHandler", "stopBackend") + tryCatch({ + callJStatic("SparkRHandler", "stopBackend") + }, + error = function(err) { + warning(err) + }) } # Also close the connection and remove it from our env - conn <- get(".sparkRCon", envir = env) - close(conn) + tryCatch({ + conn <- get(".sparkRCon", envir = env) + close(conn) + }, + error = function(err) { + warning(err) + }) rm(".sparkRCon", envir = env) rm(".scStartTime", envir = env) } if (exists(".monitorConn", envir = env)) { - conn <- get(".monitorConn", envir = env) - close(conn) + tryCatch({ + conn <- get(".monitorConn", envir = env) + close(conn) + }, + error = function(err) { + warning(err) + }) rm(".monitorConn", envir = env) } diff --git a/R/pkg/pkgdown/_pkgdown_template.yml b/R/pkg/pkgdown/_pkgdown_template.yml index eeb676befbc8b..e6b485d489844 100644 --- a/R/pkg/pkgdown/_pkgdown_template.yml +++ b/R/pkg/pkgdown/_pkgdown_template.yml @@ -117,6 +117,7 @@ reference: - unionAll - unionByName - unpersist + - unpivot - with - withColumn @@ -261,9 +262,16 @@ reference: - title: "SQL Catalog" - contents: + - currentCatalog - currentDatabase + - databaseExists - dropTempTable - dropTempView + - functionExists + - getDatabase + - getFunc + - getTable + - listCatalogs - listColumns - listDatabases - listFunctions @@ -271,6 +279,9 @@ reference: - refreshByPath - refreshTable - recoverPartitions + - setCurrentCatalog + - setCurrentDatabase + - tableExists - tableNames - tables - uncacheTable @@ -283,7 +294,6 @@ reference: - getLocalProperty - install.spark - setCheckpointDir - - setCurrentDatabase - setJobDescription - setJobGroup - setLocalProperty diff --git a/R/pkg/tests/fulltests/test_sparkSQL.R b/R/pkg/tests/fulltests/test_sparkSQL.R index df1094bacef64..b0c56f1c15d06 100644 --- a/R/pkg/tests/fulltests/test_sparkSQL.R +++ b/R/pkg/tests/fulltests/test_sparkSQL.R @@ -154,7 +154,7 @@ test_that("structType and structField", { expect_is(testSchema$fields()[[2]], "structField") expect_equal(testSchema$fields()[[1]]$dataType.toString(), "StringType") - expect_error(structType("A stri"), "DataType stri is not supported.") + expect_error(structType("A stri"), ".*Unsupported data type \"STRI\".*") }) test_that("structField type strings", { @@ -495,7 +495,7 @@ test_that("SPARK-17902: collect() with stringsAsFactors enabled", { expect_equal(iris$Species, df$Species) }) -test_that("SPARK-17811: can create DataFrame containing NA as date and time", { +test_that("SPARK-17811, SPARK-18011: can create DataFrame containing NA as date and time", { df <- data.frame( id = 1:2, time = c(as.POSIXlt("2016-01-10"), NA), @@ -622,7 +622,7 @@ test_that("read/write json files", { # Test errorifexists expect_error(write.df(df, jsonPath2, "json", mode = "errorifexists"), - "analysis error - path file:.*already exists") + "Error in save : analysis error - \\[PATH_ALREADY_EXISTS\\].*") # Test write.json jsonPath3 <- tempfile(pattern = "jsonPath3", fileext = ".json") @@ -663,7 +663,7 @@ test_that("test tableNames and tables", { expect_equal(count(tables), count + 1) expect_equal(count(tables()), count(tables)) expect_true("tableName" %in% colnames(tables())) - expect_true(all(c("tableName", "database", "isTemporary") %in% colnames(tables()))) + 
expect_true(all(c("tableName", "namespace", "isTemporary") %in% colnames(tables()))) suppressWarnings(registerTempTable(df, "table2")) tables <- listTables() @@ -673,6 +673,22 @@ test_that("test tableNames and tables", { tables <- listTables() expect_equal(count(tables), count + 0) + + count2 <- count(listTables()) + schema <- structType(structField("name", "string"), structField("age", "integer"), + structField("height", "float")) + createTable("people", source = "json", schema = schema) + + expect_equal(length(tableNames()), count2 + 1) + expect_equal(length(tableNames("default")), count2 + 1) + expect_equal(length(tableNames("spark_catalog.default")), count2 + 1) + + tables <- listTables() + expect_equal(count(tables), count2 + 1) + expect_equal(count(tables()), count(tables)) + expect_equal(count(tables("default")), count2 + 1) + expect_equal(count(tables("spark_catalog.default")), count2 + 1) + sql("DROP TABLE IF EXISTS people") }) test_that( @@ -696,16 +712,27 @@ test_that( expect_true(dropTempView("dfView")) }) -test_that("test cache, uncache and clearCache", { - df <- read.json(jsonPath) - createOrReplaceTempView(df, "table1") - cacheTable("table1") - uncacheTable("table1") +test_that("test tableExists, cache, uncache and clearCache", { + schema <- structType(structField("name", "string"), structField("age", "integer"), + structField("height", "float")) + createTable("table1", source = "json", schema = schema) + + cacheTable("default.table1") + uncacheTable("spark_catalog.default.table1") clearCache() - expect_true(dropTempView("table1")) expect_error(uncacheTable("zxwtyswklpf"), - "Error in uncacheTable : analysis error - Table or view not found: zxwtyswklpf") + "[TABLE_OR_VIEW_NOT_FOUND]*`zxwtyswklpf`*") + + expect_true(tableExists("table1")) + expect_true(tableExists("default.table1")) + expect_true(tableExists("spark_catalog.default.table1")) + + sql("DROP TABLE IF EXISTS spark_catalog.default.table1") + + expect_false(tableExists("table1")) + expect_false(tableExists("default.table1")) + expect_false(tableExists("spark_catalog.default.table1")) }) test_that("insertInto() on a registered table", { @@ -1264,6 +1291,15 @@ test_that("drop column", { df1 <- drop(df, df$age) expect_equal(columns(df1), c("name", "age2")) + df1 <- drop(df, df$age, df$name) + expect_equal(columns(df1), c("age2")) + + df1 <- drop(df, df$age, column("random")) + expect_equal(columns(df1), c("name", "age2")) + + df1 <- drop(df, df$age, "random") + expect_equal(columns(df1), c("name", "age2")) + df$age2 <- NULL expect_equal(columns(df), c("name", "age")) df$age3 <- NULL @@ -1342,7 +1378,7 @@ test_that("test HiveContext", { schema <- structType(structField("name", "string"), structField("age", "integer"), structField("height", "float")) - createTable("people", source = "json", schema = schema) + createTable("spark_catalog.default.people", source = "json", schema = schema) df <- read.df(jsonPathNa, "json", schema) insertInto(df, "people") expect_equal(collect(sql("SELECT age from people WHERE name = 'Bob'"))$age, c(16)) @@ -1568,6 +1604,16 @@ test_that("column functions", { result <- collect(select(df, array_sort(df[[1]])))[[1]] expect_equal(result, list(list(1L, 2L, 3L, NA), list(4L, 5L, 6L, NA, NA))) + result <- collect(select( + df, + array_sort( + df[[1]], + function(x, y) otherwise( + when(isNull(x), 1L), otherwise(when(isNull(y), -1L), cast(y - x, "integer")) + ) + ) + ))[[1]] + expect_equal(result, list(list(3L, 2L, 1L, NA), list(6L, 5L, 4L, NA, NA))) result <- collect(select(df, sort_array(df[[1]], 
FALSE)))[[1]] expect_equal(result, list(list(3L, 2L, 1L, NA), list(6L, 5L, 4L, NA, NA))) @@ -2967,6 +3013,32 @@ test_that("mutate(), transform(), rename() and names()", { expect_match(tail(columns(newDF), 1L), "234567890", fixed = TRUE) }) +test_that("unpivot / melt", { + df <- createDataFrame(data.frame( + id = 1:3, x = c(1, 3, 5), y = c(2, 4, 6), z = c(-1, 0, 1) + )) + + result <- unpivot(df, "id", c("x", "y"), "var", "val") + expect_s4_class(result, "SparkDataFrame") + expect_equal(columns(result), c("id", "var", "val")) + expect_equal(count(distinct(select(result, "var"))), 2) + + result <- unpivot(df, "id", NULL, "variable", "value") + expect_s4_class(result, "SparkDataFrame") + expect_equal(columns(result), c("id", "variable", "value")) + expect_equal(count(distinct(select(result, "variable"))), 3) + + result <- melt(df, "id", c("x", "y"), "key", "value") + expect_s4_class(result, "SparkDataFrame") + expect_equal(columns(result), c("id", "key", "value")) + expect_equal(count(distinct(select(result, "key"))), 2) + + result <- melt(df, "id", NULL, "key", "val") + expect_s4_class(result, "SparkDataFrame") + expect_equal(columns(result), c("id", "key", "val")) + expect_equal(count(distinct(select(result, "key"))), 3) +}) + test_that("read/write ORC files", { setHiveContext(sc) df <- read.df(jsonPath, "json") @@ -3321,8 +3393,8 @@ test_that("approxQuantile() on a DataFrame", { test_that("SQL error message is returned from JVM", { retError <- tryCatch(sql("select * from blah"), error = function(e) e) - expect_equal(grepl("Table or view not found", retError), TRUE) - expect_equal(grepl("blah", retError), TRUE) + expect_equal(grepl("[TABLE_OR_VIEW_NOT_FOUND]", retError), TRUE) + expect_equal(grepl("`blah`", retError), TRUE) }) irisDF <- suppressWarnings(createDataFrame(iris)) @@ -3411,6 +3483,8 @@ test_that("Method coltypes() to get and set R's data types of a DataFrame", { "Length of type vector should match the number of columns for SparkDataFrame") expect_error(coltypes(df) <- c("environment", "list"), "Only atomic type is supported for column types") + + dropTempView("dfView") }) test_that("Method str()", { @@ -3450,6 +3524,8 @@ test_that("Method str()", { # Test utils:::str expect_equal(capture.output(utils:::str(iris)), capture.output(str(iris))) + + dropTempView("irisView") }) test_that("Histogram", { @@ -3911,15 +3987,16 @@ test_that("Call DataFrameWriter.save() API in Java without path and check argume # It makes sure that we can omit path argument in write.df API and then it calls # DataFrameWriter.save() without path. 
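As a quick illustration of the new `unpivot`/`melt` API exercised by the tests above — a minimal sketch, assuming an active SparkR session (the data frame and column names here are arbitrary):

```r
library(SparkR)
sparkR.session()

df <- createDataFrame(data.frame(id = 1:3, x = c(1, 3, 5), y = c(2, 4, 6)))

# Unpivot the wide columns x and y into (variable, value) pairs, keeping id.
long <- unpivot(df, "id", c("x", "y"), "variable", "value")
columns(long)   # "id" "variable" "value"
count(long)     # 6 rows: one per (id, unpivoted column) pair

# Passing NULL for `values` unpivots every non-id column;
# melt() is an alias with the same signature.
long_all <- melt(df, "id", NULL, "variable", "value")
count(distinct(select(long_all, "variable")))   # 2 (x and y)
```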
expect_error(write.df(df, source = "csv"), - "Error in save : illegal argument - Expected exactly one path to be specified") + paste("Error in save : org.apache.spark.SparkIllegalArgumentException:", + "Expected exactly one path to be specified")) expect_error(write.json(df, jsonPath), - "Error in json : analysis error - path file:.*already exists") + "Error in json : analysis error - \\[PATH_ALREADY_EXISTS\\].*") expect_error(write.text(df, jsonPath), - "Error in text : analysis error - path file:.*already exists") + "Error in text : analysis error - \\[PATH_ALREADY_EXISTS\\].*") expect_error(write.orc(df, jsonPath), - "Error in orc : analysis error - path file:.*already exists") + "Error in orc : analysis error - \\[PATH_ALREADY_EXISTS\\].*") expect_error(write.parquet(df, jsonPath), - "Error in parquet : analysis error - path file:.*already exists") + "Error in parquet : analysis error - \\[PATH_ALREADY_EXISTS\\].*") expect_error(write.parquet(df, jsonPath, mode = 123), "mode should be character or omitted.") # Arguments checking in R side. @@ -3937,14 +4014,17 @@ test_that("Call DataFrameWriter.load() API in Java without path and check argume # It makes sure that we can omit path argument in read.df API and then it calls # DataFrameWriter.load() without path. expect_error(read.df(source = "json"), - paste("Error in load : analysis error - Unable to infer schema for JSON.", - "It must be specified manually")) - expect_error(read.df("arbitrary_path"), "Error in load : analysis error - Path does not exist") - expect_error(read.json("arbitrary_path"), "Error in json : analysis error - Path does not exist") - expect_error(read.text("arbitrary_path"), "Error in text : analysis error - Path does not exist") - expect_error(read.orc("arbitrary_path"), "Error in orc : analysis error - Path does not exist") + "Error in load : analysis error - \\[UNABLE_TO_INFER_SCHEMA\\].*") + expect_error(read.df("arbitrary_path"), + "Error in load : analysis error - \\[PATH_NOT_FOUND\\].*") + expect_error(read.json("arbitrary_path"), + "Error in json : analysis error - \\[PATH_NOT_FOUND\\].*") + expect_error(read.text("arbitrary_path"), + "Error in text : analysis error - \\[PATH_NOT_FOUND\\].*") + expect_error(read.orc("arbitrary_path"), + "Error in orc : analysis error - \\[PATH_NOT_FOUND\\].*") expect_error(read.parquet("arbitrary_path"), - "Error in parquet : analysis error - Path does not exist") + "Error in parquet : analysis error - \\[PATH_NOT_FOUND\\].*") # Arguments checking in R side. expect_error(read.df(path = c(3)), @@ -3963,14 +4043,14 @@ test_that("Specify a schema by using a DDL-formatted string when reading", { expect_is(df1, "SparkDataFrame") expect_equal(dtypes(df1), list(c("name", "string"), c("age", "double"))) - expect_error(read.df(jsonPath, "json", "name stri"), "DataType stri is not supported.") + expect_error(read.df(jsonPath, "json", "name stri"), ".*Unsupported data type \"STRI\".*") # Test loadDF with a user defined schema in a DDL-formatted string. 
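The assertions above reflect the move to error-class based messages (e.g. `[PATH_ALREADY_EXISTS]`, `[PATH_NOT_FOUND]`) in the JVM errors surfaced through SparkR. A hedged sketch of how that looks from user code, assuming an active SparkR session and a scratch path:

```r
df <- createDataFrame(mtcars)
path <- file.path(tempdir(), "already-there.parquet")
write.parquet(df, path)

# A second write with the default "error" save mode now reports the error class.
msg <- tryCatch(write.parquet(df, path), error = function(e) conditionMessage(e))
grepl("PATH_ALREADY_EXISTS", msg)   # TRUE

# Reading a path that does not exist surfaces PATH_NOT_FOUND the same way.
msg <- tryCatch(read.parquet(file.path(tempdir(), "no-such-dir")),
                error = function(e) conditionMessage(e))
grepl("PATH_NOT_FOUND", msg)        # TRUE
```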
df2 <- loadDF(jsonPath, "json", "name STRING, age DOUBLE") expect_is(df2, "SparkDataFrame") expect_equal(dtypes(df2), list(c("name", "string"), c("age", "double"))) - expect_error(loadDF(jsonPath, "json", "name stri"), "DataType stri is not supported.") + expect_error(loadDF(jsonPath, "json", "name stri"), ".*Unsupported data type \"STRI\".*") }) test_that("Collect on DataFrame when NAs exists at the top of a timestamp column", { @@ -4011,22 +4091,45 @@ test_that("Collect on DataFrame when NAs exists at the top of a timestamp column expect_equal(class(ldf3$col3), c("POSIXct", "POSIXt")) }) -test_that("catalog APIs, currentDatabase, setCurrentDatabase, listDatabases", { +test_that("catalog APIs, listCatalogs, setCurrentCatalog, currentCatalog", { + expect_equal(currentCatalog(), "spark_catalog") + expect_error(setCurrentCatalog("spark_catalog"), NA) + expect_error(setCurrentCatalog("zxwtyswklpf"), + paste0("Error in setCurrentCatalog : ", + "org.apache.spark.sql.connector.catalog.CatalogNotFoundException: ", + "Catalog 'zxwtyswklpf' plugin class not found: ", + "spark.sql.catalog.zxwtyswklpf is not defined")) + catalogs <- collect(listCatalogs()) +}) + +test_that("catalog APIs, currentDatabase, setCurrentDatabase, listDatabases, getDatabase", { expect_equal(currentDatabase(), "default") expect_error(setCurrentDatabase("default"), NA) expect_error(setCurrentDatabase("zxwtyswklpf"), - paste0("Error in setCurrentDatabase : analysis error - Database ", - "'zxwtyswklpf' does not exist")) + "[SCHEMA_NOT_FOUND]*`zxwtyswklpf`*") + + expect_true(databaseExists("default")) + expect_true(databaseExists("spark_catalog.default")) + expect_false(databaseExists("some_db")) + expect_false(databaseExists("spark_catalog.some_db")) + dbs <- collect(listDatabases()) - expect_equal(names(dbs), c("name", "description", "locationUri")) + expect_equal(names(dbs), c("name", "catalog", "description", "locationUri")) expect_equal(which(dbs[, 1] == "default"), 1) + + db <- getDatabase("spark_catalog.default") + expect_equal(db$name, "default") + expect_equal(db$catalog, "spark_catalog") }) -test_that("catalog APIs, listTables, listColumns, listFunctions", { +test_that("catalog APIs, listTables, getTable, listColumns, listFunctions, functionExists", { tb <- listTables() count <- count(tables()) + expect_equal(nrow(listTables("default")), count) + expect_equal(nrow(listTables("spark_catalog.default")), count) expect_equal(nrow(tb), count) - expect_equal(colnames(tb), c("name", "database", "description", "tableType", "isTemporary")) + expect_equal(colnames(tb), + c("name", "catalog", "namespace", "description", "tableType", "isTemporary")) createOrReplaceTempView(as.DataFrame(cars), "cars") @@ -4035,7 +4138,7 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { tbs <- collect(tb) expect_true(nrow(tbs[tbs$name == "cars", ]) > 0) expect_error(listTables("bar"), - "Error in listTables : no such database - Database 'bar' not found") + "[SCHEMA_NOT_FOUND]*`bar`*") c <- listColumns("cars") expect_equal(nrow(c), 2) @@ -4043,18 +4146,48 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { c("name", "description", "dataType", "nullable", "isPartition", "isBucket")) expect_equal(collect(c)[[1]][[1]], "speed") expect_error(listColumns("zxwtyswklpf", "default"), - paste("Error in listColumns : analysis error - Table", - "'zxwtyswklpf' does not exist in database 'default'")) + "[TABLE_OR_VIEW_NOT_FOUND]*`spark_catalog`.`default`.`zxwtyswklpf`*") f <- listFunctions() expect_true(nrow(f) >= 200) # 
250 expect_equal(colnames(f), - c("name", "database", "description", "className", "isTemporary")) - expect_equal(take(orderBy(f, "className"), 1)$className, + c("name", "catalog", "namespace", "description", "className", "isTemporary")) + expect_equal(take(orderBy(filter(f, "className IS NOT NULL"), "className"), 1)$className, "org.apache.spark.sql.catalyst.expressions.Abs") expect_error(listFunctions("zxwtyswklpf_db"), - paste("Error in listFunctions : analysis error - Database", - "'zxwtyswklpf_db' does not exist")) + "[SCHEMA_NOT_FOUND]*`zxwtyswklpf_db`*") + + expect_true(functionExists("abs")) + expect_false(functionExists("aabbss")) + + func0 <- getFunc("abs") + expect_equal(func0$name, "abs") + expect_equal(func0$className, "org.apache.spark.sql.catalyst.expressions.Abs") + expect_true(func0$isTemporary) + + sql("CREATE FUNCTION func1 AS 'org.apache.spark.sql.catalyst.expressions.Add'") + + func1 <- getFunc("spark_catalog.default.func1") + expect_equal(func1$name, "func1") + expect_equal(func1$catalog, "spark_catalog") + expect_equal(length(func1$namespace), 1) + expect_equal(func1$namespace[[1]], "default") + expect_equal(func1$className, "org.apache.spark.sql.catalyst.expressions.Add") + expect_false(func1$isTemporary) + + expect_true(functionExists("func1")) + expect_true(functionExists("default.func1")) + expect_true(functionExists("spark_catalog.default.func1")) + + expect_false(functionExists("func2")) + expect_false(functionExists("default.func2")) + expect_false(functionExists("spark_catalog.default.func2")) + + sql("DROP FUNCTION func1") + + expect_false(functionExists("func1")) + expect_false(functionExists("default.func1")) + expect_false(functionExists("spark_catalog.default.func1")) # recoverPartitions does not work with temporary view expect_error(recoverPartitions("cars"), @@ -4063,7 +4196,26 @@ test_that("catalog APIs, listTables, listColumns, listFunctions", { expect_error(refreshTable("cars"), NA) expect_error(refreshByPath("/"), NA) + view <- getTable("cars") + expect_equal(view$name, "cars") + expect_equal(view$tableType, "TEMPORARY") + expect_true(view$isTemporary) + dropTempView("cars") + + schema <- structType(structField("name", "string"), structField("age", "integer"), + structField("height", "float")) + createTable("default.people", source = "json", schema = schema) + + tbl <- getTable("spark_catalog.default.people") + expect_equal(tbl$name, "people") + expect_equal(tbl$catalog, "spark_catalog") + expect_equal(length(tbl$namespace), 1) + expect_equal(tbl$namespace[[1]], "default") + expect_equal(tbl$tableType, "MANAGED") + expect_false(tbl$isTemporary) + + sql("DROP TABLE IF EXISTS people") }) test_that("assert_true, raise_error", { @@ -4084,6 +4236,54 @@ test_that("assert_true, raise_error", { expect_error(collect(select(filtered, raise_error(filtered$name))), "Justin") }) +test_that("SPARK-41937: check class column for multi-class object works", { + .originalTimeZone <- Sys.getenv("TZ") + Sys.setenv(TZ = "") + temp_time <- as.POSIXlt("2015-03-11 12:13:04.043", tz = "") + sdf <- createDataFrame( + data.frame(x = temp_time + c(-1, 1, -1, 1, -1)), + schema = structType("x timestamp") + ) + expect_warning(collect(filter(sdf, column("x") > temp_time)), NA) + expect_equal(collect(filter(sdf, column("x") > temp_time)), data.frame(x = temp_time + c(1, 1))) + expect_warning(collect(filter(sdf, contains(column("x"), temp_time + 5))), NA) + expect_warning( + collect( + mutate( + sdf, + newcol = otherwise(when(column("x") > lit(temp_time), temp_time), temp_time + 1) + 
) + ), + NA + ) + expect_equal( + collect( + mutate( + sdf, + newcol = otherwise(when(column("x") > lit(temp_time), temp_time), temp_time + 1) + ) + ), + data.frame(x = temp_time + c(-1, 1, -1, 1, -1), newcol = temp_time + c(1, 0, 1, 0, 1)) + ) + expect_error( + collect(fillna(sdf, temp_time)), + "value should be an integer, numeric, character or named list" + ) + expect_error( + collect(fillna(sdf, list(x = temp_time))), + "value should be an integer, numeric or character" + ) + expect_warning( + collect(mutate(sdf, x2 = ifelse(column("x") > temp_time, temp_time + 5, temp_time - 5))), + NA + ) + expect_equal( + collect(mutate(sdf, x2 = ifelse(column("x") > temp_time, temp_time + 5, temp_time - 5))), + data.frame(x = temp_time + c(-1, 1, -1, 1, -1), x2 = temp_time + c(-5, 5, -5, 5, -5)) + ) + Sys.setenv(TZ = .originalTimeZone) +}) + compare_list <- function(list1, list2) { # get testthat to show the diff by first making the 2 lists equal in length expect_equal(length(list1), length(list2)) diff --git a/R/pkg/tests/fulltests/test_streaming.R b/R/pkg/tests/fulltests/test_streaming.R index 6f0d2aefee886..8804471e640cf 100644 --- a/R/pkg/tests/fulltests/test_streaming.R +++ b/R/pkg/tests/fulltests/test_streaming.R @@ -130,7 +130,7 @@ test_that("Specify a schema by using a DDL-formatted string when reading", { stopQuery(q) expect_error(read.stream(path = parquetPath, schema = "name stri"), - "DataType stri is not supported.") + ".*Unsupported data type \"STRI\".*") unlink(parquetPath) }) @@ -140,8 +140,7 @@ test_that("Non-streaming DataFrame", { expect_false(isStreaming(c)) expect_error(write.stream(c, "memory", queryName = "people", outputMode = "complete"), - paste0(".*(writeStream : analysis error - 'writeStream' can be called only on ", - "streaming Dataset/DataFrame).*")) + paste0("Error in writeStream : analysis error - \\[WRITE_STREAM_NOT_ALLOWED\\].*")) }) test_that("Unsupported operation", { diff --git a/R/pkg/tests/fulltests/test_utils.R b/R/pkg/tests/fulltests/test_utils.R index 35f9c9e7bb31e..4d263e5d76509 100644 --- a/R/pkg/tests/fulltests/test_utils.R +++ b/R/pkg/tests/fulltests/test_utils.R @@ -190,7 +190,7 @@ test_that("captureJVMException", { error = function(e) { captureJVMException(e, method) }), - "parse error - .*DataType unknown.*not supported.") + ".*Unsupported data type \"UNKNOWN\".*") }) test_that("hashCode", { diff --git a/R/run-tests.sh b/R/run-tests.sh index 99b7438a80097..90a60eda03871 100755 --- a/R/run-tests.sh +++ b/R/run-tests.sh @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -23,16 +23,16 @@ FAILED=0 LOGFILE=$FWDIR/unit-tests.out rm -f $LOGFILE -SPARK_AVRO_JAR_PATH=$(find $FWDIR/../external/avro/ -name "spark-avro*jar" -print | egrep -v "tests.jar|test-sources.jar|sources.jar|javadoc.jar") +SPARK_AVRO_JAR_PATH=$(find $FWDIR/../connector/avro/ -name "spark-avro*jar" -print | egrep -v "tests.jar|test-sources.jar|sources.jar|javadoc.jar") if [[ $(echo $SPARK_AVRO_JAR_PATH | wc -l) -eq 1 ]]; then SPARK_JARS=$SPARK_AVRO_JAR_PATH fi if [ -z "$SPARK_JARS" ]; then - SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE + SPARK_TESTING=1 NOT_CRAN=true 
$FWDIR/../bin/spark-submit --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE else - SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE + SPARK_TESTING=1 NOT_CRAN=true $FWDIR/../bin/spark-submit --jars $SPARK_JARS --driver-java-options "-Dlog4j.configurationFile=file:$FWDIR/log4j2.properties" --conf spark.hadoop.fs.defaultFS="file:///" --conf spark.driver.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" --conf spark.executor.extraJavaOptions="-Dio.netty.tryReflectionSetAccessible=true -Xss4M" $FWDIR/pkg/tests/run-all.R 2>&1 | tee -a $LOGFILE fi FAILED=$((PIPESTATUS[0]||$FAILED)) diff --git a/README.md b/README.md index dbc0f2ba87ead..310df41f4654b 100644 --- a/README.md +++ b/README.md @@ -9,9 +9,10 @@ and Structured Streaming for stream processing. -[![GitHub Action Build](https://github.com/apache/spark/actions/workflows/build_and_test.yml/badge.svg?branch=master&event=push)](https://github.com/apache/spark/actions/workflows/build_and_test.yml?query=branch%3Amaster+event%3Apush) +[![GitHub Actions Build](https://github.com/apache/spark/actions/workflows/build_main.yml/badge.svg)](https://github.com/apache/spark/actions/workflows/build_main.yml) [![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark) [![PySpark Coverage](https://codecov.io/gh/apache/spark/branch/master/graph/badge.svg)](https://codecov.io/gh/apache/spark) +[![PyPI Downloads](https://static.pepy.tech/personalized-badge/pyspark?period=month&units=international_system&left_color=black&right_color=orange&left_text=PyPI%20downloads)](https://pypi.org/project/pyspark/) ## Online Documentation diff --git a/appveyor.yml b/appveyor.yml index 53ef8527c6555..fdb247d5d4375 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -28,6 +28,7 @@ only_commits: files: - appveyor.yml - dev/appveyor-install-dependencies.ps1 + - build/spark-build-info.ps1 - R/ - sql/core/src/main/scala/org/apache/spark/sql/api/r/ - core/src/main/scala/org/apache/spark/api/r/ @@ -50,10 +51,12 @@ build_script: # See SPARK-28759. # Ideally we should check the tests related to Hive in SparkR as well (SPARK-31745). 
- cmd: set SBT_MAVEN_PROFILES=-Psparkr - - cmd: set SBT_OPTS=-Djna.nosys=true -Dfile.encoding=UTF-8 -Xms4096m -Xms4096m -XX:ReservedCodeCacheSize=128m + - cmd: set SBT_OPTS=-Djna.nosys=true -Dfile.encoding=UTF-8 -XX:ReservedCodeCacheSize=128m + - cmd: set JAVA_OPTS=-Xms4096m -Xms4096m - cmd: sbt package - cmd: set SBT_MAVEN_PROFILES= - cmd: set SBT_OPTS= + - cmd: set JAVA_OPTS= environment: NOT_CRAN: true diff --git a/assembly/pom.xml b/assembly/pom.xml index 32126a5e13820..b09ffdad3ff3e 100644 --- a/assembly/pom.xml +++ b/assembly/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.3.1 + 3.4.1 ../pom.xml @@ -152,6 +152,16 @@ + + connect + + + org.apache.spark + spark-connect_${scala.binary.version} + ${project.version} + + + kubernetes diff --git a/bin/docker-image-tool.sh b/bin/docker-image-tool.sh index ad31bd1e7b7ab..a137a2fba52ee 100755 --- a/bin/docker-image-tool.sh +++ b/bin/docker-image-tool.sh @@ -181,7 +181,7 @@ function build { error "Failed to build Spark JVM Docker image, please refer to Docker build output for details." fi if [ "${CROSS_BUILD}" != "false" ]; then - (cd $(img_ctx_dir base) && docker buildx build $ARCHS $NOCACHEARG "${BUILD_ARGS[@]}" --push \ + (cd $(img_ctx_dir base) && docker buildx build $ARCHS $NOCACHEARG "${BUILD_ARGS[@]}" --push --provenance=false \ -t $(image_ref spark) \ -f "$BASEDOCKERFILE" .) fi @@ -194,7 +194,7 @@ function build { error "Failed to build PySpark Docker image, please refer to Docker build output for details." fi if [ "${CROSS_BUILD}" != "false" ]; then - (cd $(img_ctx_dir pyspark) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" --push \ + (cd $(img_ctx_dir pyspark) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" --push --provenance=false \ -t $(image_ref spark-py) \ -f "$PYDOCKERFILE" .) fi @@ -208,7 +208,7 @@ function build { error "Failed to build SparkR Docker image, please refer to Docker build output for details." fi if [ "${CROSS_BUILD}" != "false" ]; then - (cd $(img_ctx_dir sparkr) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" --push \ + (cd $(img_ctx_dir sparkr) && docker buildx build $ARCHS $NOCACHEARG "${BINDING_BUILD_ARGS[@]}" --push --provenance=false \ -t $(image_ref spark-r) \ -f "$RDOCKERFILE" .) fi @@ -233,7 +233,6 @@ Commands: Options: -f file (Optional) Dockerfile to build for JVM based Jobs. By default builds the Dockerfile shipped with Spark. - For Java 17, use `-f kubernetes/dockerfiles/spark/Dockerfile.java17` -p file (Optional) Dockerfile to build for PySpark Jobs. Builds Python dependencies and ships with Spark. Skips building PySpark docker image if not specified. -R file (Optional) Dockerfile to build for SparkR Jobs. Builds R dependencies and ships with Spark. 
@@ -262,25 +261,21 @@ Examples: $0 -m -t testing build - Build PySpark docker image - $0 -r docker.io/myrepo -t v2.3.0 -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile build + $0 -r docker.io/myrepo -t v3.4.0 -p kubernetes/dockerfiles/spark/bindings/python/Dockerfile build - - Build and push image with tag "v2.3.0" to docker.io/myrepo - $0 -r docker.io/myrepo -t v2.3.0 build - $0 -r docker.io/myrepo -t v2.3.0 push + - Build and push image with tag "v3.4.0" to docker.io/myrepo + $0 -r docker.io/myrepo -t v3.4.0 build + $0 -r docker.io/myrepo -t v3.4.0 push - - Build and push Java11-based image with tag "v3.0.0" to docker.io/myrepo - $0 -r docker.io/myrepo -t v3.0.0 -b java_image_tag=11-jre-slim build - $0 -r docker.io/myrepo -t v3.0.0 push + - Build and push Java11-based image with tag "v3.4.0" to docker.io/myrepo + $0 -r docker.io/myrepo -t v3.4.0 -b java_image_tag=11-jre build + $0 -r docker.io/myrepo -t v3.4.0 push - - Build and push Java11-based image for multiple archs to docker.io/myrepo - $0 -r docker.io/myrepo -t v3.0.0 -X -b java_image_tag=11-jre-slim build + - Build and push image for multiple archs to docker.io/myrepo + $0 -r docker.io/myrepo -t v3.4.0 -X build # Note: buildx, which does cross building, needs to do the push during build # So there is no separate push step with -X - - Build and push Java17-based image with tag "v3.3.0" to docker.io/myrepo - $0 -r docker.io/myrepo -t v3.3.0 -f kubernetes/dockerfiles/spark/Dockerfile.java17 build - $0 -r docker.io/myrepo -t v3.3.0 push - EOF } diff --git a/bin/pyspark b/bin/pyspark index 21a514e5e2c4a..1ae28b1f507cd 100755 --- a/bin/pyspark +++ b/bin/pyspark @@ -50,7 +50,7 @@ export PYSPARK_DRIVER_PYTHON_OPTS # Add the PySpark classes to the Python path: export PYTHONPATH="${SPARK_HOME}/python/:$PYTHONPATH" -export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.5-src.zip:$PYTHONPATH" +export PYTHONPATH="${SPARK_HOME}/python/lib/py4j-0.10.9.7-src.zip:$PYTHONPATH" # Load the PySpark shell.py script when ./pyspark is used interactively: export OLD_PYTHONSTARTUP="$PYTHONSTARTUP" diff --git a/bin/pyspark2.cmd b/bin/pyspark2.cmd index eec02a406b680..232813b4ffdd6 100644 --- a/bin/pyspark2.cmd +++ b/bin/pyspark2.cmd @@ -30,7 +30,7 @@ if "x%PYSPARK_DRIVER_PYTHON%"=="x" ( ) set PYTHONPATH=%SPARK_HOME%\python;%PYTHONPATH% -set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.5-src.zip;%PYTHONPATH% +set PYTHONPATH=%SPARK_HOME%\python\lib\py4j-0.10.9.7-src.zip;%PYTHONPATH% set OLD_PYTHONSTARTUP=%PYTHONSTARTUP% set PYTHONSTARTUP=%SPARK_HOME%\python\pyspark\shell.py diff --git a/bin/spark-class b/bin/spark-class index c1461a7712289..fc343ca29fddd 100755 --- a/bin/spark-class +++ b/bin/spark-class @@ -77,7 +77,8 @@ set +o posix CMD=() DELIM=$'\n' CMD_START_FLAG="false" -while IFS= read -d "$DELIM" -r ARG; do +while IFS= read -d "$DELIM" -r _ARG; do + ARG=${_ARG//$'\r'} if [ "$CMD_START_FLAG" == "true" ]; then CMD+=("$ARG") else diff --git a/bin/spark-class2.cmd b/bin/spark-class2.cmd index 68b271d1d05d9..800ec0c02c22f 100755 --- a/bin/spark-class2.cmd +++ b/bin/spark-class2.cmd @@ -69,6 +69,8 @@ rem SPARK-28302: %RANDOM% would return the same number if we call it instantly a rem so we should make it sure to generate unique file to avoid process collision of writing into rem the same file concurrently. 
if exist %LAUNCHER_OUTPUT% goto :gen +rem unset SHELL to indicate non-bash environment to launcher/Main +set SHELL= "%RUNNER%" -Xmx128m -cp "%LAUNCH_CLASSPATH%" org.apache.spark.launcher.Main %* > %LAUNCHER_OUTPUT% for /f "tokens=*" %%i in (%LAUNCHER_OUTPUT%) do ( set SPARK_CMD=%%i diff --git a/bin/spark-connect-shell b/bin/spark-connect-shell new file mode 100755 index 0000000000000..9026c81e70d81 --- /dev/null +++ b/bin/spark-connect-shell @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# The shell script to start a spark-shell with spark connect enabled. + +if [ -z "${SPARK_HOME}" ]; then + source "$(dirname "$0")"/find-spark-home +fi + +# This requires building the spark with `-Pconnect`, e,g, `build/sbt -Pconnect package` +exec "${SPARK_HOME}"/bin/spark-shell --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin "$@" \ No newline at end of file diff --git a/bin/sparkR b/bin/sparkR index 29ab10df8ab6d..8ecc755839fe3 100755 --- a/bin/sparkR +++ b/bin/sparkR @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more diff --git a/binder/postBuild b/binder/postBuild index 733eafe175ef0..70ae23b393707 100644 --- a/binder/postBuild +++ b/binder/postBuild @@ -1,4 +1,4 @@ -#!/bin/bash +#!/usr/bin/env bash # # Licensed to the Apache Software Foundation (ASF) under one or more @@ -32,11 +32,24 @@ else SPECIFIER="<=" fi -pip install plotly "pyspark[sql,ml,mllib,pandas_on_spark]$SPECIFIER$VERSION" +if [[ ! $VERSION < "3.4.0" ]]; then + pip install plotly "pandas<2.0.0" "pyspark[sql,ml,mllib,pandas_on_spark,connect]$SPECIFIER$VERSION" +else + pip install plotly "pandas<2.0.0" "pyspark[sql,ml,mllib,pandas_on_spark]$SPECIFIER$VERSION" +fi # Set 'PYARROW_IGNORE_TIMEZONE' to surpress warnings from PyArrow. echo "export PYARROW_IGNORE_TIMEZONE=1" >> ~/.profile +# Add sbin to PATH to run `start-connect-server.sh`. +SPARK_HOME=$(python -c "from pyspark.find_spark_home import _find_spark_home; print(_find_spark_home())") +echo "export PATH=${PATH}:${SPARK_HOME}/sbin" >> ~/.profile +echo "export SPARK_HOME=${SPARK_HOME}" >> ~/.profile + +# Add Spark version to env for running command dynamically based on Spark version. +SPARK_VERSION=$(python -c "import pyspark; print(pyspark.__version__)") +echo "export SPARK_VERSION=${SPARK_VERSION}" >> ~/.profile + # Surpress warnings from Spark jobs, and UI progress bar. 
mkdir -p ~/.ipython/profile_default/startup echo """from pyspark.sql import SparkSession diff --git a/build/mvn b/build/mvn index 4989c2d7efd62..aee9358fe44c6 100755 --- a/build/mvn +++ b/build/mvn @@ -36,7 +36,7 @@ _DIR="$( cd "$( dirname "${BASH_SOURCE[0]}" )" && pwd )" # Preserve the calling directory _CALLING_DIR="$(pwd)" # Options used during compilation -_COMPILE_JVM_OPTS="-Xmx2g -XX:ReservedCodeCacheSize=1g -Xss128m" +_COMPILE_JVM_OPTS="-Xss128m -Xmx4g -XX:ReservedCodeCacheSize=128m" # Installs any application tarball given a URL, the expected tarball name, # and, optionally, a checkable binary path to determine if the binary has @@ -119,7 +119,7 @@ install_mvn() { if [ "$MVN_BIN" ]; then local MVN_DETECTED_VERSION="$(mvn --version | head -n1 | awk '{print $3}')" fi - if [ $(version $MVN_DETECTED_VERSION) -lt $(version $MVN_VERSION) ]; then + if [ $(version $MVN_DETECTED_VERSION) -ne $(version $MVN_VERSION) ]; then local MVN_TARBALL="apache-maven-${MVN_VERSION}-bin.tar.gz" local FILE_PATH="maven/maven-3/${MVN_VERSION}/binaries/${MVN_TARBALL}" local APACHE_MIRROR=${APACHE_MIRROR:-'https://www.apache.org/dyn/closer.lua'} @@ -180,6 +180,13 @@ export MAVEN_OPTS=${MAVEN_OPTS:-"$_COMPILE_JVM_OPTS"} echo "Using \`mvn\` from path: $MVN_BIN" 1>&2 +if [ ! -z "${SPARK_LOCAL_HOSTNAME}" ]; then + echo "Using SPARK_LOCAL_HOSTNAME=$SPARK_LOCAL_HOSTNAME" 1>&2 +fi +if [ ! -z "${SPARK_LOCAL_IP}" ]; then + echo "Using SPARK_LOCAL_IP=$SPARK_LOCAL_IP" 1>&2 +fi + # call the `mvn` command as usual # SPARK-25854 "${MVN_BIN}" "$@" diff --git a/build/sbt b/build/sbt index 843d2a026ed64..db9d3b345ff6f 100755 --- a/build/sbt +++ b/build/sbt @@ -133,6 +133,13 @@ saveSttySettings() { fi } +if [ ! -z "${SPARK_LOCAL_HOSTNAME}" ]; then + echo "Using SPARK_LOCAL_HOSTNAME=$SPARK_LOCAL_HOSTNAME" 1>&2 +fi +if [ ! -z "${SPARK_LOCAL_IP}" ]; then + echo "Using SPARK_LOCAL_IP=$SPARK_LOCAL_IP" 1>&2 +fi + saveSttySettings trap onExit INT diff --git a/build/sbt-launch-lib.bash b/build/sbt-launch-lib.bash index 8fb6672bddc4d..01ba6b929f922 100755 --- a/build/sbt-launch-lib.bash +++ b/build/sbt-launch-lib.bash @@ -183,8 +183,8 @@ run() { # run sbt execRunner "$java_cmd" \ - ${SBT_OPTS:-$default_sbt_opts} \ $(get_mem_opts $sbt_mem) \ + ${SBT_OPTS:-$default_sbt_opts} \ ${java_opts} \ ${java_args[@]} \ -jar "$sbt_jar" \ diff --git a/build/spark-build-info b/build/spark-build-info index eb0e3d730e23e..4a4ff9169b3fa 100755 --- a/build/spark-build-info +++ b/build/spark-build-info @@ -24,7 +24,7 @@ RESOURCE_DIR="$1" mkdir -p "$RESOURCE_DIR" -SPARK_BUILD_INFO="${RESOURCE_DIR}"/spark-version-info.properties +SPARK_BUILD_INFO="${RESOURCE_DIR%/}"/spark-version-info.properties echo_build_properties() { echo version=$1 @@ -33,6 +33,7 @@ echo_build_properties() { echo branch=$(git rev-parse --abbrev-ref HEAD) echo date=$(date -u +%Y-%m-%dT%H:%M:%SZ) echo url=$(git config --get remote.origin.url | sed 's|https://\(.*\)@\(.*\)|https://\2|') + echo docroot=https://spark.apache.org/docs/latest } echo_build_properties $2 > "$SPARK_BUILD_INFO" diff --git a/build/spark-build-info.ps1 b/build/spark-build-info.ps1 new file mode 100644 index 0000000000000..43db8823340c6 --- /dev/null +++ b/build/spark-build-info.ps1 @@ -0,0 +1,46 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# This script generates the build info for spark and places it into the spark-version-info.properties file. +# Arguments: +# ResourceDir - The target directory where properties file would be created. [./core/target/extra-resources] +# SparkVersion - The current version of spark + +param( + # The resource directory. + [Parameter(Position = 0)] + [String] + $ResourceDir, + + # The Spark version. + [Parameter(Position = 1)] + [String] + $SparkVersion +) + +$null = New-Item -Type Directory -Force $ResourceDir +$SparkBuildInfoPath = $ResourceDir.TrimEnd('\').TrimEnd('/') + '\spark-version-info.properties' + +$SparkBuildInfoContent = +"version=$SparkVersion +user=$($Env:USERNAME) +revision=$(git rev-parse HEAD) +branch=$(git rev-parse --abbrev-ref HEAD) +date=$([DateTime]::UtcNow | Get-Date -UFormat +%Y-%m-%dT%H:%M:%SZ) +url=$(git config --get remote.origin.url)" + +Set-Content -Path $SparkBuildInfoPath -Value $SparkBuildInfoContent diff --git a/common/kvstore/pom.xml b/common/kvstore/pom.xml index 21bf56094503b..bb5467aa0e7a8 100644 --- a/common/kvstore/pom.xml +++ b/common/kvstore/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.3.1 + 3.4.1 ../../pom.xml @@ -89,7 +89,7 @@ org.apache.logging.log4j - log4j-slf4j-impl + log4j-slf4j2-impl test diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java index 431c7e42774e4..a353a53d4b8d7 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/InMemoryStore.java @@ -468,11 +468,6 @@ public T next() { return iter.next(); } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - @Override public List next(int max) { List list = new ArrayList<>(max); diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java index ff99d052cf7a2..02dd73e1a2f27 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVStoreSerializer.java @@ -49,7 +49,7 @@ public KVStoreSerializer() { this.mapper = new ObjectMapper(); } - public final byte[] serialize(Object o) throws Exception { + public byte[] serialize(Object o) throws Exception { if (o instanceof String) { return ((String) o).getBytes(UTF_8); } else { @@ -62,7 +62,7 @@ public final byte[] serialize(Object o) throws Exception { } @SuppressWarnings("unchecked") - public final T deserialize(byte[] data, Class klass) throws Exception { + public T deserialize(byte[] data, Class klass) throws Exception { if (klass.equals(String.class)) { return (T) new String(data, UTF_8); } else { diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java 
b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java index a7e5831846ad4..a15d07cf59958 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/KVTypeInfo.java @@ -48,7 +48,6 @@ public KVTypeInfo(Class type) { checkIndex(idx, indices); f.setAccessible(true); indices.put(idx.value(), idx); - f.setAccessible(true); accessors.put(idx.value(), new FieldAccessor(f)); } } @@ -61,7 +60,6 @@ public KVTypeInfo(Class type) { "Annotated method %s::%s should not have any parameters.", type.getName(), m.getName()); m.setAccessible(true); indices.put(idx.value(), idx); - m.setAccessible(true); accessors.put(idx.value(), new MethodAccessor(m)); } } diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java index 6b28373a48065..b50906e2cbac4 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDB.java @@ -270,10 +270,14 @@ public boolean removeAllByIndexValues( KVStoreView view = view(klass).index(index); for (Object indexValue : indexValues) { - for (T value: view.first(indexValue).last(indexValue)) { - Object itemKey = naturalIndex.getValue(value); - delete(klass, itemKey); - removed = true; + try (KVStoreIterator iterator = + view.first(indexValue).last(indexValue).closeableIterator()) { + while (iterator.hasNext()) { + T value = iterator.next(); + Object itemKey = naturalIndex.getValue(value); + delete(klass, itemKey); + removed = true; + } } } diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java index e8fb4fac5ba17..35d0c6065fb0f 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/LevelDBIterator.java @@ -143,11 +143,6 @@ public T next() { } } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - @Override public List next(int max) { List list = new ArrayList<>(max); @@ -159,6 +154,8 @@ public List next(int max) { @Override public boolean skip(long n) { + if (closed) return false; + long skipped = 0; while (skipped < n) { if (next != null) { @@ -189,6 +186,7 @@ public synchronized void close() throws IOException { if (!closed) { it.close(); closed = true; + next = null; } } diff --git a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java index 7674bc52dc750..d328e5c79d341 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDB.java @@ -303,10 +303,14 @@ public boolean removeAllByIndexValues( KVStoreView view = view(klass).index(index); for (Object indexValue : indexValues) { - for (T value: view.first(indexValue).last(indexValue)) { - Object itemKey = naturalIndex.getValue(value); - delete(klass, itemKey); - removed = true; + try (KVStoreIterator iterator = + view.first(indexValue).last(indexValue).closeableIterator()) { + while (iterator.hasNext()) { + T value = iterator.next(); + Object itemKey = naturalIndex.getValue(value); + delete(klass, itemKey); + removed = true; + } } } diff --git 
a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java index 1db47f4dad00a..2b12fddef6583 100644 --- a/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java +++ b/common/kvstore/src/main/java/org/apache/spark/util/kvstore/RocksDBIterator.java @@ -134,11 +134,6 @@ public T next() { } } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - @Override public List next(int max) { List list = new ArrayList<>(max); @@ -150,6 +145,8 @@ public List next(int max) { @Override public boolean skip(long n) { + if(closed) return false; + long skipped = 0; while (skipped < n) { if (next != null) { @@ -183,6 +180,7 @@ public synchronized void close() throws IOException { if (!closed) { it.close(); closed = true; + next = null; } } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java index ab1e27285853e..223f3f93a8790 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/DBIteratorSuite.java @@ -490,11 +490,15 @@ private void compareLists(Iterable expected, List actual) { } private KVStoreView view() throws Exception { + // SPARK-38896: this `view` will be closed in + // the `collect(KVStoreView view)` method. return db.view(CustomType1.class); } private List collect(KVStoreView view) throws Exception { - return Arrays.asList(Iterables.toArray(view, CustomType1.class)); + try (KVStoreIterator iterator = view.closeableIterator()) { + return Lists.newArrayList(iterator); + } } private List sortBy(Comparator comp) { diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java index 35656fb12238a..b2acd1ae15b16 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/InMemoryStoreSuite.java @@ -34,24 +34,14 @@ public void testObjectWriteReadDelete() throws Exception { t.id = "id"; t.name = "name"; - try { - store.read(CustomType1.class, t.key); - fail("Expected exception for non-existent object."); - } catch (NoSuchElementException nsee) { - // Expected. - } + assertThrows(NoSuchElementException.class, () -> store.read(CustomType1.class, t.key)); store.write(t); assertEquals(t, store.read(t.getClass(), t.key)); assertEquals(1L, store.count(t.getClass())); store.delete(t.getClass(), t.key); - try { - store.read(t.getClass(), t.key); - fail("Expected exception for deleted object."); - } catch (NoSuchElementException nsee) { - // Expected. - } + assertThrows(NoSuchElementException.class, () -> store.read(t.getClass(), t.key)); } @Test @@ -78,12 +68,7 @@ public void testMultipleObjectWriteReadDelete() throws Exception { store.delete(t1.getClass(), t1.key); assertEquals(t2, store.read(t2.getClass(), t2.key)); store.delete(t2.getClass(), t2.key); - try { - store.read(t2.getClass(), t2.key); - fail("Expected exception for deleted object."); - } catch (NoSuchElementException nsee) { - // Expected. 
- } + assertThrows(NoSuchElementException.class, () -> store.read(t2.getClass(), t2.key)); } @Test @@ -159,25 +144,25 @@ public void testRemoveAll() throws Exception { assertEquals(9, store.count(ArrayKeyIndexType.class)); // Try removing non-existing keys - assert(!store.removeAllByIndexValues( + assertFalse(store.removeAllByIndexValues( ArrayKeyIndexType.class, KVIndex.NATURAL_INDEX_NAME, ImmutableSet.of(new int[] {10, 10, 10}, new int[] { 3, 3, 3 }))); assertEquals(9, store.count(ArrayKeyIndexType.class)); - assert(store.removeAllByIndexValues( + assertTrue(store.removeAllByIndexValues( ArrayKeyIndexType.class, KVIndex.NATURAL_INDEX_NAME, ImmutableSet.of(new int[] {0, 0, 0}, new int[] { 2, 2, 2 }))); assertEquals(7, store.count(ArrayKeyIndexType.class)); - assert(store.removeAllByIndexValues( + assertTrue(store.removeAllByIndexValues( ArrayKeyIndexType.class, "id", ImmutableSet.of(new String [] { "things" }))); assertEquals(4, store.count(ArrayKeyIndexType.class)); - assert(store.removeAllByIndexValues( + assertTrue(store.removeAllByIndexValues( ArrayKeyIndexType.class, "id", ImmutableSet.of(new String [] { "more things" }))); diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java index f2a91f916a309..9082e1887bf85 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBBenchmark.java @@ -197,9 +197,15 @@ private void iterate(KVStoreView view, String name) throws Exception { } } - while (it.hasNext()) { - try(Timer.Context ctx = iter.time()) { - it.next(); + try { + while (it.hasNext()) { + try (Timer.Context ctx = iter.time()) { + it.next(); + } + } + } finally { + if (it != null) { + it.close(); } } } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java index c43c9b171f5a4..86f65e9be895f 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBSuite.java @@ -22,6 +22,7 @@ import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; +import java.util.Spliterators; import java.util.stream.Collectors; import java.util.stream.StreamSupport; @@ -71,36 +72,21 @@ public void testReopenAndVersionCheckDb() throws Exception { db.close(); db = null; - try { - db = new LevelDB(dbpath); - fail("Should have failed version check."); - } catch (UnsupportedStoreVersionException e) { - // Expected. - } + assertThrows(UnsupportedStoreVersionException.class, () -> db = new LevelDB(dbpath)); } @Test public void testObjectWriteReadDelete() throws Exception { CustomType1 t = createCustomType1(1); - try { - db.read(CustomType1.class, t.key); - fail("Expected exception for non-existent object."); - } catch (NoSuchElementException nsee) { - // Expected. - } + assertThrows(NoSuchElementException.class, () -> db.read(CustomType1.class, t.key)); db.write(t); assertEquals(t, db.read(t.getClass(), t.key)); assertEquals(1L, db.count(t.getClass())); db.delete(t.getClass(), t.key); - try { - db.read(t.getClass(), t.key); - fail("Expected exception for deleted object."); - } catch (NoSuchElementException nsee) { - // Expected. 
- } + assertThrows(NoSuchElementException.class, () -> db.read(t.getClass(), t.key)); // Look into the actual DB and make sure that all the keys related to the type have been // removed. @@ -251,13 +237,14 @@ public void testSkip() throws Exception { db.write(createCustomType1(i)); } - KVStoreIterator it = db.view(CustomType1.class).closeableIterator(); - assertTrue(it.hasNext()); - assertTrue(it.skip(5)); - assertEquals("key5", it.next().key); - assertTrue(it.skip(3)); - assertEquals("key9", it.next().key); - assertFalse(it.hasNext()); + try (KVStoreIterator it = db.view(CustomType1.class).closeableIterator()) { + assertTrue(it.hasNext()); + assertTrue(it.skip(5)); + assertEquals("key5", it.next().key); + assertTrue(it.skip(3)); + assertEquals("key9", it.next().key); + assertFalse(it.hasNext()); + } } @Test @@ -272,12 +259,15 @@ public void testNegativeIndexValues() throws Exception { } }); - List results = StreamSupport - .stream(db.view(CustomType1.class).index("int").spliterator(), false) - .map(e -> e.num) - .collect(Collectors.toList()); + try (KVStoreIterator iterator = + db.view(CustomType1.class).index("int").closeableIterator()) { + List results = StreamSupport + .stream(Spliterators.spliteratorUnknownSize(iterator, 0), false) + .map(e -> e.num) + .collect(Collectors.toList()); - assertEquals(expected, results); + assertEquals(expected, results); + } } @Test @@ -315,6 +305,84 @@ public void testCloseLevelDBIterator() throws Exception { assertTrue(!dbPathForCloseTest.exists()); } + @Test + public void testHasNextAfterIteratorClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter should be true + assertTrue(iter.hasNext()); + // close iter + iter.close(); + // iter.hasNext should be false after iter close + assertFalse(iter.hasNext()); + } + + @Test + public void testHasNextAfterDBClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter should be true + assertTrue(iter.hasNext()); + // close db + db.close(); + // iter.hasNext should be false after db close + assertFalse(iter.hasNext()); + } + + @Test + public void testNextAfterIteratorClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter should be true + assertTrue(iter.hasNext()); + // close iter + iter.close(); + // iter.next should throw NoSuchElementException after iter close + assertThrows(NoSuchElementException.class, iter::next); + } + + @Test + public void testNextAfterDBClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter should be true + assertTrue(iter.hasNext()); + // close db + iter.close(); + // iter.next should throw NoSuchElementException after db close + assertThrows(NoSuchElementException.class, iter::next); + } + + @Test + public void testSkipAfterIteratorClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // close iter + iter.close(); + // skip should always return false after iter close + assertFalse(iter.skip(0)); + assertFalse(iter.skip(1)); + } + + @Test + public void testSkipAfterDBClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter should be true + 
assertTrue(iter.hasNext()); + // close db + db.close(); + // skip should always return false after db close + assertFalse(iter.skip(0)); + assertFalse(iter.skip(1)); + } + private CustomType1 createCustomType1(int i) { CustomType1 t = new CustomType1(); t.key = "key" + i; diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBTypeInfoSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBTypeInfoSuite.java index 38db3bedaef6a..0359e11404cd4 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBTypeInfoSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/LevelDBTypeInfoSuite.java @@ -43,34 +43,40 @@ public void testIndexAnnotation() throws Exception { assertEquals(t1.child, ti.getIndexValue("child", t1)); } - @Test(expected = IllegalArgumentException.class) - public void testNoNaturalIndex() throws Exception { - newTypeInfo(NoNaturalIndex.class); + @Test + public void testNoNaturalIndex() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(NoNaturalIndex.class)); } - @Test(expected = IllegalArgumentException.class) - public void testNoNaturalIndex2() throws Exception { - newTypeInfo(NoNaturalIndex2.class); + @Test + public void testNoNaturalIndex2() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(NoNaturalIndex2.class)); } - @Test(expected = IllegalArgumentException.class) - public void testDuplicateIndex() throws Exception { - newTypeInfo(DuplicateIndex.class); + @Test + public void testDuplicateIndex() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(DuplicateIndex.class)); } - @Test(expected = IllegalArgumentException.class) - public void testEmptyIndexName() throws Exception { - newTypeInfo(EmptyIndexName.class); + @Test + public void testEmptyIndexName() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(EmptyIndexName.class)); } - @Test(expected = IllegalArgumentException.class) - public void testIllegalIndexName() throws Exception { - newTypeInfo(IllegalIndexName.class); + @Test + public void testIllegalIndexName() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(IllegalIndexName.class)); } - @Test(expected = IllegalArgumentException.class) - public void testIllegalIndexMethod() throws Exception { - newTypeInfo(IllegalIndexMethod.class); + @Test + public void testIllegalIndexMethod() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(IllegalIndexMethod.class)); } @Test diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java index 4517a47b32f6b..25930bb1013d9 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBBenchmark.java @@ -196,10 +196,15 @@ private void iterate(KVStoreView view, String name) throws Exception { } } } - - while (it.hasNext()) { - try(Timer.Context ctx = iter.time()) { - it.next(); + try { + while (it.hasNext()) { + try (Timer.Context ctx = iter.time()) { + it.next(); + } + } + } finally { + if (it != null) { + it.close(); } } } diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBIteratorSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBIteratorSuite.java index d4bfc7e0413ab..5450f6531d60c 100644 --- 
a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBIteratorSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBIteratorSuite.java @@ -20,11 +20,8 @@ import java.io.File; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.SystemUtils; import org.junit.AfterClass; -import static org.junit.Assume.assumeFalse; - public class RocksDBIteratorSuite extends DBIteratorSuite { private static File dbpath; @@ -42,7 +39,6 @@ public static void cleanup() throws Exception { @Override protected KVStore createStore() throws Exception { - assumeFalse(SystemUtils.IS_OS_MAC_OSX && SystemUtils.OS_ARCH.equals("aarch64")); dbpath = File.createTempFile("test.", ".rdb"); dbpath.delete(); db = new RocksDB(dbpath); diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBSuite.java index cd18d227cba72..602ab2d6881a3 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBSuite.java @@ -22,19 +22,18 @@ import java.util.Iterator; import java.util.List; import java.util.NoSuchElementException; +import java.util.Spliterators; import java.util.stream.Collectors; import java.util.stream.StreamSupport; import com.google.common.collect.ImmutableSet; import org.apache.commons.io.FileUtils; -import org.apache.commons.lang3.SystemUtils; import org.junit.After; import org.junit.Before; import org.junit.Test; import org.rocksdb.RocksIterator; import static org.junit.Assert.*; -import static org.junit.Assume.assumeFalse; public class RocksDBSuite { @@ -53,7 +52,6 @@ public void cleanup() throws Exception { @Before public void setup() throws Exception { - assumeFalse(SystemUtils.IS_OS_MAC_OSX && SystemUtils.OS_ARCH.equals("aarch64")); dbpath = File.createTempFile("test.", ".rdb"); dbpath.delete(); db = new RocksDB(dbpath); @@ -72,36 +70,21 @@ public void testReopenAndVersionCheckDb() throws Exception { db.close(); db = null; - try { - db = new RocksDB(dbpath); - fail("Should have failed version check."); - } catch (UnsupportedStoreVersionException e) { - // Expected. - } + assertThrows(UnsupportedStoreVersionException.class, () -> db = new RocksDB(dbpath)); } @Test public void testObjectWriteReadDelete() throws Exception { CustomType1 t = createCustomType1(1); - try { - db.read(CustomType1.class, t.key); - fail("Expected exception for non-existent object."); - } catch (NoSuchElementException nsee) { - // Expected. - } + assertThrows(NoSuchElementException.class, () -> db.read(CustomType1.class, t.key)); db.write(t); assertEquals(t, db.read(t.getClass(), t.key)); assertEquals(1L, db.count(t.getClass())); db.delete(t.getClass(), t.key); - try { - db.read(t.getClass(), t.key); - fail("Expected exception for deleted object."); - } catch (NoSuchElementException nsee) { - // Expected. - } + assertThrows(NoSuchElementException.class, () -> db.read(t.getClass(), t.key)); // Look into the actual DB and make sure that all the keys related to the type have been // removed. 
@@ -252,13 +235,14 @@ public void testSkip() throws Exception { db.write(createCustomType1(i)); } - KVStoreIterator it = db.view(CustomType1.class).closeableIterator(); - assertTrue(it.hasNext()); - assertTrue(it.skip(5)); - assertEquals("key5", it.next().key); - assertTrue(it.skip(3)); - assertEquals("key9", it.next().key); - assertFalse(it.hasNext()); + try (KVStoreIterator it = db.view(CustomType1.class).closeableIterator()) { + assertTrue(it.hasNext()); + assertTrue(it.skip(5)); + assertEquals("key5", it.next().key); + assertTrue(it.skip(3)); + assertEquals("key9", it.next().key); + assertFalse(it.hasNext()); + } } @Test @@ -273,12 +257,15 @@ public void testNegativeIndexValues() throws Exception { } }); - List results = StreamSupport - .stream(db.view(CustomType1.class).index("int").spliterator(), false) - .map(e -> e.num) - .collect(Collectors.toList()); + try (KVStoreIterator iterator = + db.view(CustomType1.class).index("int").closeableIterator()) { + List results = StreamSupport + .stream(Spliterators.spliteratorUnknownSize(iterator, 0), false) + .map(e -> e.num) + .collect(Collectors.toList()); - assertEquals(expected, results); + assertEquals(expected, results); + } } @Test @@ -316,6 +303,84 @@ public void testCloseRocksDBIterator() throws Exception { assertTrue(!dbPathForCloseTest.exists()); } + @Test + public void testHasNextAfterIteratorClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter.hasNext should be true + assertTrue(iter.hasNext()); + // close iter + iter.close(); + // iter.hasNext should be false after iter close + assertFalse(iter.hasNext()); + } + + @Test + public void testHasNextAfterDBClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter.hasNext should be true + assertTrue(iter.hasNext()); + // close db + db.close(); + // iter.hasNext should be false after db close + assertFalse(iter.hasNext()); + } + + @Test + public void testNextAfterIteratorClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter.hasNext should be true + assertTrue(iter.hasNext()); + // close iter + iter.close(); + // iter.next should throw NoSuchElementException after iter close + assertThrows(NoSuchElementException.class, iter::next); + } + + @Test + public void testNextAfterDBClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter.hasNext should be true + assertTrue(iter.hasNext()); + // close db + db.close(); + // iter.next should throw NoSuchElementException after db close + assertThrows(NoSuchElementException.class, iter::next); + } + + @Test + public void testSkipAfterIteratorClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // close iter + iter.close(); + // skip should always return false after iter close + assertFalse(iter.skip(0)); + assertFalse(iter.skip(1)); + } + + @Test + public void testSkipAfterDBClose() throws Exception { + db.write(createCustomType1(0)); + KVStoreIterator iter = + db.view(CustomType1.class).closeableIterator(); + // iter.hasNext should be true + assertTrue(iter.hasNext()); + // close db + db.close(); + // skip should always return false after db close + assertFalse(iter.skip(0)); + assertFalse(iter.skip(1)); + } + private CustomType1
createCustomType1(int i) { CustomType1 t = new CustomType1(); t.key = "key" + i; diff --git a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBTypeInfoSuite.java b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBTypeInfoSuite.java index a51fd1a7fea58..f694fd36b68b3 100644 --- a/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBTypeInfoSuite.java +++ b/common/kvstore/src/test/java/org/apache/spark/util/kvstore/RocksDBTypeInfoSuite.java @@ -43,34 +43,40 @@ public void testIndexAnnotation() throws Exception { assertEquals(t1.child, ti.getIndexValue("child", t1)); } - @Test(expected = IllegalArgumentException.class) - public void testNoNaturalIndex() throws Exception { - newTypeInfo(NoNaturalIndex.class); + @Test + public void testNoNaturalIndex() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(NoNaturalIndex.class)); } - @Test(expected = IllegalArgumentException.class) - public void testNoNaturalIndex2() throws Exception { - newTypeInfo(NoNaturalIndex2.class); + @Test + public void testNoNaturalIndex2() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(NoNaturalIndex2.class)); } - @Test(expected = IllegalArgumentException.class) - public void testDuplicateIndex() throws Exception { - newTypeInfo(DuplicateIndex.class); + @Test + public void testDuplicateIndex() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(DuplicateIndex.class)); } - @Test(expected = IllegalArgumentException.class) - public void testEmptyIndexName() throws Exception { - newTypeInfo(EmptyIndexName.class); + @Test + public void testEmptyIndexName() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(EmptyIndexName.class)); } - @Test(expected = IllegalArgumentException.class) - public void testIllegalIndexName() throws Exception { - newTypeInfo(IllegalIndexName.class); + @Test + public void testIllegalIndexName() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(IllegalIndexName.class)); } - @Test(expected = IllegalArgumentException.class) - public void testIllegalIndexMethod() throws Exception { - newTypeInfo(IllegalIndexMethod.class); + @Test + public void testIllegalIndexMethod() { + assertThrows(IllegalArgumentException.class, + () -> newTypeInfo(IllegalIndexMethod.class)); } @Test diff --git a/common/network-common/pom.xml b/common/network-common/pom.xml index 43740354d84d1..aa8efeb8143e0 100644 --- a/common/network-common/pom.xml +++ b/common/network-common/pom.xml @@ -22,7 +22,7 @@ org.apache.spark spark-parent_2.12 - 3.3.1 + 3.4.1 ../../pom.xml @@ -42,20 +42,46 @@ + io.netty netty-all + + io.netty + netty-transport-native-epoll + linux-x86_64 + + + io.netty + netty-transport-native-epoll + linux-aarch_64 + + + io.netty + netty-transport-native-kqueue + osx-aarch_64 + + + io.netty + netty-transport-native-kqueue + osx-x86_64 + + + org.apache.commons commons-lang3 - ${leveldbjni.group} leveldbjni-all 1.8 + + org.rocksdb + rocksdbjni + com.fasterxml.jackson.core @@ -118,14 +144,13 @@ org.apache.logging.log4j - log4j-slf4j-impl + log4j-slf4j2-impl test org.apache.spark spark-tags_${scala.binary.version} - test + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + spark-avro_2.12 + + avro + + jar + Spark Avro + https://spark.apache.org/ + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + provided + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + 
spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.apache.spark + spark-tags_${scala.binary.version} + + + + org.tukaani + xz + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + diff --git a/external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java b/connector/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java similarity index 93% rename from external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java rename to connector/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java index a4555844b5117..b2a57060fc2d9 100644 --- a/external/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java +++ b/connector/avro/src/main/java/org/apache/spark/sql/avro/SparkAvroKeyOutputFormat.java @@ -25,6 +25,7 @@ import org.apache.avro.file.CodecFactory; import org.apache.avro.file.DataFileWriter; import org.apache.avro.generic.GenericData; +import org.apache.avro.generic.GenericDatumWriter; import org.apache.avro.generic.GenericRecord; import org.apache.avro.mapred.AvroKey; import org.apache.avro.mapreduce.AvroKeyOutputFormat; @@ -46,13 +47,14 @@ static class SparkRecordWriterFactory extends RecordWriterFactory this.metadata = metadata; } + @Override protected RecordWriter, NullWritable> create( Schema writerSchema, GenericData dataModel, CodecFactory compressionCodec, OutputStream outputStream, int syncInterval) throws IOException { - return new SparkAvroKeyRecordWriter( + return new SparkAvroKeyRecordWriter<>( writerSchema, dataModel, compressionCodec, outputStream, syncInterval, metadata); } } @@ -71,7 +73,7 @@ class SparkAvroKeyRecordWriter extends RecordWriter, NullWritable> OutputStream outputStream, int syncInterval, Map metadata) throws IOException { - this.mAvroFileWriter = new DataFileWriter(dataModel.createDatumWriter(writerSchema)); + this.mAvroFileWriter = new DataFileWriter<>(new GenericDatumWriter<>(writerSchema, dataModel)); for (Map.Entry entry : metadata.entrySet()) { this.mAvroFileWriter.setMeta(entry.getKey(), entry.getValue()); } @@ -80,14 +82,17 @@ class SparkAvroKeyRecordWriter extends RecordWriter, NullWritable> this.mAvroFileWriter.create(writerSchema, outputStream); } + @Override public void write(AvroKey record, NullWritable ignore) throws IOException { this.mAvroFileWriter.append(record.datum()); } + @Override public void close(TaskAttemptContext context) throws IOException { this.mAvroFileWriter.close(); } + @Override public long sync() throws IOException { return this.mAvroFileWriter.sync(); } diff --git a/external/avro/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/connector/avro/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister similarity index 100% rename from external/avro/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister rename to connector/avro/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala similarity index 100% rename from 
external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDataToCatalyst.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala similarity index 99% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala index 1192856ae7796..aac979cddb2dd 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroDeserializer.scala @@ -29,7 +29,7 @@ import org.apache.avro.Schema.Type._ import org.apache.avro.generic._ import org.apache.avro.util.Utf8 -import org.apache.spark.sql.avro.AvroUtils.{toFieldStr, AvroMatchedField} +import org.apache.spark.sql.avro.AvroUtils.{nonNullUnionBranches, toFieldStr, AvroMatchedField} import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters, StructFilters} import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} @@ -289,8 +289,7 @@ private[sql] class AvroDeserializer( updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray)) case (UNION, _) => - val allTypes = avroType.getTypes.asScala - val nonNullTypes = allTypes.filter(_.getType != NULL) + val nonNullTypes = nonNullUnionBranches(avroType) val nonNullAvroType = Schema.createUnion(nonNullTypes.asJava) if (nonNullTypes.nonEmpty) { if (nonNullTypes.length == 1) { diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala similarity index 97% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala index a13e0624f351d..3e16e12108129 100755 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroFileFormat.scala @@ -18,7 +18,6 @@ package org.apache.spark.sql.avro import java.io._ -import java.net.URI import scala.util.control.NonFatal @@ -96,9 +95,9 @@ private[sql] class AvroFileFormat extends FileFormat // Doing input file filtering is improper because we may generate empty tasks that process no // input files but stress the scheduler. We should probably add a more general input file // filtering mechanism for `FileFormat` data sources. See SPARK-16317. 
- if (parsedOptions.ignoreExtension || file.filePath.endsWith(".avro")) { + if (parsedOptions.ignoreExtension || file.urlEncodedPath.endsWith(".avro")) { val reader = { - val in = new FsInput(new Path(new URI(file.filePath)), conf) + val in = new FsInput(file.toPath, conf) try { val datumReader = userProvidedSchema match { case Some(userSchema) => new GenericDatumReader[GenericRecord](userSchema) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala similarity index 78% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala index fec2b77773ddc..95001bb81508c 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOptions.scala @@ -25,6 +25,7 @@ import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.{DataSourceOptions, FileSourceOptions} import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, FailFastMode, ParseMode} import org.apache.spark.sql.internal.SQLConf @@ -33,7 +34,10 @@ import org.apache.spark.sql.internal.SQLConf */ private[sql] class AvroOptions( @transient val parameters: CaseInsensitiveMap[String], - @transient val conf: Configuration) extends Logging with Serializable { + @transient val conf: Configuration) + extends FileSourceOptions(parameters) with Logging { + + import AvroOptions._ def this(parameters: Map[String, String], conf: Configuration) = { this(CaseInsensitiveMap(parameters), conf) @@ -52,8 +56,8 @@ private[sql] class AvroOptions( * instead of "string" type in the default converted schema. */ val schema: Option[Schema] = { - parameters.get("avroSchema").map(new Schema.Parser().setValidateDefaults(false).parse).orElse({ - val avroUrlSchema = parameters.get("avroSchemaUrl").map(url => { + parameters.get(AVRO_SCHEMA).map(new Schema.Parser().setValidateDefaults(false).parse).orElse({ + val avroUrlSchema = parameters.get(AVRO_SCHEMA_URL).map(url => { log.debug("loading avro schema from url: " + url) val fs = FileSystem.get(new URI(url), conf) val in = fs.open(new Path(url)) @@ -73,20 +77,20 @@ private[sql] class AvroOptions( * whose field names do not match. Defaults to false. */ val positionalFieldMatching: Boolean = - parameters.get("positionalFieldMatching").exists(_.toBoolean) + parameters.get(POSITIONAL_FIELD_MATCHING).exists(_.toBoolean) /** * Top level record name in write result, which is required in Avro spec. - * See https://avro.apache.org/docs/1.11.0/spec.html#schema_record . + * See https://avro.apache.org/docs/1.11.1/specification/#schema-record . * Default value is "topLevelRecord" */ - val recordName: String = parameters.getOrElse("recordName", "topLevelRecord") + val recordName: String = parameters.getOrElse(RECORD_NAME, "topLevelRecord") /** * Record namespace in write result. Default value is "". - * See Avro spec for details: https://avro.apache.org/docs/1.11.0/spec.html#schema_record . + * See Avro spec for details: https://avro.apache.org/docs/1.11.1/specification/#schema-record . 
*/ - val recordNamespace: String = parameters.getOrElse("recordNamespace", "") + val recordNamespace: String = parameters.getOrElse(RECORD_NAMESPACE, "") /** * The `ignoreExtension` option controls ignoring of files without `.avro` extensions in read. @@ -102,7 +106,7 @@ private[sql] class AvroOptions( ignoreFilesWithoutExtensionByDefault) parameters - .get(AvroOptions.ignoreExtensionKey) + .get(IGNORE_EXTENSION) .map(_.toBoolean) .getOrElse(!ignoreFilesWithoutExtension) } @@ -114,21 +118,21 @@ private[sql] class AvroOptions( * taken into account. If the former one is not set too, the `snappy` codec is used by default. */ val compression: String = { - parameters.get("compression").getOrElse(SQLConf.get.avroCompressionCodec) + parameters.get(COMPRESSION).getOrElse(SQLConf.get.avroCompressionCodec) } val parseMode: ParseMode = - parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode) + parameters.get(MODE).map(ParseMode.fromString).getOrElse(FailFastMode) /** * The rebasing mode for the DATE and TIMESTAMP_MICROS, TIMESTAMP_MILLIS values in reads. */ val datetimeRebaseModeInRead: String = parameters - .get(AvroOptions.DATETIME_REBASE_MODE) + .get(DATETIME_REBASE_MODE) .getOrElse(SQLConf.get.getConf(SQLConf.AVRO_REBASE_MODE_IN_READ)) } -private[sql] object AvroOptions { +private[sql] object AvroOptions extends DataSourceOptions { def apply(parameters: Map[String, String]): AvroOptions = { val hadoopConf = SparkSession .getActiveSession @@ -137,11 +141,17 @@ private[sql] object AvroOptions { new AvroOptions(CaseInsensitiveMap(parameters), hadoopConf) } - val ignoreExtensionKey = "ignoreExtension" - + val IGNORE_EXTENSION = newOption("ignoreExtension") + val MODE = newOption("mode") + val RECORD_NAME = newOption("recordName") + val COMPRESSION = newOption("compression") + val AVRO_SCHEMA = newOption("avroSchema") + val AVRO_SCHEMA_URL = newOption("avroSchemaUrl") + val RECORD_NAMESPACE = newOption("recordNamespace") + val POSITIONAL_FIELD_MATCHING = newOption("positionalFieldMatching") // The option controls rebasing of the DATE and TIMESTAMP values between // Julian and Proleptic Gregorian calendars. It impacts on the behaviour of the Avro // datasource similarly to the SQL config `spark.sql.avro.datetimeRebaseModeInRead`, // and can be set to the same values: `EXCEPTION`, `LEGACY` or `CORRECTED`. 
- val DATETIME_REBASE_MODE = "datetimeRebaseMode" + val DATETIME_REBASE_MODE = newOption("datetimeRebaseMode") } diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriter.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriter.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriter.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriter.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriterFactory.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriterFactory.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriterFactory.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroOutputWriterFactory.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala similarity index 78% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala index 4a82df6ba0dce..c95d731f0dedd 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroSerializer.scala @@ -32,7 +32,7 @@ import org.apache.avro.generic.GenericData.Record import org.apache.avro.util.Utf8 import org.apache.spark.internal.Logging -import org.apache.spark.sql.avro.AvroUtils.{toFieldStr, AvroMatchedField} +import org.apache.spark.sql.avro.AvroUtils.{nonNullUnionBranches, toFieldStr, AvroMatchedField} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{SpecializedGetters, SpecificInternalRow} import org.apache.spark.sql.catalyst.util.DateTimeUtils @@ -218,6 +218,17 @@ private[sql] class AvroSerializer( val numFields = st.length (getter, ordinal) => structConverter(getter.getStruct(ordinal, numFields)) + case (st: StructType, UNION) => + val unionConvertor = newComplexUnionConverter(st, avroType, catalystPath, avroPath) + val numFields = st.length + (getter, ordinal) => unionConvertor(getter.getStruct(ordinal, numFields)) + + case (DoubleType, UNION) if nonNullUnionTypes(avroType) == Set(FLOAT, DOUBLE) => + (getter, ordinal) => getter.getDouble(ordinal) + + case (LongType, UNION) if nonNullUnionTypes(avroType) == Set(INT, LONG) => + (getter, ordinal) => getter.getLong(ordinal) + case (MapType(kt, vt, valueContainsNull), MAP) if kt == StringType => val valueConverter = newConverter( vt, resolveNullableType(avroType.getValueType, valueContainsNull), @@ -287,14 +298,59 @@ private[sql] class AvroSerializer( result } + /** + * Complex unions map to struct types where field names are member0, member1, etc. + * This is consistent with the behavior in [[SchemaConverters]] and when converting between Avro + * and Parquet. 
+ */ + private def newComplexUnionConverter( + catalystStruct: StructType, + unionType: Schema, + catalystPath: Seq[String], + avroPath: Seq[String]): InternalRow => Any = { + val nonNullTypes = nonNullUnionBranches(unionType) + val expectedFieldNames = nonNullTypes.indices.map(i => s"member$i") + val catalystFieldNames = catalystStruct.fieldNames.toSeq + if (positionalFieldMatch) { + if (expectedFieldNames.length != catalystFieldNames.length) { + throw new IncompatibleSchemaException(s"Generic Avro union at ${toFieldStr(avroPath)} " + + s"does not match the SQL schema at ${toFieldStr(catalystPath)}. It expected the " + + s"${expectedFieldNames.length} members but got ${catalystFieldNames.length}") + } + } else { + if (catalystFieldNames != expectedFieldNames) { + throw new IncompatibleSchemaException(s"Generic Avro union at ${toFieldStr(avroPath)} " + + s"does not match the SQL schema at ${toFieldStr(catalystPath)}. It expected the " + + s"following members ${expectedFieldNames.mkString("(", ", ", ")")} but got " + + s"${catalystFieldNames.mkString("(", ", ", ")")}") + } + } + + val unionBranchConverters = nonNullTypes.zip(catalystStruct).map { case (unionBranch, cf) => + newConverter(cf.dataType, unionBranch, catalystPath :+ cf.name, avroPath :+ cf.name) + }.toArray + + val numBranches = catalystStruct.length + row: InternalRow => { + var idx = 0 + var retVal: Any = null + while (idx < numBranches && retVal == null) { + if (!row.isNullAt(idx)) { + retVal = unionBranchConverters(idx).apply(row, idx) + } + idx += 1 + } + retVal + } + } + /** * Resolve a possibly nullable Avro Type. * - * An Avro type is nullable when it is a [[UNION]] of two types: one null type and another - * non-null type. This method will check the nullability of the input Avro type and return the - * non-null type within when it is nullable. Otherwise it will return the input Avro type - * unchanged. It will throw an [[UnsupportedAvroTypeException]] when the input Avro type is an - * unsupported nullable type. + * An Avro type is nullable when it is a [[UNION]] which contains a null type. This method will + * check the nullability of the input Avro type. + * Returns the non-null type within the union when it contains only 1 non-null type. + * Otherwise it will return the input Avro type unchanged. * * It will also log a warning message if the nullability for Avro and catalyst types are * different. @@ -306,20 +362,18 @@ private[sql] class AvroSerializer( } /** - * Check the nullability of the input Avro type and resolve it when it is nullable. The first - * return value is a [[Boolean]] indicating if the input Avro type is nullable. The second - * return value is the possibly resolved type. + * Check the nullability of the input Avro type and resolve it when it is a single nullable type. + * The first return value is a [[Boolean]] indicating if the input Avro type is nullable. + * The second return value is the possibly resolved type otherwise the input Avro type unchanged. 
*/ private def resolveAvroType(avroType: Schema): (Boolean, Schema) = { if (avroType.getType == Type.UNION) { - val fields = avroType.getTypes.asScala - val actualType = fields.filter(_.getType != Type.NULL) - if (fields.length != 2 || actualType.length != 1) { - throw new UnsupportedAvroTypeException( - s"Unsupported Avro UNION type $avroType: Only UNION of a null type and a non-null " + - "type is supported") + val containsNull = avroType.getTypes.asScala.exists(_.getType == Schema.Type.NULL) + nonNullUnionBranches(avroType) match { + case Seq() => (true, Schema.create(Type.NULL)) + case Seq(singleType) => (containsNull, singleType) + case _ => (containsNull, avroType) } - (true, actualType.head) } else { (false, avroType) } @@ -337,4 +391,8 @@ private[sql] class AvroSerializer( "schema will throw runtime exception if there is a record with null value.") } } + + private def nonNullUnionTypes(avroType: Schema): Set[Type] = { + nonNullUnionBranches(avroType).map(_.getType).toSet + } } diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala similarity index 95% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala index de3626b1f3147..e1966bd1041c2 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/AvroUtils.scala @@ -34,8 +34,9 @@ import org.apache.hadoop.mapreduce.Job import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.SparkSession -import org.apache.spark.sql.avro.AvroOptions.ignoreExtensionKey -import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.avro.AvroOptions.IGNORE_EXTENSION +import org.apache.spark.sql.catalyst.{FileSourceOptions, InternalRow} +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.execution.datasources.OutputWriterFactory import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types._ @@ -49,15 +50,15 @@ private[sql] object AvroUtils extends Logging { val conf = spark.sessionState.newHadoopConfWithOptions(options) val parsedOptions = new AvroOptions(options, conf) - if (parsedOptions.parameters.contains(ignoreExtensionKey)) { - logWarning(s"Option $ignoreExtensionKey is deprecated. Please use the " + + if (parsedOptions.parameters.contains(IGNORE_EXTENSION)) { + logWarning(s"Option $IGNORE_EXTENSION is deprecated. Please use the " + "general data source option pathGlobFilter for filtering file names.") } // User can specify an optional avro json schema. val avroSchema = parsedOptions.schema .getOrElse { inferAvroSchemaFromFiles(files, conf, parsedOptions.ignoreExtension, - spark.sessionState.conf.ignoreCorruptFiles) + new FileSourceOptions(CaseInsensitiveMap(options)).ignoreCorruptFiles) } SchemaConverters.toSqlType(avroSchema).dataType match { @@ -335,4 +336,9 @@ private[sql] object AvroUtils extends Logging { private[avro] def isNullable(avroField: Schema.Field): Boolean = avroField.schema().getType == Schema.Type.UNION && avroField.schema().getTypes.asScala.exists(_.getType == Schema.Type.NULL) + + /** Collect all non null branches of a union in order. 
*/ + private[avro] def nonNullUnionBranches(avroType: Schema): Seq[Schema] = { + avroType.getTypes.asScala.filter(_.getType != Schema.Type.NULL).toSeq + } } diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/CatalystDataToAvro.scala diff --git a/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala new file mode 100644 index 0000000000000..f616cfa9b5d5c --- /dev/null +++ b/connector/avro/src/main/scala/org/apache/spark/sql/avro/SchemaConverters.scala @@ -0,0 +1,239 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import scala.collection.JavaConverters._ + +import org.apache.avro.{LogicalTypes, Schema, SchemaBuilder} +import org.apache.avro.LogicalTypes.{Date, Decimal, LocalTimestampMicros, LocalTimestampMillis, TimestampMicros, TimestampMillis} +import org.apache.avro.Schema.Type._ + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.types._ +import org.apache.spark.sql.types.Decimal.minBytesForPrecision + +/** + * This object contains method that are used to convert sparkSQL schemas to avro schemas and vice + * versa. + */ +@DeveloperApi +object SchemaConverters { + private lazy val nullSchema = Schema.create(Schema.Type.NULL) + + /** + * Internal wrapper for SQL data type and nullability. + * + * @since 2.4.0 + */ + case class SchemaType(dataType: DataType, nullable: Boolean) + + /** + * Converts an Avro schema to a corresponding Spark SQL schema. 
+ * + * @since 2.4.0 + */ + def toSqlType(avroSchema: Schema): SchemaType = { + toSqlTypeHelper(avroSchema, Set.empty) + } + + // The property specifies Catalyst type of the given field + private val CATALYST_TYPE_PROP_NAME = "spark.sql.catalyst.type" + + private def toSqlTypeHelper(avroSchema: Schema, existingRecordNames: Set[String]): SchemaType = { + avroSchema.getType match { + case INT => avroSchema.getLogicalType match { + case _: Date => SchemaType(DateType, nullable = false) + case _ => + val catalystTypeAttrValue = avroSchema.getProp(CATALYST_TYPE_PROP_NAME) + val catalystType = if (catalystTypeAttrValue == null) { + IntegerType + } else { + CatalystSqlParser.parseDataType(catalystTypeAttrValue) + } + SchemaType(catalystType, nullable = false) + } + case STRING => SchemaType(StringType, nullable = false) + case BOOLEAN => SchemaType(BooleanType, nullable = false) + case BYTES | FIXED => avroSchema.getLogicalType match { + // For FIXED type, if the precision requires more bytes than fixed size, the logical + // type will be null, which is handled by Avro library. + case d: Decimal => SchemaType(DecimalType(d.getPrecision, d.getScale), nullable = false) + case _ => SchemaType(BinaryType, nullable = false) + } + + case DOUBLE => SchemaType(DoubleType, nullable = false) + case FLOAT => SchemaType(FloatType, nullable = false) + case LONG => avroSchema.getLogicalType match { + case _: TimestampMillis | _: TimestampMicros => SchemaType(TimestampType, nullable = false) + case _: LocalTimestampMillis | _: LocalTimestampMicros => + SchemaType(TimestampNTZType, nullable = false) + case _ => + val catalystTypeAttrValue = avroSchema.getProp(CATALYST_TYPE_PROP_NAME) + val catalystType = if (catalystTypeAttrValue == null) { + LongType + } else { + CatalystSqlParser.parseDataType(catalystTypeAttrValue) + } + SchemaType(catalystType, nullable = false) + } + + case ENUM => SchemaType(StringType, nullable = false) + + case NULL => SchemaType(NullType, nullable = true) + + case RECORD => + if (existingRecordNames.contains(avroSchema.getFullName)) { + throw new IncompatibleSchemaException(s""" + |Found recursive reference in Avro schema, which can not be processed by Spark: + |${avroSchema.toString(true)} + """.stripMargin) + } + val newRecordNames = existingRecordNames + avroSchema.getFullName + val fields = avroSchema.getFields.asScala.map { f => + val schemaType = toSqlTypeHelper(f.schema(), newRecordNames) + StructField(f.name, schemaType.dataType, schemaType.nullable) + } + + SchemaType(StructType(fields.toArray), nullable = false) + + case ARRAY => + val schemaType = toSqlTypeHelper(avroSchema.getElementType, existingRecordNames) + SchemaType( + ArrayType(schemaType.dataType, containsNull = schemaType.nullable), + nullable = false) + + case MAP => + val schemaType = toSqlTypeHelper(avroSchema.getValueType, existingRecordNames) + SchemaType( + MapType(StringType, schemaType.dataType, valueContainsNull = schemaType.nullable), + nullable = false) + + case UNION => + if (avroSchema.getTypes.asScala.exists(_.getType == NULL)) { + // In case of a union with null, eliminate it and make a recursive call + val remainingUnionTypes = AvroUtils.nonNullUnionBranches(avroSchema) + if (remainingUnionTypes.size == 1) { + toSqlTypeHelper(remainingUnionTypes.head, existingRecordNames).copy(nullable = true) + } else { + toSqlTypeHelper(Schema.createUnion(remainingUnionTypes.asJava), existingRecordNames) + .copy(nullable = true) + } + } else avroSchema.getTypes.asScala.map(_.getType).toSeq match { + case Seq(t1) => 
+ toSqlTypeHelper(avroSchema.getTypes.get(0), existingRecordNames) + case Seq(t1, t2) if Set(t1, t2) == Set(INT, LONG) => + SchemaType(LongType, nullable = false) + case Seq(t1, t2) if Set(t1, t2) == Set(FLOAT, DOUBLE) => + SchemaType(DoubleType, nullable = false) + case _ => + // Convert complex unions to struct types where field names are member0, member1, etc. + // This is consistent with the behavior when converting between Avro and Parquet. + val fields = avroSchema.getTypes.asScala.zipWithIndex.map { + case (s, i) => + val schemaType = toSqlTypeHelper(s, existingRecordNames) + // All fields are nullable because only one of them is set at a time + StructField(s"member$i", schemaType.dataType, nullable = true) + } + + SchemaType(StructType(fields.toArray), nullable = false) + } + + case other => throw new IncompatibleSchemaException(s"Unsupported type $other") + } + } + + /** + * Converts a Spark SQL schema to a corresponding Avro schema. + * + * @since 2.4.0 + */ + def toAvroType( + catalystType: DataType, + nullable: Boolean = false, + recordName: String = "topLevelRecord", + nameSpace: String = "") + : Schema = { + val builder = SchemaBuilder.builder() + + val schema = catalystType match { + case BooleanType => builder.booleanType() + case ByteType | ShortType | IntegerType => builder.intType() + case LongType => builder.longType() + case DateType => + LogicalTypes.date().addToSchema(builder.intType()) + case TimestampType => + LogicalTypes.timestampMicros().addToSchema(builder.longType()) + case TimestampNTZType => + LogicalTypes.localTimestampMicros().addToSchema(builder.longType()) + + case FloatType => builder.floatType() + case DoubleType => builder.doubleType() + case StringType => builder.stringType() + case NullType => builder.nullType() + case d: DecimalType => + val avroType = LogicalTypes.decimal(d.precision, d.scale) + val fixedSize = minBytesForPrecision(d.precision) + // Need to avoid naming conflict for the fixed fields + val name = nameSpace match { + case "" => s"$recordName.fixed" + case _ => s"$nameSpace.$recordName.fixed" + } + avroType.addToSchema(SchemaBuilder.fixed(name).size(fixedSize)) + + case BinaryType => builder.bytesType() + case ArrayType(et, containsNull) => + builder.array() + .items(toAvroType(et, containsNull, recordName, nameSpace)) + case MapType(StringType, vt, valueContainsNull) => + builder.map() + .values(toAvroType(vt, valueContainsNull, recordName, nameSpace)) + case st: StructType => + val childNameSpace = if (nameSpace != "") s"$nameSpace.$recordName" else recordName + val fieldsAssembler = builder.record(recordName).namespace(nameSpace).fields() + st.foreach { f => + val fieldAvroType = + toAvroType(f.dataType, f.nullable, f.name, childNameSpace) + fieldsAssembler.name(f.name).`type`(fieldAvroType).noDefault() + } + fieldsAssembler.endRecord() + + case ym: YearMonthIntervalType => + val ymIntervalType = builder.intType() + ymIntervalType.addProp(CATALYST_TYPE_PROP_NAME, ym.typeName) + ymIntervalType + case dt: DayTimeIntervalType => + val dtIntervalType = builder.longType() + dtIntervalType.addProp(CATALYST_TYPE_PROP_NAME, dt.typeName) + dtIntervalType + + // This should never happen. 
+ case other => throw new IncompatibleSchemaException(s"Unexpected type $other.") + } + if (nullable && catalystType != NullType) { + Schema.createUnion(schema, nullSchema) + } else { + schema + } + } +} + +private[avro] class IncompatibleSchemaException( + msg: String, ex: Throwable = null) extends Exception(msg, ex) + +private[avro] class UnsupportedAvroTypeException(msg: String) extends Exception(msg) diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/functions.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/functions.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/functions.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/functions.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/avro/package.scala b/connector/avro/src/main/scala/org/apache/spark/sql/avro/package.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/avro/package.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/avro/package.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroDataSourceV2.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroDataSourceV2.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroDataSourceV2.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroDataSourceV2.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala similarity index 90% rename from external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala index a4dfdbfe68f9c..cc7bd180e8477 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroPartitionReaderFactory.scala @@ -16,14 +16,11 @@ */ package org.apache.spark.sql.v2.avro -import java.net.URI - import scala.util.control.NonFatal import org.apache.avro.file.DataFileReader import org.apache.avro.generic.{GenericDatumReader, GenericRecord} import org.apache.avro.mapred.FsInput -import org.apache.hadoop.fs.Path import org.apache.spark.TaskContext import org.apache.spark.broadcast.Broadcast @@ -46,7 +43,7 @@ import org.apache.spark.util.SerializableConfiguration * @param dataSchema Schema of AVRO files. * @param readDataSchema Required data schema of AVRO files. * @param partitionSchema Schema of partitions. - * @param parsedOptions Options for parsing AVRO files. + * @param options Options for parsing AVRO files. 
*/ case class AvroPartitionReaderFactory( sqlConf: SQLConf, @@ -54,17 +51,17 @@ case class AvroPartitionReaderFactory( dataSchema: StructType, readDataSchema: StructType, partitionSchema: StructType, - parsedOptions: AvroOptions, + options: AvroOptions, filters: Seq[Filter]) extends FilePartitionReaderFactory with Logging { - private val datetimeRebaseModeInRead = parsedOptions.datetimeRebaseModeInRead + private val datetimeRebaseModeInRead = options.datetimeRebaseModeInRead override def buildReader(partitionedFile: PartitionedFile): PartitionReader[InternalRow] = { val conf = broadcastedConf.value.value - val userProvidedSchema = parsedOptions.schema + val userProvidedSchema = options.schema - if (parsedOptions.ignoreExtension || partitionedFile.filePath.endsWith(".avro")) { + if (options.ignoreExtension || partitionedFile.urlEncodedPath.endsWith(".avro")) { val reader = { - val in = new FsInput(new Path(new URI(partitionedFile.filePath)), conf) + val in = new FsInput(partitionedFile.toPath, conf) try { val datumReader = userProvidedSchema match { case Some(userSchema) => new GenericDatumReader[GenericRecord](userSchema) @@ -104,7 +101,7 @@ case class AvroPartitionReaderFactory( override val deserializer = new AvroDeserializer( userProvidedSchema.getOrElse(reader.getSchema), readDataSchema, - parsedOptions.positionalFieldMatching, + options.positionalFieldMatching, datetimeRebaseMode, avroFilters) override val stopPosition = partitionedFile.start + partitionedFile.length diff --git a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScan.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScan.scala similarity index 95% rename from external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScan.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScan.scala index d0f38c12427c3..763b9abe4f91b 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScan.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScan.scala @@ -70,10 +70,6 @@ case class AvroScan( override def hashCode(): Int = super.hashCode() - override def description(): String = { - super.description() + ", PushedFilters: " + pushedFilters.mkString("[", ", ", "]") - } - override def getMetaData(): Map[String, String] = { super.getMetaData() ++ Map("PushedFilters" -> seqToString(pushedFilters)) } diff --git a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScanBuilder.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScanBuilder.scala similarity index 94% rename from external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScanBuilder.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScanBuilder.scala index 8fae89a945826..754c58e65b016 100644 --- a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScanBuilder.scala +++ b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroScanBuilder.scala @@ -18,14 +18,13 @@ package org.apache.spark.sql.v2.avro import org.apache.spark.sql.SparkSession import org.apache.spark.sql.catalyst.StructFilters -import org.apache.spark.sql.connector.read.Scan import org.apache.spark.sql.execution.datasources.PartitioningAwareFileIndex import org.apache.spark.sql.execution.datasources.v2.FileScanBuilder import org.apache.spark.sql.sources.Filter import org.apache.spark.sql.types.StructType import org.apache.spark.sql.util.CaseInsensitiveStringMap -class AvroScanBuilder ( +case class AvroScanBuilder ( sparkSession: 
SparkSession, fileIndex: PartitioningAwareFileIndex, schema: StructType, @@ -33,7 +32,7 @@ class AvroScanBuilder ( options: CaseInsensitiveStringMap) extends FileScanBuilder(sparkSession, fileIndex, dataSchema) { - override def build(): Scan = { + override def build(): AvroScan = { AvroScan( sparkSession, fileIndex, diff --git a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroTable.scala diff --git a/external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroWrite.scala b/connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroWrite.scala similarity index 100% rename from external/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroWrite.scala rename to connector/avro/src/main/scala/org/apache/spark/sql/v2/avro/AvroWrite.scala diff --git a/external/avro/src/test/java/org/apache/spark/sql/avro/JavaAvroFunctionsSuite.java b/connector/avro/src/test/java/org/apache/spark/sql/avro/JavaAvroFunctionsSuite.java similarity index 100% rename from external/avro/src/test/java/org/apache/spark/sql/avro/JavaAvroFunctionsSuite.java rename to connector/avro/src/test/java/org/apache/spark/sql/avro/JavaAvroFunctionsSuite.java diff --git a/external/avro/src/test/resources/before_1582_date_v2_4_5.avro b/connector/avro/src/test/resources/before_1582_date_v2_4_5.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_date_v2_4_5.avro rename to connector/avro/src/test/resources/before_1582_date_v2_4_5.avro diff --git a/external/avro/src/test/resources/before_1582_date_v2_4_6.avro b/connector/avro/src/test/resources/before_1582_date_v2_4_6.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_date_v2_4_6.avro rename to connector/avro/src/test/resources/before_1582_date_v2_4_6.avro diff --git a/external/avro/src/test/resources/before_1582_date_v3_2_0.avro b/connector/avro/src/test/resources/before_1582_date_v3_2_0.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_date_v3_2_0.avro rename to connector/avro/src/test/resources/before_1582_date_v3_2_0.avro diff --git a/external/avro/src/test/resources/before_1582_timestamp_micros_v2_4_5.avro b/connector/avro/src/test/resources/before_1582_timestamp_micros_v2_4_5.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_timestamp_micros_v2_4_5.avro rename to connector/avro/src/test/resources/before_1582_timestamp_micros_v2_4_5.avro diff --git a/external/avro/src/test/resources/before_1582_timestamp_micros_v2_4_6.avro b/connector/avro/src/test/resources/before_1582_timestamp_micros_v2_4_6.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_timestamp_micros_v2_4_6.avro rename to connector/avro/src/test/resources/before_1582_timestamp_micros_v2_4_6.avro diff --git a/external/avro/src/test/resources/before_1582_timestamp_micros_v3_2_0.avro b/connector/avro/src/test/resources/before_1582_timestamp_micros_v3_2_0.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_timestamp_micros_v3_2_0.avro rename to connector/avro/src/test/resources/before_1582_timestamp_micros_v3_2_0.avro diff --git a/external/avro/src/test/resources/before_1582_timestamp_millis_v2_4_5.avro 
b/connector/avro/src/test/resources/before_1582_timestamp_millis_v2_4_5.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_timestamp_millis_v2_4_5.avro rename to connector/avro/src/test/resources/before_1582_timestamp_millis_v2_4_5.avro diff --git a/external/avro/src/test/resources/before_1582_timestamp_millis_v2_4_6.avro b/connector/avro/src/test/resources/before_1582_timestamp_millis_v2_4_6.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_timestamp_millis_v2_4_6.avro rename to connector/avro/src/test/resources/before_1582_timestamp_millis_v2_4_6.avro diff --git a/external/avro/src/test/resources/before_1582_timestamp_millis_v3_2_0.avro b/connector/avro/src/test/resources/before_1582_timestamp_millis_v3_2_0.avro similarity index 100% rename from external/avro/src/test/resources/before_1582_timestamp_millis_v3_2_0.avro rename to connector/avro/src/test/resources/before_1582_timestamp_millis_v3_2_0.avro diff --git a/external/avro/src/test/resources/episodes.avro b/connector/avro/src/test/resources/episodes.avro similarity index 100% rename from external/avro/src/test/resources/episodes.avro rename to connector/avro/src/test/resources/episodes.avro diff --git a/external/avro/src/test/resources/log4j2.properties b/connector/avro/src/test/resources/log4j2.properties similarity index 100% rename from external/avro/src/test/resources/log4j2.properties rename to connector/avro/src/test/resources/log4j2.properties diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00000.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00000.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00000.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00000.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00001.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00001.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00001.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00001.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00002.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00002.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00002.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00002.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00003.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00003.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00003.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00003.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00004.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00004.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00004.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00004.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00005.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00005.avro similarity index 100% rename from 
external/avro/src/test/resources/test-random-partitioned/part-r-00005.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00005.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00006.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00006.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00006.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00006.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00007.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00007.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00007.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00007.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00008.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00008.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00008.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00008.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00009.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00009.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00009.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00009.avro diff --git a/external/avro/src/test/resources/test-random-partitioned/part-r-00010.avro b/connector/avro/src/test/resources/test-random-partitioned/part-r-00010.avro similarity index 100% rename from external/avro/src/test/resources/test-random-partitioned/part-r-00010.avro rename to connector/avro/src/test/resources/test-random-partitioned/part-r-00010.avro diff --git a/external/avro/src/test/resources/test.avro b/connector/avro/src/test/resources/test.avro similarity index 100% rename from external/avro/src/test/resources/test.avro rename to connector/avro/src/test/resources/test.avro diff --git a/external/avro/src/test/resources/test.avsc b/connector/avro/src/test/resources/test.avsc similarity index 100% rename from external/avro/src/test/resources/test.avsc rename to connector/avro/src/test/resources/test.avsc diff --git a/external/avro/src/test/resources/test.json b/connector/avro/src/test/resources/test.json similarity index 100% rename from external/avro/src/test/resources/test.json rename to connector/avro/src/test/resources/test.json diff --git a/external/avro/src/test/resources/test_sub.avsc b/connector/avro/src/test/resources/test_sub.avsc similarity index 100% rename from external/avro/src/test/resources/test_sub.avsc rename to connector/avro/src/test/resources/test_sub.avsc diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroCatalystDataConversionSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroCatalystDataConversionSuite.scala similarity index 100% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroCatalystDataConversionSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroCatalystDataConversionSuite.scala diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroCodecSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroCodecSuite.scala similarity index 100% rename from 
external/avro/src/test/scala/org/apache/spark/sql/avro/AvroCodecSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroCodecSuite.scala diff --git a/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala new file mode 100644 index 0000000000000..abc0c3d3155d2 --- /dev/null +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroFunctionsSuite.scala @@ -0,0 +1,314 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.avro + +import java.io.ByteArrayOutputStream + +import scala.collection.JavaConverters._ + +import org.apache.avro.{Schema, SchemaBuilder} +import org.apache.avro.generic.{GenericDatumWriter, GenericRecord, GenericRecordBuilder} +import org.apache.avro.io.EncoderFactory + +import org.apache.spark.SparkException +import org.apache.spark.sql.{QueryTest, Row} +import org.apache.spark.sql.execution.LocalTableScanExec +import org.apache.spark.sql.functions.{col, lit, struct} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StructType + +class AvroFunctionsSuite extends QueryTest with SharedSparkSession { + import testImplicits._ + + test("roundtrip in to_avro and from_avro - int and string") { + val df = spark.range(10).select($"id", $"id".cast("string").as("str")) + + val avroDF = df.select( + functions.to_avro($"id").as("a"), + functions.to_avro($"str").as("b")) + val avroTypeLong = s""" + |{ + | "type": "int", + | "name": "id" + |} + """.stripMargin + val avroTypeStr = s""" + |{ + | "type": "string", + | "name": "str" + |} + """.stripMargin + checkAnswer(avroDF.select( + functions.from_avro($"a", avroTypeLong), + functions.from_avro($"b", avroTypeStr)), df) + } + + test("roundtrip in to_avro and from_avro - struct") { + val df = spark.range(10).select(struct($"id", $"id".cast("string").as("str")).as("struct")) + val avroStructDF = df.select(functions.to_avro($"struct").as("avro")) + val avroTypeStruct = s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "col1", "type": "long"}, + | {"name": "col2", "type": "string"} + | ] + |} + """.stripMargin + checkAnswer(avroStructDF.select( + functions.from_avro($"avro", avroTypeStruct)), df) + } + + test("handle invalid input in from_avro") { + val count = 10 + val df = spark.range(count).select(struct($"id", $"id".as("id2")).as("struct")) + val avroStructDF = df.select(functions.to_avro($"struct").as("avro")) + val avroTypeStruct = s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "col1", "type": "long"}, + | {"name": "col2", "type": "double"} + | ] + |} + """.stripMargin 
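> Editor's note: the round-trip tests above exercise the public `to_avro`/`from_avro` column functions from `org.apache.spark.sql.avro.functions`. A minimal, self-contained sketch of the same round trip outside the test harness is shown below; it assumes the spark-avro module is on the classpath, and the object, app, and column names are illustrative rather than taken from this patch.

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.avro.functions.{from_avro, to_avro}
import org.apache.spark.sql.functions.struct

object AvroRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("avro-roundtrip").getOrCreate()
    import spark.implicits._

    // Avro record schema matching the struct<id: bigint, str: string> column built below.
    val avroSchema =
      """{"type": "record", "name": "event", "fields": [
        |  {"name": "id", "type": "long"},
        |  {"name": "str", "type": "string"}
        |]}""".stripMargin

    val df = spark.range(10)
      .select(struct($"id", $"id".cast("string").as("str")).as("event"))

    // Serialize the struct column to Avro binary, then deserialize it back.
    val encoded = df.select(to_avro($"event").as("avro"))
    val decoded = encoded.select(from_avro($"avro", avroSchema).as("event"))

    decoded.show(truncate = false)
    spark.stop()
  }
}
```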
+ + intercept[SparkException] { + avroStructDF.select( + functions.from_avro( + $"avro", avroTypeStruct, Map("mode" -> "FAILFAST").asJava)).collect() + } + + // For PERMISSIVE mode, the result should be row of null columns. + val expected = (0 until count).map(_ => Row(Row(null, null))) + checkAnswer( + avroStructDF.select( + functions.from_avro( + $"avro", avroTypeStruct, Map("mode" -> "PERMISSIVE").asJava)), + expected) + } + + test("roundtrip in to_avro and from_avro - array with null") { + val dfOne = Seq(Tuple1(Tuple1(1) :: Nil), Tuple1(null :: Nil)).toDF("array") + val avroTypeArrStruct = s""" + |[ { + | "type" : "array", + | "items" : [ { + | "type" : "record", + | "name" : "x", + | "fields" : [ { + | "name" : "y", + | "type" : "int" + | } ] + | }, "null" ] + |}, "null" ] + """.stripMargin + val readBackOne = dfOne.select(functions.to_avro($"array").as("avro")) + .select(functions.from_avro($"avro", avroTypeArrStruct).as("array")) + checkAnswer(dfOne, readBackOne) + } + + test("SPARK-27798: from_avro produces same value when converted to local relation") { + val simpleSchema = + """ + |{ + | "type": "record", + | "name" : "Payload", + | "fields" : [ {"name" : "message", "type" : "string" } ] + |} + """.stripMargin + + def generateBinary(message: String, avroSchema: String): Array[Byte] = { + val schema = new Schema.Parser().parse(avroSchema) + val out = new ByteArrayOutputStream() + val writer = new GenericDatumWriter[GenericRecord](schema) + val encoder = EncoderFactory.get().binaryEncoder(out, null) + val rootRecord = new GenericRecordBuilder(schema).set("message", message).build() + writer.write(rootRecord, encoder) + encoder.flush() + out.toByteArray + } + + // This bug is hit when the rule `ConvertToLocalRelation` is run. But the rule was excluded + // in `SharedSparkSession`. 
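> Editor's note: the FAILFAST/PERMISSIVE assertions above correspond to the `mode` option of the three-argument `from_avro` overload. A rough sketch of the same behaviour in application code follows; the `payload` column, record name, and field names are illustrative assumptions, not part of the patch.

```scala
import scala.collection.JavaConverters._

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.avro.functions.from_avro
import org.apache.spark.sql.functions.col

// Reader schema used to decode a binary `payload` column.
val readerSchema =
  """{"type": "record", "name": "payload", "fields": [
    |  {"name": "col1", "type": "long"},
    |  {"name": "col2", "type": "double"}
    |]}""".stripMargin

// PERMISSIVE: records whose bytes cannot be decoded come back as a struct of nulls.
def decodePermissive(df: DataFrame): DataFrame =
  df.select(from_avro(col("payload"), readerSchema, Map("mode" -> "PERMISSIVE").asJava).as("decoded"))

// FAILFAST: the first undecodable record fails the query with a SparkException.
def decodeFailFast(df: DataFrame): DataFrame =
  df.select(from_avro(col("payload"), readerSchema, Map("mode" -> "FAILFAST").asJava).as("decoded"))
```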
+ withSQLConf(SQLConf.OPTIMIZER_EXCLUDED_RULES.key -> "") { + val df = Seq("one", "two", "three", "four").map(generateBinary(_, simpleSchema)) + .toDF() + .withColumn("value", + functions.from_avro(col("value"), simpleSchema)) + + assert(df.queryExecution.executedPlan.isInstanceOf[LocalTableScanExec]) + assert(df.collect().map(_.get(0)) === Seq(Row("one"), Row("two"), Row("three"), Row("four"))) + } + } + + test("SPARK-27506: roundtrip in to_avro and from_avro with different compatible schemas") { + val df = spark.range(10).select( + struct($"id".as("col1"), $"id".cast("string").as("col2")).as("struct") + ) + val avroStructDF = df.select(functions.to_avro($"struct").as("avro")) + val actualAvroSchema = + s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "col1", "type": "int"}, + | {"name": "col2", "type": "string"} + | ] + |} + |""".stripMargin + + val evolvedAvroSchema = + s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "col1", "type": "int"}, + | {"name": "col2", "type": "string"}, + | {"name": "col3", "type": "string", "default": ""} + | ] + |} + |""".stripMargin + + val expected = spark.range(10).select( + struct($"id".as("col1"), $"id".cast("string").as("col2"), lit("").as("col3")).as("struct") + ) + + checkAnswer( + avroStructDF.select( + functions.from_avro( + $"avro", + actualAvroSchema, + Map("avroSchema" -> evolvedAvroSchema).asJava)), + expected) + } + + test("roundtrip in to_avro and from_avro - struct with nullable Avro schema") { + val df = spark.range(10).select(struct($"id", $"id".cast("string").as("str")).as("struct")) + val avroTypeStruct = s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "id", "type": "long"}, + | {"name": "str", "type": ["null", "string"]} + | ] + |} + """.stripMargin + val avroStructDF = df.select(functions.to_avro($"struct", avroTypeStruct).as("avro")) + checkAnswer(avroStructDF.select( + functions.from_avro($"avro", avroTypeStruct)), df) + } + + test("to_avro optional union Avro schema") { + val df = spark.range(10).select(struct($"id", $"id".cast("string").as("str")).as("struct")) + for (supportedAvroType <- Seq("""["null", "int", "long"]""", """["int", "long"]""")) { + val avroTypeStruct = s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "id", "type": $supportedAvroType}, + | {"name": "str", "type": ["null", "string"]} + | ] + |} + """.stripMargin + val avroStructDF = df.select(functions.to_avro($"struct", avroTypeStruct).as("avro")) + checkAnswer(avroStructDF.select( + functions.from_avro($"avro", avroTypeStruct)), df) + } + } + + test("to_avro complex union Avro schema") { + val df = Seq((Some(1), None), (None, Some("a"))).toDF() + .select(struct(struct($"_1".as("member0"), $"_2".as("member1")).as("u")).as("struct")) + val avroTypeStruct = SchemaBuilder.record("struct").fields() + .name("u").`type`().unionOf().intType().and().stringType().endUnion().noDefault() + .endRecord().toString + val avroStructDF = df.select(functions.to_avro($"struct", avroTypeStruct).as("avro")) + checkAnswer(avroStructDF.select( + functions.from_avro($"avro", avroTypeStruct)), df) + } + + test("SPARK-39775: Disable validate default values when parsing Avro schemas") { + val avroTypeStruct = s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "id", "type": "long", "default": null} + | ] + |} + """.stripMargin + val avroSchema = AvroOptions(Map("avroSchema" -> avroTypeStruct)).schema.get + val sparkSchema = 
SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType] + + val df = spark.range(5).select($"id") + val structDf = df.select(struct($"id").as("struct")) + val avroStructDF = structDf.select(functions.to_avro($"struct", avroTypeStruct).as("avro")) + checkAnswer(avroStructDF.select(functions.from_avro($"avro", avroTypeStruct)), structDf) + + withTempPath { dir => + df.write.format("avro").save(dir.getCanonicalPath) + checkAnswer(spark.read.schema(sparkSchema).format("avro").load(dir.getCanonicalPath), df) + + val msg = intercept[SparkException] { + spark.read.option("avroSchema", avroTypeStruct).format("avro") + .load(dir.getCanonicalPath) + .collect() + }.getCause.getMessage + assert(msg.contains("Invalid default for field id: null not a \"long\"")) + } + } + + test("SPARK-39775: Disable validate default values when parsing Avro schemas") { + val avroTypeStruct = s""" + |{ + | "type": "record", + | "name": "struct", + | "fields": [ + | {"name": "id", "type": "long", "default": null} + | ] + |} + """.stripMargin + val avroSchema = AvroOptions(Map("avroSchema" -> avroTypeStruct)).schema.get + val sparkSchema = SchemaConverters.toSqlType(avroSchema).dataType.asInstanceOf[StructType] + + val df = spark.range(5).select($"id") + val structDf = df.select(struct($"id").as("struct")) + val avroStructDF = structDf.select(functions.to_avro('struct, avroTypeStruct).as("avro")) + checkAnswer(avroStructDF.select(functions.from_avro('avro, avroTypeStruct)), structDf) + + withTempPath { dir => + df.write.format("avro").save(dir.getCanonicalPath) + checkAnswer(spark.read.schema(sparkSchema).format("avro").load(dir.getCanonicalPath), df) + + val msg = intercept[SparkException] { + spark.read.option("avroSchema", avroTypeStruct).format("avro") + .load(dir.getCanonicalPath) + .collect() + }.getCause.getMessage + assert(msg.contains("Invalid default for field id: null not a \"long\"")) + } + } +} diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroLogicalTypeSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroLogicalTypeSuite.scala similarity index 94% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroLogicalTypeSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroLogicalTypeSuite.scala index b7ac10c58e24a..c0022c62735c8 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroLogicalTypeSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroLogicalTypeSuite.scala @@ -24,7 +24,7 @@ import org.apache.avro.Conversions.DecimalConversion import org.apache.avro.file.DataFileWriter import org.apache.avro.generic.{GenericData, GenericDatumWriter, GenericRecord} -import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.{SparkArithmeticException, SparkConf, SparkException} import org.apache.spark.sql.{QueryTest, Row} import org.apache.spark.sql.catalyst.util.DateTimeUtils import org.apache.spark.sql.internal.SQLConf @@ -129,7 +129,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { withTempDir { dir => val expected = timestampInputData.map(t => Row(new Timestamp(t._1))) val timestampAvro = timestampFile(dir.getAbsolutePath) - val df = spark.read.format("avro").load(timestampAvro).select('timestamp_millis) + val df = spark.read.format("avro").load(timestampAvro).select($"timestamp_millis") checkAnswer(df, expected) @@ -144,7 +144,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { withTempDir 
{ dir => val expected = timestampInputData.map(t => Row(new Timestamp(t._2))) val timestampAvro = timestampFile(dir.getAbsolutePath) - val df = spark.read.format("avro").load(timestampAvro).select('timestamp_micros) + val df = spark.read.format("avro").load(timestampAvro).select($"timestamp_micros") checkAnswer(df, expected) @@ -160,7 +160,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { val expected = timestampInputData.map(t => Row(DateTimeUtils.microsToLocalDateTime(DateTimeUtils.millisToMicros(t._3)))) val timestampAvro = timestampFile(dir.getAbsolutePath) - val df = spark.read.format("avro").load(timestampAvro).select('local_timestamp_millis) + val df = spark.read.format("avro").load(timestampAvro).select($"local_timestamp_millis") checkAnswer(df, expected) @@ -176,7 +176,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { val expected = timestampInputData.map(t => Row(DateTimeUtils.microsToLocalDateTime(DateTimeUtils.millisToMicros(t._4)))) val timestampAvro = timestampFile(dir.getAbsolutePath) - val df = spark.read.format("avro").load(timestampAvro).select('local_timestamp_micros) + val df = spark.read.format("avro").load(timestampAvro).select($"local_timestamp_micros") checkAnswer(df, expected) @@ -194,7 +194,8 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { withTempDir { dir => val timestampAvro = timestampFile(dir.getAbsolutePath) val df = - spark.read.format("avro").load(timestampAvro).select('timestamp_millis, 'timestamp_micros) + spark.read.format("avro").load(timestampAvro) + .select($"timestamp_millis", $"timestamp_micros") val expected = timestampInputData.map(t => Row(new Timestamp(t._1), new Timestamp(t._2))) @@ -226,7 +227,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { withTempDir { dir => val timestampAvro = timestampFile(dir.getAbsolutePath) val df = spark.read.format("avro").load(timestampAvro).select( - 'local_timestamp_millis, 'local_timestamp_micros) + $"local_timestamp_millis", $"local_timestamp_micros") val expected = timestampInputData.map(t => Row(DateTimeUtils.microsToLocalDateTime(DateTimeUtils.millisToMicros(t._3)), @@ -260,7 +261,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { withTempDir { dir => val timestampAvro = timestampFile(dir.getAbsolutePath) val schema = StructType(StructField("long", TimestampType, true) :: Nil) - val df = spark.read.format("avro").schema(schema).load(timestampAvro).select('long) + val df = spark.read.format("avro").schema(schema).load(timestampAvro).select($"long") val expected = timestampInputData.map(t => Row(new Timestamp(t._5))) @@ -272,7 +273,7 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { withTempDir { dir => val timestampAvro = timestampFile(dir.getAbsolutePath) val schema = StructType(StructField("long", TimestampNTZType, true) :: Nil) - val df = spark.read.format("avro").schema(schema).load(timestampAvro).select('long) + val df = spark.read.format("avro").schema(schema).load(timestampAvro).select($"long") val expected = timestampInputData.map(t => Row(DateTimeUtils.microsToLocalDateTime(DateTimeUtils.millisToMicros(t._5)))) @@ -432,10 +433,17 @@ abstract class AvroLogicalTypeSuite extends QueryTest with SharedSparkSession { dataFileWriter.flush() dataFileWriter.close() - val msg = intercept[SparkException] { - spark.read.format("avro").load(s"$dir.avro").collect() - }.getCause.getCause.getMessage - 
assert(msg.contains("Unscaled value too large for precision")) + checkError( + exception = intercept[SparkException] { + spark.read.format("avro").load(s"$dir.avro").collect() + }.getCause.getCause.asInstanceOf[SparkArithmeticException], + errorClass = "NUMERIC_VALUE_OUT_OF_RANGE", + parameters = Map( + "value" -> "0", + "precision" -> "4", + "scale" -> "2", + "config" -> "\"spark.sql.ansi.enabled\"") + ) } } } diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala similarity index 96% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala index 08c61381c5780..046ff4ef088d8 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroRowReaderSuite.scala @@ -59,11 +59,13 @@ class AvroRowReaderSuite val df = spark.read.format("avro").load(dir.getCanonicalPath) val fileScan = df.queryExecution.executedPlan collectFirst { - case BatchScanExec(_, f: AvroScan, _, _) => f + case BatchScanExec(_, f: AvroScan, _, _, _, _, _, _, _) => f } val filePath = fileScan.get.fileIndex.inputFiles(0) val fileSize = new File(new URI(filePath)).length + // scalastyle:off pathfromuri val in = new FsInput(new Path(new URI(filePath)), new Configuration()) + // scalastyle:on pathfromuri val reader = DataFileReader.openReader(in, new GenericDatumReader[GenericRecord]()) val it = new Iterator[InternalRow] with AvroUtils.RowReader { diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroScanSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroScanSuite.scala similarity index 100% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroScanSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroScanSuite.scala diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSchemaHelperSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSchemaHelperSuite.scala similarity index 100% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSchemaHelperSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSchemaHelperSuite.scala diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSerdeSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSerdeSuite.scala similarity index 100% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSerdeSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSerdeSuite.scala diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala similarity index 91% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala index e93c1c09c9fc2..d19a11b4546a7 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/AvroSuite.scala @@ -299,21 +299,27 @@ abstract class AvroSuite test("Complex Union Type") { withTempPath { dir => - val fixedSchema = Schema.createFixed("fixed_name", "doc", "namespace", 4) - val enumSchema = 
Schema.createEnum("enum_name", "doc", "namespace", List("e1", "e2").asJava) - val complexUnionType = Schema.createUnion( - List(Schema.create(Type.INT), Schema.create(Type.STRING), fixedSchema, enumSchema).asJava) - val fields = Seq( - new Field("field1", complexUnionType, "doc", null.asInstanceOf[AnyVal]), - new Field("field2", complexUnionType, "doc", null.asInstanceOf[AnyVal]), - new Field("field3", complexUnionType, "doc", null.asInstanceOf[AnyVal]), - new Field("field4", complexUnionType, "doc", null.asInstanceOf[AnyVal]) - ).asJava - val schema = Schema.createRecord("name", "docs", "namespace", false) - schema.setFields(fields) + val nativeWriterPath = s"$dir.avro" + val sparkWriterPath = s"$dir/spark" + val fixedSchema = SchemaBuilder.fixed("fixed_name").size(4) + val enumSchema = SchemaBuilder.enumeration("enum_name").symbols("e1", "e2") + val complexUnionType = SchemaBuilder.unionOf() + .intType().and() + .stringType().and() + .`type`(fixedSchema).and() + .`type`(enumSchema).and() + .nullType() + .endUnion() + val schema = SchemaBuilder.record("name").fields() + .name("field1").`type`(complexUnionType).noDefault() + .name("field2").`type`(complexUnionType).noDefault() + .name("field3").`type`(complexUnionType).noDefault() + .name("field4").`type`(complexUnionType).noDefault() + .name("field5").`type`(complexUnionType).noDefault() + .endRecord() val datumWriter = new GenericDatumWriter[GenericRecord](schema) val dataFileWriter = new DataFileWriter[GenericRecord](datumWriter) - dataFileWriter.create(schema, new File(s"$dir.avro")) + dataFileWriter.create(schema, new File(nativeWriterPath)) val avroRec = new GenericData.Record(schema) val field1 = 1234 val field2 = "Hope that was not load bearing" @@ -323,15 +329,32 @@ abstract class AvroSuite avroRec.put("field2", field2) avroRec.put("field3", new Fixed(fixedSchema, field3)) avroRec.put("field4", new EnumSymbol(enumSchema, field4)) + avroRec.put("field5", null) dataFileWriter.append(avroRec) dataFileWriter.flush() dataFileWriter.close() - val df = spark.sqlContext.read.format("avro").load(s"$dir.avro") - assertResult(field1)(df.selectExpr("field1.member0").first().get(0)) - assertResult(field2)(df.selectExpr("field2.member1").first().get(0)) - assertResult(field3)(df.selectExpr("field3.member2").first().get(0)) - assertResult(field4)(df.selectExpr("field4.member3").first().get(0)) + val df = spark.sqlContext.read.format("avro").load(nativeWriterPath) + assertResult(Row(field1, null, null, null))(df.selectExpr("field1.*").first()) + assertResult(Row(null, field2, null, null))(df.selectExpr("field2.*").first()) + assertResult(Row(null, null, field3, null))(df.selectExpr("field3.*").first()) + assertResult(Row(null, null, null, field4))(df.selectExpr("field4.*").first()) + assertResult(Row(null, null, null, null))(df.selectExpr("field5.*").first()) + + df.write.format("avro").option("avroSchema", schema.toString).save(sparkWriterPath) + + val df2 = spark.sqlContext.read.format("avro").load(nativeWriterPath) + assertResult(Row(field1, null, null, null))(df2.selectExpr("field1.*").first()) + assertResult(Row(null, field2, null, null))(df2.selectExpr("field2.*").first()) + assertResult(Row(null, null, field3, null))(df2.selectExpr("field3.*").first()) + assertResult(Row(null, null, null, field4))(df2.selectExpr("field4.*").first()) + assertResult(Row(null, null, null, null))(df2.selectExpr("field5.*").first()) + + val reader = openDatumReader(new File(sparkWriterPath)) + assert(reader.hasNext) + assertResult(avroRec)(reader.next()) + 
assert(!reader.hasNext) + reader.close() } } @@ -550,8 +573,8 @@ abstract class AvroSuite val fixed = spark.read.format("avro").load(testAvro).select("fixed3").collect() assert(fixed.map(_(0).asInstanceOf[Array[Byte]]).exists(p => p(1) == 3)) - val enum = spark.read.format("avro").load(testAvro).select("enum").collect() - assert(enum.map(_(0)).toSet == Set("SPADES", "CLUBS", "DIAMONDS")) + val enums = spark.read.format("avro").load(testAvro).select("enum").collect() + assert(enums.map(_(0)).toSet == Set("SPADES", "CLUBS", "DIAMONDS")) val record = spark.read.format("avro").load(testAvro).select("record").collect() assert(record(0)(0).getClass.toString.contains("Row")) @@ -875,7 +898,7 @@ abstract class AvroSuite dfWithNull.write.format("avro") .option("avroSchema", avroSchema).save(s"$tempDir/${UUID.randomUUID()}") } - assertExceptionMsg[AvroTypeException](e1, "Not an enum: null") + assertExceptionMsg[AvroTypeException](e1, "value null is not a SuitEnumType") // Writing df containing data not in the enum will throw an exception val e2 = intercept[SparkException] { @@ -1069,14 +1092,13 @@ abstract class AvroSuite df.write.format("avro").option("avroSchema", avroSchema).save(tempSaveDir) checkAvroSchemaEquals(avroSchema, getAvroSchemaStringFromFiles(tempSaveDir)) - val message = intercept[Exception] { + val message = intercept[SparkException] { spark.createDataFrame(spark.sparkContext.parallelize(Seq(Row(2, null))), catalystSchema) .write.format("avro").option("avroSchema", avroSchema) .save(s"$tempDir/${UUID.randomUUID()}") - }.getCause.getMessage + }.getMessage assert(message.contains("Caused by: java.lang.NullPointerException: ")) - assert(message.contains( - "null of string in string in field Name of test_schema in test_schema")) + assert(message.contains("null value for (non-nullable) string at test_schema.Name")) } } @@ -1144,32 +1166,81 @@ abstract class AvroSuite } } - test("unsupported nullable avro type") { + test("int/long double/float conversion") { val catalystSchema = StructType(Seq( - StructField("Age", IntegerType, nullable = false), - StructField("Name", StringType, nullable = false))) + StructField("Age", LongType), + StructField("Length", DoubleType), + StructField("Name", StringType))) - for (unsupportedAvroType <- Seq("""["null", "int", "long"]""", """["int", "long"]""")) { + for (optionalNull <- Seq(""""null",""", "")) { val avroSchema = s""" |{ | "type" : "record", | "name" : "test_schema", | "fields" : [ - | {"name": "Age", "type": $unsupportedAvroType}, + | {"name": "Age", "type": [$optionalNull "int", "long"]}, + | {"name": "Length", "type": [$optionalNull "float", "double"]}, | {"name": "Name", "type": ["null", "string"]} | ] |} """.stripMargin val df = spark.createDataFrame( - spark.sparkContext.parallelize(Seq(Row(2, "Aurora"))), catalystSchema) + spark.sparkContext.parallelize(Seq(Row(2L, 1.8D, "Aurora"), Row(1L, 0.9D, null))), + catalystSchema) + + withTempPath { tempDir => + df.write.format("avro").option("avroSchema", avroSchema).save(tempDir.getPath) + checkAnswer( + spark.read + .format("avro") + .option("avroSchema", avroSchema) + .load(tempDir.getPath), + df) + } + } + } + + test("non-matching complex union types") { + val catalystSchema = new StructType().add("Union", new StructType() + .add("member0", IntegerType) + .add("member1", new StructType().add("f1", StringType, nullable = false)) + ) + + val df = spark.createDataFrame( + spark.sparkContext.parallelize(Seq(Row(Row(1, null)))), catalystSchema) + + val recordS = 
SchemaBuilder.record("r").fields().requiredString("f1").endRecord() + val intS = Schema.create(Schema.Type.INT) + val nullS = Schema.create(Schema.Type.NULL) + for ((unionTypes, compatible) <- Seq( + (Seq(nullS, intS, recordS), true), + (Seq(intS, nullS, recordS), true), + (Seq(intS, recordS, nullS), true), + (Seq(intS, recordS), true), + (Seq(nullS, recordS, intS), false), + (Seq(nullS, recordS), false), + (Seq(nullS, SchemaBuilder.record("r").fields().requiredString("f2").endRecord()), false) + )) { + val avroSchema = SchemaBuilder.record("test_schema").fields() + .name("union").`type`(Schema.createUnion(unionTypes: _*)).noDefault() + .endRecord().toString() withTempPath { tempDir => - val message = intercept[SparkException] { + if (!compatible) { + intercept[SparkException] { + df.write.format("avro").option("avroSchema", avroSchema).save(tempDir.getPath) + } + } else { df.write.format("avro").option("avroSchema", avroSchema).save(tempDir.getPath) - }.getCause.getMessage - assert(message.contains("Only UNION of a null type and a non-null type is supported")) + checkAnswer( + spark.read + .format("avro") + .option("avroSchema", avroSchema) + .load(tempDir.getPath), + df) + } } } } @@ -1182,14 +1253,16 @@ abstract class AvroSuite sql("select interval 1 days").write.format("avro").mode("overwrite").save(tempDir) }.getMessage assert(msg.contains("Cannot save interval data type into external storage.") || - msg.contains("AVRO data source does not support interval data type.")) + msg.contains("Column `INTERVAL '1' DAY` has a data type of interval day, " + + "which is not supported by Avro.")) msg = intercept[AnalysisException] { spark.udf.register("testType", () => new IntervalData()) sql("select testType()").write.format("avro").mode("overwrite").save(tempDir) }.getMessage assert(msg.toLowerCase(Locale.ROOT) - .contains(s"avro data source does not support interval data type.")) + .contains("column `testtype()` has a data type of interval, " + + "which is not supported by avro.")) } } } @@ -1803,13 +1876,13 @@ abstract class AvroSuite spark .read .format("avro") - .option(AvroOptions.ignoreExtensionKey, false) + .option(AvroOptions.IGNORE_EXTENSION, false) .load(dir.getCanonicalPath) .count() } val deprecatedEvents = logAppender.loggingEvents .filter(_.getMessage.getFormattedMessage.contains( - s"Option ${AvroOptions.ignoreExtensionKey} is deprecated")) + s"Option ${AvroOptions.IGNORE_EXTENSION} is deprecated")) assert(deprecatedEvents.size === 1) } } @@ -1817,7 +1890,7 @@ abstract class AvroSuite // It generates input files for the test below: // "SPARK-31183, SPARK-37705: compatibility with Spark 2.4/3.2 in reading dates/timestamps" ignore("SPARK-31855: generate test files for checking compatibility with Spark 2.4/3.2") { - val resourceDir = "external/avro/src/test/resources" + val resourceDir = "connector/avro/src/test/resources" val version = SPARK_VERSION_SHORT.replaceAll("\\.", "_") def save( in: Seq[String], @@ -1932,7 +2005,7 @@ abstract class AvroSuite val e = intercept[SparkException] { df.write.format("avro").option("avroSchema", avroSchema).save(path3_x) } - assert(e.getCause.getCause.getCause.isInstanceOf[SparkUpgradeException]) + assert(e.getCause.getCause.isInstanceOf[SparkUpgradeException]) checkDefaultLegacyRead(oldPath) withSQLConf(SQLConf.AVRO_REBASE_MODE_IN_WRITE.key -> CORRECTED.toString) { @@ -2103,12 +2176,15 @@ abstract class AvroSuite } private def checkMetaData(path: java.io.File, key: String, expectedValue: String): Unit = { + val value = 
openDatumReader(path).asInstanceOf[DataFileReader[_]].getMetaString(key) + assert(value === expectedValue) + } + + private def openDatumReader(path: File): org.apache.avro.file.FileReader[GenericRecord] = { val avroFiles = path.listFiles() .filter(f => f.isFile && !f.getName.startsWith(".") && !f.getName.startsWith("_")) assert(avroFiles.length === 1) - val reader = DataFileReader.openReader(avroFiles(0), new GenericDatumReader[GenericRecord]()) - val value = reader.asInstanceOf[DataFileReader[_]].getMetaString(key) - assert(value === expectedValue) + DataFileReader.openReader(avroFiles(0), new GenericDatumReader[GenericRecord]()) } test("SPARK-31327: Write Spark version into Avro file metadata") { @@ -2183,7 +2259,7 @@ abstract class AvroSuite val e = intercept[SparkException] { df.write.format("avro").option("avroSchema", avroSchema).save(dir.getCanonicalPath) } - val errMsg = e.getCause.getCause.getCause.asInstanceOf[SparkUpgradeException].getMessage + val errMsg = e.getCause.getCause.asInstanceOf[SparkUpgradeException].getMessage assert(errMsg.contains("You may get a different result due to the upgrading")) } } @@ -2193,7 +2269,7 @@ abstract class AvroSuite val e = intercept[SparkException] { df.write.format("avro").save(dir.getCanonicalPath) } - val errMsg = e.getCause.getCause.getCause.asInstanceOf[SparkUpgradeException].getMessage + val errMsg = e.getCause.getCause.asInstanceOf[SparkUpgradeException].getMessage assert(errMsg.contains("You may get a different result due to the upgrading")) } } @@ -2218,14 +2294,18 @@ abstract class AvroSuite withView("v") { spark.range(1).createTempView("v") withTempDir { dir => - val e = intercept[AnalysisException] { - sql( - s""" - |CREATE TABLE test_ddl USING AVRO - |LOCATION '${dir}' - |AS SELECT ID, IF(ID=1,1,0) FROM v""".stripMargin) - }.getMessage - assert(e.contains("Column name \"(IF((ID = 1), 1, 0))\" contains invalid character(s).")) + checkError( + exception = intercept[AnalysisException] { + sql( + s""" + |CREATE TABLE test_ddl USING AVRO + |LOCATION '${dir}' + |AS SELECT ID, IF(ID=1,1,0) FROM v""".stripMargin) + }, + errorClass = "INVALID_COLUMN_NAME_AS_PATH", + parameters = Map( + "datasource" -> "AvroFileFormat", "columnName" -> "`(IF((ID = 1), 1, 0))`") + ) } withTempDir { dir => @@ -2271,6 +2351,20 @@ abstract class AvroSuite checkAnswer(df2, df.collect().toSeq) } } + + test("SPARK-40667: validate Avro Options") { + assert(AvroOptions.getAllOptions.size == 9) + // Please add validation on any new Avro options here + assert(AvroOptions.isValidOption("ignoreExtension")) + assert(AvroOptions.isValidOption("mode")) + assert(AvroOptions.isValidOption("recordName")) + assert(AvroOptions.isValidOption("compression")) + assert(AvroOptions.isValidOption("avroSchema")) + assert(AvroOptions.isValidOption("avroSchemaUrl")) + assert(AvroOptions.isValidOption("recordNamespace")) + assert(AvroOptions.isValidOption("positionalFieldMatching")) + assert(AvroOptions.isValidOption("datetimeRebaseMode")) + } } class AvroV1Suite extends AvroSuite { @@ -2283,20 +2377,28 @@ class AvroV1Suite extends AvroSuite { withView("v") { spark.range(1).createTempView("v") withTempDir { dir => - val e = intercept[AnalysisException] { - sql("SELECT ID, IF(ID=1,1,0) FROM v").write.mode(SaveMode.Overwrite) - .format("avro").save(dir.getCanonicalPath) - }.getMessage - assert(e.contains("Column name \"(IF((ID = 1), 1, 0))\" contains invalid character(s).")) + checkError( + exception = intercept[AnalysisException] { + sql("SELECT ID, IF(ID=1,1,0) FROM 
v").write.mode(SaveMode.Overwrite) + .format("avro").save(dir.getCanonicalPath) + }, + errorClass = "INVALID_COLUMN_NAME_AS_PATH", + parameters = Map( + "datasource" -> "AvroFileFormat", "columnName" -> "`(IF((ID = 1), 1, 0))`") + ) } withTempDir { dir => - val e = intercept[AnalysisException] { - sql("SELECT NAMED_STRUCT('(IF((ID = 1), 1, 0))', IF(ID=1,ID,0)) AS col1 FROM v") - .write.mode(SaveMode.Overwrite) - .format("avro").save(dir.getCanonicalPath) - }.getMessage - assert(e.contains("Column name \"(IF((ID = 1), 1, 0))\" contains invalid character(s).")) + checkError( + exception = intercept[AnalysisException] { + sql("SELECT NAMED_STRUCT('(IF((ID = 1), 1, 0))', IF(ID=1,ID,0)) AS col1 FROM v") + .write.mode(SaveMode.Overwrite) + .format("avro").save(dir.getCanonicalPath) + }, + errorClass = "INVALID_COLUMN_NAME_AS_PATH", + parameters = Map( + "datasource" -> "AvroFileFormat", "columnName" -> "`(IF((ID = 1), 1, 0))`") + ) } } } @@ -2335,14 +2437,15 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper { }) val fileScan = df.queryExecution.executedPlan collectFirst { - case BatchScanExec(_, f: AvroScan, _, _) => f + case BatchScanExec(_, f: AvroScan, _, _, _, _, _, _, _) => f } assert(fileScan.nonEmpty) assert(fileScan.get.partitionFilters.nonEmpty) assert(fileScan.get.dataFilters.nonEmpty) assert(fileScan.get.planInputPartitions().forall { partition => partition.asInstanceOf[FilePartition].files.forall { file => - file.filePath.contains("p1=1") && file.filePath.contains("p2=2") + file.urlEncodedPath.contains("p1=1") && + file.urlEncodedPath.contains("p2=2") } }) checkAnswer(df, Row("b", 1, 2)) @@ -2368,7 +2471,7 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper { assert(filterCondition.isDefined) val fileScan = df.queryExecution.executedPlan collectFirst { - case BatchScanExec(_, f: AvroScan, _, _) => f + case BatchScanExec(_, f: AvroScan, _, _, _, _, _, _, _) => f } assert(fileScan.nonEmpty) assert(fileScan.get.partitionFilters.isEmpty) @@ -2408,7 +2511,7 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper { val basePath = dir.getCanonicalPath + "/avro" val expected_plan_fragment = s""" - |\\(1\\) BatchScan + |\\(1\\) BatchScan avro file:$basePath |Output \\[2\\]: \\[value#xL, id#x\\] |DataFilters: \\[isnotnull\\(value#xL\\), \\(value#xL > 2\\)\\] |Format: avro @@ -2449,7 +2552,7 @@ class AvroV2Suite extends AvroSuite with ExplainSuiteHelper { .where("value = 'a'") val fileScan = df.queryExecution.executedPlan collectFirst { - case BatchScanExec(_, f: AvroScan, _, _) => f + case BatchScanExec(_, f: AvroScan, _, _, _, _, _, _, _) => f } assert(fileScan.nonEmpty) if (filtersPushdown) { diff --git a/external/avro/src/test/scala/org/apache/spark/sql/avro/DeprecatedAvroFunctionsSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/avro/DeprecatedAvroFunctionsSuite.scala similarity index 89% rename from external/avro/src/test/scala/org/apache/spark/sql/avro/DeprecatedAvroFunctionsSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/avro/DeprecatedAvroFunctionsSuite.scala index cdfa1b118b18d..40ed487087c8a 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/avro/DeprecatedAvroFunctionsSuite.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/avro/DeprecatedAvroFunctionsSuite.scala @@ -34,9 +34,9 @@ class DeprecatedAvroFunctionsSuite extends QueryTest with SharedSparkSession { import testImplicits._ test("roundtrip in to_avro and from_avro - int and string") { - val df = spark.range(10).select('id, 
'id.cast("string").as("str")) + val df = spark.range(10).select($"id", $"id".cast("string").as("str")) - val avroDF = df.select(to_avro('id).as("a"), to_avro('str).as("b")) + val avroDF = df.select(to_avro($"id").as("a"), to_avro($"str").as("b")) val avroTypeLong = s""" |{ | "type": "int", @@ -49,12 +49,12 @@ class DeprecatedAvroFunctionsSuite extends QueryTest with SharedSparkSession { | "name": "str" |} """.stripMargin - checkAnswer(avroDF.select(from_avro('a, avroTypeLong), from_avro('b, avroTypeStr)), df) + checkAnswer(avroDF.select(from_avro($"a", avroTypeLong), from_avro($"b", avroTypeStr)), df) } test("roundtrip in to_avro and from_avro - struct") { - val df = spark.range(10).select(struct('id, 'id.cast("string").as("str")).as("struct")) - val avroStructDF = df.select(to_avro('struct).as("avro")) + val df = spark.range(10).select(struct($"id", $"id".cast("string").as("str")).as("struct")) + val avroStructDF = df.select(to_avro($"struct").as("avro")) val avroTypeStruct = s""" |{ | "type": "record", @@ -65,7 +65,7 @@ class DeprecatedAvroFunctionsSuite extends QueryTest with SharedSparkSession { | ] |} """.stripMargin - checkAnswer(avroStructDF.select(from_avro('avro, avroTypeStruct)), df) + checkAnswer(avroStructDF.select(from_avro($"avro", avroTypeStruct)), df) } test("roundtrip in to_avro and from_avro - array with null") { diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala b/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala similarity index 99% rename from external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala index 7368543642b99..aa0d713bbfb77 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroReadBenchmark.scala @@ -33,8 +33,8 @@ import org.apache.spark.sql.types._ * To run this benchmark: * 1. without sbt: bin/spark-submit --class * --jars ,,, - * 2. build/sbt "avro/test:runMain " - * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain " + * 2. build/sbt "avro/Test/runMain " + * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/Test/runMain " * Results will be written to "benchmarks/AvroReadBenchmark-results.txt". * }}} */ diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala b/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala similarity index 96% rename from external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala index 7f9febb5b14e5..d1db290f34b3b 100644 --- a/external/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala +++ b/connector/avro/src/test/scala/org/apache/spark/sql/execution/benchmark/AvroWriteBenchmark.scala @@ -30,8 +30,8 @@ import org.apache.spark.storage.StorageLevel * --jars ,, * , * - * 2. build/sbt "sql/test:runMain " - * 3. generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/test:runMain " + * 2. build/sbt "avro/Test/runMain " + * 3. 
generate result: SPARK_GENERATE_BENCHMARK_FILES=1 build/sbt "avro/Test/runMain " * Results will be written to "benchmarks/AvroWriteBenchmark-results.txt". * }}} */ diff --git a/external/avro/src/test/scala/org/apache/spark/sql/execution/datasources/AvroReadSchemaSuite.scala b/connector/avro/src/test/scala/org/apache/spark/sql/execution/datasources/AvroReadSchemaSuite.scala similarity index 100% rename from external/avro/src/test/scala/org/apache/spark/sql/execution/datasources/AvroReadSchemaSuite.scala rename to connector/avro/src/test/scala/org/apache/spark/sql/execution/datasources/AvroReadSchemaSuite.scala diff --git a/connector/connect/README.md b/connector/connect/README.md new file mode 100644 index 0000000000000..dfe49cea3df1f --- /dev/null +++ b/connector/connect/README.md @@ -0,0 +1,46 @@ +# Spark Connect + +This module contains the implementation of Spark Connect which is a logical plan +facade for the implementation in Spark. Spark Connect is directly integrated into the build +of Spark. + +The documentation linked here is specifically for developers of Spark Connect and not +directly intended to be end-user documentation. + +## Development Topics + +### Guidelines for new clients + +When contributing a new client please be aware that we strive to have a common +user experience across all languages. Please follow the below guidelines: + +* [Connection string configuration](docs/client-connection-string.md) +* [Adding new messages](docs/adding-proto-messages.md) in the Spark Connect protocol. + +### Python client development + +Python-specific development guidelines are located in [python/docs/source/development/testing.rst](https://github.com/apache/spark/blob/master/python/docs/source/development/testing.rst) that is published at [Development tab](https://spark.apache.org/docs/latest/api/python/development/index.html) in PySpark documentation. + +### Build with user-defined `protoc` and `protoc-gen-grpc-java` + +When the user cannot use the official `protoc` and `protoc-gen-grpc-java` binary files to build the `connect` module in the compilation environment, +for example, compiling `connect` module on CentOS 6 or CentOS 7 which the default `glibc` version is less than 2.14, we can try to compile and test by +specifying the user-defined `protoc` and `protoc-gen-grpc-java` binary files as follows: + +```bash +export SPARK_PROTOC_EXEC_PATH=/path-to-protoc-exe +export CONNECT_PLUGIN_EXEC_PATH=/path-to-protoc-gen-grpc-java-exe +./build/mvn -Phive -Puser-defined-protoc clean package +``` + +or + +```bash +export SPARK_PROTOC_EXEC_PATH=/path-to-protoc-exe +export CONNECT_PLUGIN_EXEC_PATH=/path-to-protoc-gen-grpc-java-exe +./build/sbt -Puser-defined-protoc clean package +``` + +The user-defined `protoc` and `protoc-gen-grpc-java` binary files can be produced in the user's compilation environment by source code compilation, +for compilation steps, please refer to [protobuf](https://github.com/protocolbuffers/protobuf) and [grpc-java](https://github.com/grpc/grpc-java). + diff --git a/connector/connect/bin/spark-connect b/connector/connect/bin/spark-connect new file mode 100755 index 0000000000000..772a88a04f3eb --- /dev/null +++ b/connector/connect/bin/spark-connect @@ -0,0 +1,39 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. 
+# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Start the spark-connect with server logs printed in the standard output. The script rebuild the +# server dependencies and start the server at the default port. This can be used to debug client +# during client development. + +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/../../..; pwd)" +cd "$FWDIR" +export SPARK_HOME=$FWDIR + +# Determine the Scala version used in Spark +SCALA_BINARY_VER=`grep "scala.binary.version" "${SPARK_HOME}/pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` +SCALA_ARG="-Pscala-${SCALA_BINARY_VER}" + +# Build the jars needed for spark submit and spark connect +build/sbt "${SCALA_ARG}" -Phive -Pconnect package + +# This jar is already in the classpath, but the submit commands wants a jar as the input. +CONNECT_JAR=`ls "${SPARK_HOME}"/assembly/target/scala-"${SCALA_BINARY_VER}"/jars/spark-connect_*.jar | paste -sd ',' -` + +exec "${SPARK_HOME}"/bin/spark-submit "$@" --class org.apache.spark.sql.connect.SimpleSparkConnectService "$CONNECT_JAR" diff --git a/connector/connect/bin/spark-connect-scala-client b/connector/connect/bin/spark-connect-scala-client new file mode 100755 index 0000000000000..e7a15c56d7c4d --- /dev/null +++ b/connector/connect/bin/spark-connect-scala-client @@ -0,0 +1,48 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Use the spark connect JVM client to connect to a spark connect server. +# +# Start a local server: +# A local spark-connect server with default settings can be started using the following command: +# `connector/connect/bin/spark-connect` +# The client should be able to connect to this server directly with the default client settings. +# +# Connect to a remote server: +# To connect to a remote server, use env var `SPARK_REMOTE` to configure the client connection +# string. e.g. 
+# `export SPARK_REMOTE="sc://:/;token=;="` + +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/../../..; pwd)" +cd "$FWDIR" +export SPARK_HOME=$FWDIR + +# Determine the Scala version used in Spark +SCALA_BINARY_VER=`grep "scala.binary.version" "${SPARK_HOME}/pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` +SCALA_VER=`grep "scala.version" "${SPARK_HOME}/pom.xml" | grep ${SCALA_BINARY_VER} | head -n1 | awk -F '[<>]' '{print $3}'` +SCALA_ARG="-Pscala-${SCALA_BINARY_VER}" + +# Build the jars needed for spark connect JVM client +build/sbt "${SCALA_ARG}" "sql/package;connect-client-jvm/assembly" + +CONNECT_CLASSPATH="$(build/sbt "${SCALA_ARG}" -DcopyDependencies=false "export connect-client-jvm/fullClasspath" | grep jar | tail -n1)" +SQL_CLASSPATH="$(build/sbt "${SCALA_ARG}" -DcopyDependencies=false "export sql/fullClasspath" | grep jar | tail -n1)" + +exec java -cp "$CONNECT_CLASSPATH:$SQL_CLASSPATH" org.apache.spark.sql.application.ConnectRepl "$@" \ No newline at end of file diff --git a/connector/connect/bin/spark-connect-shell b/connector/connect/bin/spark-connect-shell new file mode 100755 index 0000000000000..0fcf831e03db1 --- /dev/null +++ b/connector/connect/bin/spark-connect-shell @@ -0,0 +1,36 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# The spark connect shell for development. This shell script builds the spark connect server with +# all dependencies and starts the server at the default port. +# Use `/bin/spark-connect-shell` instead if rebuilding the dependency jars are not needed. 
+ +# Go to the Spark project root directory +FWDIR="$(cd "`dirname "$0"`"/../../..; pwd)" +cd "$FWDIR" +export SPARK_HOME=$FWDIR + +# Determine the Scala version used in Spark +SCALA_BINARY_VER=`grep "scala.binary.version" "${SPARK_HOME}/pom.xml" | head -n1 | awk -F '[<>]' '{print $3}'` +SCALA_ARG="-Pscala-${SCALA_BINARY_VER}" + +# Build the jars needed for spark submit and spark connect +build/sbt "${SCALA_ARG}" -Phive -Pconnect package + +exec "${SPARK_HOME}"/bin/spark-shell --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin "$@" diff --git a/connector/connect/client/jvm/pom.xml b/connector/connect/client/jvm/pom.xml new file mode 100644 index 0000000000000..f16761d3a6ae2 --- /dev/null +++ b/connector/connect/client/jvm/pom.xml @@ -0,0 +1,227 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../../../pom.xml + + + spark-connect-client-jvm_2.12 + jar + Spark Project Connect Client + https://spark.apache.org/ + + connect-client-jvm + 31.0.1-jre + 1.0.1 + 1.1.0 + + + + + org.apache.spark + spark-connect-common_${scala.binary.version} + ${project.version} + + + com.google.guava + guava + + + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + provided + + + com.google.guava + guava + + + + + com.google.protobuf + protobuf-java + ${protobuf.version} + compile + + + com.google.guava + guava + ${guava.version} + compile + + + com.google.guava + failureaccess + ${guava.failureaccess.version} + compile + + + io.netty + netty-codec-http2 + ${netty.version} + + + io.netty + netty-handler-proxy + ${netty.version} + + + io.netty + netty-transport-native-unix-common + ${netty.version} + + + com.lihaoyi + ammonite_${scala.version} + ${ammonite.version} + provided + + + org.scala-lang.modules + scala-xml_${scala.binary.version} + + + + + org.apache.spark + spark-connect-common_${scala.binary.version} + ${project.version} + test-jar + test + + + com.google.guava + guava + + + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.mockito + mockito-core + test + + + + com.typesafe + mima-core_${scala.binary.version} + ${mima.version} + test + + + + target/scala-${scala.binary.version}/test-classes + + + + + org.apache.maven.plugins + maven-shade-plugin + + false + + + com.google.android:* + com.google.api.grpc:* + com.google.code.findbugs:* + com.google.code.gson:* + com.google.errorprone:* + com.google.guava:* + com.google.j2objc:* + com.google.protobuf:* + io.grpc:* + io.netty:* + io.perfmark:* + org.codehaus.mojo:* + org.checkerframework:* + org.apache.spark:spark-connect-common_${scala.binary.version} + + + + + io.grpc + ${spark.shade.packageName}.connect.client.io.grpc + + io.grpc.** + + + + com.google + ${spark.shade.packageName}.connect.client.com.google + + + io.netty + ${spark.shade.packageName}.connect.client.io.netty + + + org.checkerframework + ${spark.shade.packageName}.connect.client.org.checkerframework + + + javax.annotation + ${spark.shade.packageName}.connect.client.javax.annotation + + + io.perfmark + ${spark.shade.packageName}.connect.client.io.perfmark + + + org.codehaus + ${spark.shade.packageName}.connect.client.org.codehaus + + + android.annotation + ${spark.shade.packageName}.connect.client.android.annotation + + + + + + + + + + org.apache.maven.plugins + maven-jar-plugin + + + prepare-test-jar + test-compile + + test-jar + + + + + + + \ No newline at end of file diff --git a/connector/connect/client/jvm/src/main/java/org/apache/spark/sql/SaveMode.java 
b/connector/connect/client/jvm/src/main/java/org/apache/spark/sql/SaveMode.java new file mode 100644 index 0000000000000..95af157687c85 --- /dev/null +++ b/connector/connect/client/jvm/src/main/java/org/apache/spark/sql/SaveMode.java @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql; + +import org.apache.spark.annotation.Stable; + +/** + * SaveMode is used to specify the expected behavior of saving a DataFrame to a data source. + * + * @since 3.4.0 + */ +@Stable +public enum SaveMode { + /** + * Append mode means that when saving a DataFrame to a data source, if data/table already exists, + * contents of the DataFrame are expected to be appended to existing data. + * + * @since 3.4.0 + */ + Append, + /** + * Overwrite mode means that when saving a DataFrame to a data source, + * if data/table already exists, existing data is expected to be overwritten by the contents of + * the DataFrame. + * + * @since 3.4.0 + */ + Overwrite, + /** + * ErrorIfExists mode means that when saving a DataFrame to a data source, if data already exists, + * an exception is expected to be thrown. + * + * @since 3.4.0 + */ + ErrorIfExists, + /** + * Ignore mode means that when saving a DataFrame to a data source, if data already exists, + * the save operation is expected to not save the contents of the DataFrame and to not + * change the existing data. + * + * @since 3.4.0 + */ + Ignore +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala new file mode 100644 index 0000000000000..6a660a7482e27 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Column.scala @@ -0,0 +1,1478 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.Expression.SortOrder.NullOrdering +import org.apache.spark.connect.proto.Expression.SortOrder.SortDirection +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder +import org.apache.spark.sql.catalyst.parser.CatalystSqlParser +import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.types._ + +/** + * A column that will be computed based on the data in a `DataFrame`. + * + * A new column can be constructed based on the input columns present in a DataFrame: + * + * {{{ + * df("columnName") // On a specific `df` DataFrame. + * col("columnName") // A generic column not yet associated with a DataFrame. + * col("columnName.field") // Extracting a struct field + * col("`a.column.with.dots`") // Escape `.` in column names. + * $"columnName" // Scala short hand for a named column. + * }}} + * + * [[Column]] objects can be composed to form complex expressions: + * + * {{{ + * $"a" + 1 + * }}} + * + * @since 3.4.0 + */ +class Column private[sql] (@DeveloperApi val expr: proto.Expression) extends Logging { + + private[sql] def this(name: String, planId: Option[Long]) = + this(Column.nameToExpression(name, planId)) + + private[sql] def this(name: String) = + this(name, None) + + private def fn(name: String): Column = Column.fn(name, this) + private def fn(name: String, other: Column): Column = Column.fn(name, this, other) + private def fn(name: String, other: Any): Column = Column.fn(name, this, lit(other)) + + override def toString: String = expr.toString + + override def equals(that: Any): Boolean = that match { + case that: Column => expr == that.expr + case _ => false + } + + override def hashCode: Int = expr.hashCode() + + /** + * Provides a type hint about the expected return value of this column. This information can be + * used by operations such as `select` on a [[Dataset]] to automatically convert the results + * into the correct JVM types. + * @since 3.4.0 + */ + def as[U: Encoder]: TypedColumn[Any, U] = { + val encoder = implicitly[Encoder[U]].asInstanceOf[AgnosticEncoder[U]] + new TypedColumn[Any, U](expr, encoder) + } + + /** + * Extracts a value or values from a complex type. The following types of extraction are + * supported: + * - Given an Array, an integer ordinal can be used to retrieve a single value. + * - Given a Map, a key of the correct type can be used to retrieve an individual value. + * - Given a Struct, a string fieldName can be used to extract that field. + * - Given an Array of Structs, a string fieldName can be used to extract filed of every + * struct in that array, and return an Array of fields. + * @group expr_ops + * @since 3.4.0 + */ + def apply(extraction: Any): Column = Column { builder => + builder.getUnresolvedExtractValueBuilder + .setChild(expr) + .setExtraction(lit(extraction).expr) + } + + /** + * Unary minus, i.e. negate the expression. + * {{{ + * // Scala: select the amount column and negates all values. 
+ * df.select( -df("amount") ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df.select( negate(col("amount") ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def unary_- : Column = fn("negative") + + /** + * Inversion of boolean expression, i.e. NOT. + * {{{ + * // Scala: select rows that are not active (isActive === false) + * df.filter( !df("isActive") ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df.filter( not(df.col("isActive")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def unary_! : Column = fn("!") + + /** + * Equality test. + * {{{ + * // Scala: + * df.filter( df("colA") === df("colB") ) + * + * // Java + * import static org.apache.spark.sql.functions.*; + * df.filter( col("colA").equalTo(col("colB")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def ===(other: Any): Column = fn("=", other) + + /** + * Equality test. + * {{{ + * // Scala: + * df.filter( df("colA") === df("colB") ) + * + * // Java + * import static org.apache.spark.sql.functions.*; + * df.filter( col("colA").equalTo(col("colB")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def equalTo(other: Any): Column = this === other + + /** + * Inequality test. + * {{{ + * // Scala: + * df.select( df("colA") =!= df("colB") ) + * df.select( !(df("colA") === df("colB")) ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df.filter( col("colA").notEqual(col("colB")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def =!=(other: Any): Column = !(this === other) + + /** + * Inequality test. + * {{{ + * // Scala: + * df.select( df("colA") !== df("colB") ) + * df.select( !(df("colA") === df("colB")) ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df.filter( col("colA").notEqual(col("colB")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + @deprecated("!== does not have the same precedence as ===, use =!= instead", "2.0.0") + def !==(other: Any): Column = this =!= other + + /** + * Inequality test. + * {{{ + * // Scala: + * df.select( df("colA") !== df("colB") ) + * df.select( !(df("colA") === df("colB")) ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df.filter( col("colA").notEqual(col("colB")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def notEqual(other: Any): Column = this =!= other + + /** + * Greater than. + * {{{ + * // Scala: The following selects people older than 21. + * people.select( people("age") > 21 ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * people.select( people.col("age").gt(21) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def >(other: Any): Column = fn(">", other) + + /** + * Greater than. + * {{{ + * // Scala: The following selects people older than 21. + * people.select( people("age") > lit(21) ) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * people.select( people.col("age").gt(21) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def gt(other: Any): Column = this > other + + /** + * Less than. + * {{{ + * // Scala: The following selects people younger than 21. + * people.select( people("age") < 21 ) + * + * // Java: + * people.select( people.col("age").lt(21) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def <(other: Any): Column = fn("<", other) + + /** + * Less than. + * {{{ + * // Scala: The following selects people younger than 21. 
+ * people.select( people("age") < 21 ) + * + * // Java: + * people.select( people.col("age").lt(21) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def lt(other: Any): Column = this < other + + /** + * Less than or equal to. + * {{{ + * // Scala: The following selects people age 21 or younger than 21. + * people.select( people("age") <= 21 ) + * + * // Java: + * people.select( people.col("age").leq(21) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def <=(other: Any): Column = fn("<=", other) + + /** + * Less than or equal to. + * {{{ + * // Scala: The following selects people age 21 or younger than 21. + * people.select( people("age") <= 21 ) + * + * // Java: + * people.select( people.col("age").leq(21) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def leq(other: Any): Column = this <= other + + /** + * Greater than or equal to an expression. + * {{{ + * // Scala: The following selects people age 21 or older than 21. + * people.select( people("age") >= 21 ) + * + * // Java: + * people.select( people.col("age").geq(21) ) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def >=(other: Any): Column = fn(">=", other) + + /** + * Greater than or equal to an expression. + * {{{ + * // Scala: The following selects people age 21 or older than 21. + * people.select( people("age") >= 21 ) + * + * // Java: + * people.select( people.col("age").geq(21) ) + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def geq(other: Any): Column = this >= other + + /** + * Equality test that is safe for null values. + * + * @group expr_ops + * @since 3.4.0 + */ + def <=>(other: Any): Column = fn("<=>", other) + + /** + * Equality test that is safe for null values. + * + * @group java_expr_ops + * @since 3.4.0 + */ + def eqNullSafe(other: Any): Column = this <=> other + + private def extractWhen(name: String): java.util.List[proto.Expression] = { + def fail(): Nothing = { + throw new IllegalArgumentException( + s"$name() can only be applied on a Column previously generated by when() function") + } + if (!expr.hasUnresolvedFunction) { + fail() + } + val parentFn = expr.getUnresolvedFunction + if (parentFn.getFunctionName != "when") { + fail() + } + parentFn.getArgumentsList + } + + /** + * Evaluates a list of conditions and returns one of multiple possible result expressions. If + * otherwise is not defined at the end, null is returned for unmatched conditions. + * + * {{{ + * // Example: encoding gender string column into integer. + * + * // Scala: + * people.select(when(people("gender") === "male", 0) + * .when(people("gender") === "female", 1) + * .otherwise(2)) + * + * // Java: + * people.select(when(col("gender").equalTo("male"), 0) + * .when(col("gender").equalTo("female"), 1) + * .otherwise(2)) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def when(condition: Column, value: Any): Column = { + val expressions = extractWhen("when") + if (expressions.size() % 2 == 1) { + throw new IllegalArgumentException("when() cannot be applied once otherwise() is applied") + } + Column { builder => + builder.getUnresolvedFunctionBuilder + .setFunctionName("when") + .addAllArguments(expressions) + .addArguments(condition.expr) + .addArguments(lit(value).expr) + } + } + + /** + * Evaluates a list of conditions and returns one of multiple possible result expressions. If + * otherwise is not defined at the end, null is returned for unmatched conditions. + * + * {{{ + * // Example: encoding gender string column into integer. 
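The symbolic and Java-friendly comparison operators are interchangeable, and `<=>` differs from `===` only in how nulls compare. A small sketch, assuming a DataFrame `people` with columns `age`, `nickname` and `alias`:

{{{
import org.apache.spark.sql.functions.col

// assumes a DataFrame `people` with columns `age`, `nickname` and `alias`
people.filter(col("age") >= 18)
people.filter(col("age").geq(18))               // Java-friendly spelling of the same test
people.filter(col("nickname") <=> col("alias")) // null-safe: two nulls compare as equal
}}}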
+ * + * // Scala: + * people.select(when(people("gender") === "male", 0) + * .when(people("gender") === "female", 1) + * .otherwise(2)) + * + * // Java: + * people.select(when(col("gender").equalTo("male"), 0) + * .when(col("gender").equalTo("female"), 1) + * .otherwise(2)) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def otherwise(value: Any): Column = { + val expressions = extractWhen("otherwise") + if (expressions.size() % 2 == 1) { + throw new IllegalArgumentException( + "otherwise() can only be applied once on a Column previously generated by when()") + } + Column { builder => + builder.getUnresolvedFunctionBuilder + .setFunctionName("when") + .addAllArguments(expressions) + .addArguments(lit(value).expr) + } + } + + /** + * True if the current column is between the lower bound and upper bound, inclusive. + * + * @group java_expr_ops + * @since 3.4.0 + */ + def between(lowerBound: Any, upperBound: Any): Column = { + (this >= lowerBound) && (this <= upperBound) + } + + /** + * True if the current expression is NaN. + * + * @group expr_ops + * @since 3.4.0 + */ + def isNaN: Column = fn("isNaN") + + /** + * True if the current expression is null. + * + * @group expr_ops + * @since 3.4.0 + */ + def isNull: Column = fn("isNull") + + /** + * True if the current expression is NOT null. + * + * @group expr_ops + * @since 3.4.0 + */ + def isNotNull: Column = fn("isNotNull") + + /** + * Boolean OR. + * {{{ + * // Scala: The following selects people that are in school or employed. + * people.filter( people("inSchool") || people("isEmployed") ) + * + * // Java: + * people.filter( people.col("inSchool").or(people.col("isEmployed")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def ||(other: Any): Column = fn("or", other) + + /** + * Boolean OR. + * {{{ + * // Scala: The following selects people that are in school or employed. + * people.filter( people("inSchool") || people("isEmployed") ) + * + * // Java: + * people.filter( people.col("inSchool").or(people.col("isEmployed")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def or(other: Column): Column = this || other + + /** + * Boolean AND. + * {{{ + * // Scala: The following selects people that are in school and employed at the same time. + * people.select( people("inSchool") && people("isEmployed") ) + * + * // Java: + * people.select( people.col("inSchool").and(people.col("isEmployed")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def &&(other: Any): Column = fn("and", other) + + /** + * Boolean AND. + * {{{ + * // Scala: The following selects people that are in school and employed at the same time. + * people.select( people("inSchool") && people("isEmployed") ) + * + * // Java: + * people.select( people.col("inSchool").and(people.col("isEmployed")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def and(other: Column): Column = this && other + + /** + * Sum of this expression and another expression. + * {{{ + * // Scala: The following selects the sum of a person's height and weight. + * people.select( people("height") + people("weight") ) + * + * // Java: + * people.select( people.col("height").plus(people.col("weight")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def +(other: Any): Column = fn("+", other) + + /** + * Sum of this expression and another expression. + * {{{ + * // Scala: The following selects the sum of a person's height and weight. 
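`when`/`otherwise` and the boolean operators compose into ordinary column expressions. A sketch, assuming a DataFrame `people` with columns `gender` and `age` and the standard `when` helper from `functions`:

{{{
import org.apache.spark.sql.functions.{col, when}

// assumes a DataFrame `people` with columns `gender` and `age`
people.select(
  when(col("gender") === "male", 0)
    .when(col("gender") === "female", 1)
    .otherwise(2)
    .as("gender_code"),
  (col("age").isNotNull && col("age").between(0, 120)).as("age_is_plausible"))
}}}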
+ * people.select( people("height") + people("weight") ) + * + * // Java: + * people.select( people.col("height").plus(people.col("weight")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def plus(other: Any): Column = this + other + + /** + * Subtraction. Subtract the other expression from this expression. + * {{{ + * // Scala: The following selects the difference between people's height and their weight. + * people.select( people("height") - people("weight") ) + * + * // Java: + * people.select( people.col("height").minus(people.col("weight")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def -(other: Any): Column = fn("-", other) + + /** + * Subtraction. Subtract the other expression from this expression. + * {{{ + * // Scala: The following selects the difference between people's height and their weight. + * people.select( people("height") - people("weight") ) + * + * // Java: + * people.select( people.col("height").minus(people.col("weight")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def minus(other: Any): Column = this - other + + /** + * Multiplication of this expression and another expression. + * {{{ + * // Scala: The following multiplies a person's height by their weight. + * people.select( people("height") * people("weight") ) + * + * // Java: + * people.select( people.col("height").multiply(people.col("weight")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def *(other: Any): Column = fn("*", other) + + /** + * Multiplication of this expression and another expression. + * {{{ + * // Scala: The following multiplies a person's height by their weight. + * people.select( people("height") * people("weight") ) + * + * // Java: + * people.select( people.col("height").multiply(people.col("weight")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def multiply(other: Any): Column = this * other + + /** + * Division this expression by another expression. + * {{{ + * // Scala: The following divides a person's height by their weight. + * people.select( people("height") / people("weight") ) + * + * // Java: + * people.select( people.col("height").divide(people.col("weight")) ); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def /(other: Any): Column = fn("/", other) + + /** + * Division this expression by another expression. + * {{{ + * // Scala: The following divides a person's height by their weight. + * people.select( people("height") / people("weight") ) + * + * // Java: + * people.select( people.col("height").divide(people.col("weight")) ); + * }}} + * + * @group java_expr_ops + * @since 3.4.0 + */ + def divide(other: Any): Column = this / other + + /** + * Modulo (a.k.a. remainder) expression. + * + * @group expr_ops + * @since 3.4.0 + */ + def %(other: Any): Column = fn("%", other) + + /** + * Modulo (a.k.a. remainder) expression. + * + * @group java_expr_ops + * @since 3.4.0 + */ + def mod(other: Any): Column = this % other + + /** + * A boolean expression that is evaluated to true if the value of this expression is contained + * by the evaluated values of the arguments. + * + * Note: Since the type of the elements in the list are inferred only during the run time, the + * elements will be "up-casted" to the most common type for comparison. For eg: 1) In the case + * of "Int vs String", the "Int" will be up-casted to "String" and the comparison will look like + * "String vs String". 
2) In the case of "Float vs Double", the "Float" will be up-casted to + * "Double" and the comparison will look like "Double vs Double" + * + * @group expr_ops + * @since 3.4.0 + */ + @scala.annotation.varargs + def isin(list: Any*): Column = Column.fn("in", this +: list.map(lit): _*) + + /** + * A boolean expression that is evaluated to true if the value of this expression is contained + * by the provided collection. + * + * Note: Since the type of the elements in the collection are inferred only during the run time, + * the elements will be "up-casted" to the most common type for comparison. For eg: 1) In the + * case of "Int vs String", the "Int" will be up-casted to "String" and the comparison will look + * like "String vs String". 2) In the case of "Float vs Double", the "Float" will be up-casted + * to "Double" and the comparison will look like "Double vs Double" + * + * @group expr_ops + * @since 3.4.0 + */ + def isInCollection(values: scala.collection.Iterable[_]): Column = isin(values.toSeq: _*) + + /** + * A boolean expression that is evaluated to true if the value of this expression is contained + * by the provided collection. + * + * Note: Since the type of the elements in the collection are inferred only during the run time, + * the elements will be "up-casted" to the most common type for comparison. For eg: 1) In the + * case of "Int vs String", the "Int" will be up-casted to "String" and the comparison will look + * like "String vs String". 2) In the case of "Float vs Double", the "Float" will be up-casted + * to "Double" and the comparison will look like "Double vs Double" + * + * @group java_expr_ops + * @since 3.4.0 + */ + def isInCollection(values: java.lang.Iterable[_]): Column = isInCollection(values.asScala) + + /** + * SQL like expression. Returns a boolean column based on a SQL LIKE match. + * + * @group expr_ops + * @since 3.4.0 + */ + def like(literal: String): Column = fn("like", literal) + + /** + * SQL RLIKE expression (LIKE with Regex). Returns a boolean column based on a regex match. + * + * @group expr_ops + * @since 3.4.0 + */ + def rlike(literal: String): Column = fn("rlike", literal) + + /** + * SQL ILIKE expression (case insensitive LIKE). + * + * @group expr_ops + * @since 3.4.0 + */ + def ilike(literal: String): Column = fn("ilike", literal) + + /** + * An expression that gets an item at position `ordinal` out of an array, or gets a value by key + * `key` in a `MapType`. + * + * @group expr_ops + * @since 3.4.0 + */ + def getItem(key: Any): Column = apply(key) + + // scalastyle:off line.size.limit + /** + * An expression that adds/replaces field in `StructType` by name. 
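The membership and pattern predicates above in one place, assuming a DataFrame `df` with string columns `country` and `name` (names are illustrative):

{{{
import org.apache.spark.sql.functions.col

df.filter(col("country").isin("DE", "FR", "NL"))
df.filter(col("country").isInCollection(Seq("DE", "FR", "NL")))  // same test, from a collection
df.filter(col("name").like("A%"))     // SQL LIKE pattern
df.filter(col("name").rlike("^A.*"))  // regular expression
df.filter(col("name").ilike("a%"))    // case-insensitive LIKE
}}}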
+ * + * {{{ + * val df = sql("SELECT named_struct('a', 1, 'b', 2) struct_col") + * df.select($"struct_col".withField("c", lit(3))) + * // result: {"a":1,"b":2,"c":3} + * + * val df = sql("SELECT named_struct('a', 1, 'b', 2) struct_col") + * df.select($"struct_col".withField("b", lit(3))) + * // result: {"a":1,"b":3} + * + * val df = sql("SELECT CAST(NULL AS struct) struct_col") + * df.select($"struct_col".withField("c", lit(3))) + * // result: null of type struct + * + * val df = sql("SELECT named_struct('a', 1, 'b', 2, 'b', 3) struct_col") + * df.select($"struct_col".withField("b", lit(100))) + * // result: {"a":1,"b":100,"b":100} + * + * val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col") + * df.select($"struct_col".withField("a.c", lit(3))) + * // result: {"a":{"a":1,"b":2,"c":3}} + * + * val df = sql("SELECT named_struct('a', named_struct('b', 1), 'a', named_struct('c', 2)) struct_col") + * df.select($"struct_col".withField("a.c", lit(3))) + * // result: org.apache.spark.sql.AnalysisException: Ambiguous reference to fields + * }}} + * + * This method supports adding/replacing nested fields directly e.g. + * + * {{{ + * val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col") + * df.select($"struct_col".withField("a.c", lit(3)).withField("a.d", lit(4))) + * // result: {"a":{"a":1,"b":2,"c":3,"d":4}} + * }}} + * + * However, if you are going to add/replace multiple nested fields, it is more optimal to + * extract out the nested struct before adding/replacing multiple fields e.g. + * + * {{{ + * val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col") + * df.select($"struct_col".withField("a", $"struct_col.a".withField("c", lit(3)).withField("d", lit(4)))) + * // result: {"a":{"a":1,"b":2,"c":3,"d":4}} + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def withField(fieldName: String, col: Column): Column = { + require(fieldName != null, "fieldName cannot be null") + require(col != null, "col cannot be null") + Column { builder => + builder.getUpdateFieldsBuilder + .setStructExpression(expr) + .setFieldName(fieldName) + .setValueExpression(col.expr) + } + } + + // scalastyle:off line.size.limit + /** + * An expression that drops fields in `StructType` by name. This is a no-op if schema doesn't + * contain field name(s). 
+ * + * {{{ + * val df = sql("SELECT named_struct('a', 1, 'b', 2) struct_col") + * df.select($"struct_col".dropFields("b")) + * // result: {"a":1} + * + * val df = sql("SELECT named_struct('a', 1, 'b', 2) struct_col") + * df.select($"struct_col".dropFields("c")) + * // result: {"a":1,"b":2} + * + * val df = sql("SELECT named_struct('a', 1, 'b', 2, 'c', 3) struct_col") + * df.select($"struct_col".dropFields("b", "c")) + * // result: {"a":1} + * + * val df = sql("SELECT named_struct('a', 1, 'b', 2) struct_col") + * df.select($"struct_col".dropFields("a", "b")) + * // result: org.apache.spark.sql.AnalysisException: [DATATYPE_MISMATCH.CANNOT_DROP_ALL_FIELDS] Cannot resolve "update_fields(struct_col, dropfield(), dropfield())" due to data type mismatch: Cannot drop all fields in struct.; + * + * val df = sql("SELECT CAST(NULL AS struct) struct_col") + * df.select($"struct_col".dropFields("b")) + * // result: null of type struct + * + * val df = sql("SELECT named_struct('a', 1, 'b', 2, 'b', 3) struct_col") + * df.select($"struct_col".dropFields("b")) + * // result: {"a":1} + * + * val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col") + * df.select($"struct_col".dropFields("a.b")) + * // result: {"a":{"a":1}} + * + * val df = sql("SELECT named_struct('a', named_struct('b', 1), 'a', named_struct('c', 2)) struct_col") + * df.select($"struct_col".dropFields("a.c")) + * // result: org.apache.spark.sql.AnalysisException: Ambiguous reference to fields + * }}} + * + * This method supports dropping multiple nested fields directly e.g. + * + * {{{ + * val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col") + * df.select($"struct_col".dropFields("a.b", "a.c")) + * // result: {"a":{"a":1}} + * }}} + * + * However, if you are going to drop multiple nested fields, it is more optimal to extract out + * the nested struct before dropping multiple fields from it e.g. + * + * {{{ + * val df = sql("SELECT named_struct('a', named_struct('a', 1, 'b', 2)) struct_col") + * df.select($"struct_col".withField("a", $"struct_col.a".dropFields("b", "c"))) + * // result: {"a":{"a":1}} + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def dropFields(fieldNames: String*): Column = { + fieldNames.foldLeft(this) { case (column, fieldName) => + Column { builder => + builder.getUpdateFieldsBuilder + .setStructExpression(column.expr) + .setFieldName(fieldName) + } + } + } + + /** + * An expression that gets a field by name in a `StructType`. + * + * @group expr_ops + * @since 3.4.0 + */ + def getField(fieldName: String): Column = apply(fieldName) + + /** + * An expression that returns a substring. + * @param startPos + * expression for the starting position. + * @param len + * expression for the length of the substring. + * + * @group expr_ops + * @since 3.4.0 + */ + def substr(startPos: Column, len: Column): Column = Column.fn("substr", this, startPos, len) + + /** + * An expression that returns a substring. + * @param startPos + * starting position. + * @param len + * length of the substring. + * + * @group expr_ops + * @since 3.4.0 + */ + def substr(startPos: Int, len: Int): Column = substr(lit(startPos), lit(len)) + + /** + * Contains the other element. Returns a boolean column based on a string match. + * + * @group expr_ops + * @since 3.4.0 + */ + def contains(other: Any): Column = fn("contains", other) + + /** + * String starts with. Returns a boolean column based on a string match. 
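A short usage sketch for the field and substring accessors, assuming a DataFrame `files` with a string column `path` and a struct column `meta` that has a `size` field (all illustrative):

{{{
import org.apache.spark.sql.functions.col

files.select(
  col("path").substr(1, 4).as("prefix"),
  col("meta").getField("size").as("size"))
files.filter(col("path").contains("2023"))
}}}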
+ * + * @group expr_ops + * @since 3.4.0 + */ + def startsWith(other: Column): Column = fn("startswith", other) + + /** + * String starts with another string literal. Returns a boolean column based on a string match. + * + * @group expr_ops + * @since 3.4.0 + */ + def startsWith(literal: String): Column = startsWith(lit(literal)) + + /** + * String ends with. Returns a boolean column based on a string match. + * + * @group expr_ops + * @since 3.4.0 + */ + def endsWith(other: Column): Column = fn("endswith", other) + + /** + * String ends with another string literal. Returns a boolean column based on a string match. + * + * @group expr_ops + * @since 3.4.0 + */ + def endsWith(literal: String): Column = endsWith(lit(literal)) + + /** + * Gives the column an alias. Same as `as`. + * {{{ + * // Renames colA to colB in select output. + * df.select($"colA".alias("colB")) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def alias(alias: String): Column = name(alias) + + /** + * Gives the column an alias. + * {{{ + * // Renames colA to colB in select output. + * df.select($"colA".as("colB")) + * }}} + * + * If the current column has metadata associated with it, this metadata will be propagated to + * the new column. If this not desired, use the API `as(alias: String, metadata: Metadata)` with + * explicit metadata. + * + * @group expr_ops + * @since 3.4.0 + */ + def as(alias: String): Column = name(alias) + + /** + * (Scala-specific) Assigns the given aliases to the results of a table generating function. + * {{{ + * // Renames colA to colB in select output. + * df.select(explode($"myMap").as("key" :: "value" :: Nil)) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def as(aliases: Seq[String]): Column = Column { builder => + builder.getAliasBuilder.setExpr(expr).addAllName(aliases.asJava) + } + + /** + * Assigns the given aliases to the results of a table generating function. + * {{{ + * // Renames colA to colB in select output. + * df.select(explode($"myMap").as("key" :: "value" :: Nil)) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def as(aliases: Array[String]): Column = as(aliases.toSeq) + + /** + * Gives the column an alias. + * {{{ + * // Renames colA to colB in select output. + * df.select($"colA".as("colB")) + * }}} + * + * If the current column has metadata associated with it, this metadata will be propagated to + * the new column. If this not desired, use the API `as(alias: String, metadata: Metadata)` with + * explicit metadata. + * + * @group expr_ops + * @since 3.4.0 + */ + def as(alias: Symbol): Column = name(alias.name) + + /** + * Gives the column an alias with metadata. + * {{{ + * val metadata: Metadata = ... + * df.select($"colA".as("colB", metadata)) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def as(alias: String, metadata: Metadata): Column = Column { builder => + builder.getAliasBuilder + .setExpr(expr) + .addName(alias) + .setMetadata(metadata.json) + } + + /** + * Gives the column a name (alias). + * {{{ + * // Renames colA to colB in select output. + * df.select($"colA".name("colB")) + * }}} + * + * If the current column has metadata associated with it, this metadata will be propagated to + * the new column. If this not desired, use the API `as(alias: String, metadata: Metadata)` with + * explicit metadata. + * + * @group expr_ops + * @since 3.4.0 + */ + def name(alias: String): Column = as(alias :: Nil) + + /** + * Casts the column to a different data type. + * {{{ + * // Casts colA to IntegerType. 
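Aliasing with metadata, and multi-column aliases for generator output, look like the following. This assumes a DataFrame `df` with a column `colA` and a map column `myMap`, and that `explode` and `MetadataBuilder` are available as in the classic API:

{{{
import org.apache.spark.sql.functions.{col, explode}
import org.apache.spark.sql.types.MetadataBuilder

val meta = new MetadataBuilder().putString("comment", "renamed for the report").build()
df.select(col("colA").as("colB", meta))                 // alias plus explicit metadata
df.select(explode(col("myMap")).as(Seq("key", "value"))) // one alias per generated column
}}}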
+ * import org.apache.spark.sql.types.IntegerType + * df.select(df("colA").cast(IntegerType)) + * + * // equivalent to + * df.select(df("colA").cast("int")) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def cast(to: DataType): Column = Column { builder => + builder.getCastBuilder + .setExpr(expr) + .setType(DataTypeProtoConverter.toConnectProtoType(to)) + } + + /** + * Casts the column to a different data type, using the canonical string representation of the + * type. The supported types are: `string`, `boolean`, `byte`, `short`, `int`, `long`, `float`, + * `double`, `decimal`, `date`, `timestamp`. + * {{{ + * // Casts colA to integer. + * df.select(df("colA").cast("int")) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def cast(to: String): Column = cast(CatalystSqlParser.parseDataType(to)) + + /** + * Returns a sort expression based on the descending order of the column. + * {{{ + * // Scala + * df.sort(df("age").desc) + * + * // Java + * df.sort(df.col("age").desc()); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def desc: Column = desc_nulls_last + + /** + * Returns a sort expression based on the descending order of the column, and null values appear + * before non-null values. + * {{{ + * // Scala: sort a DataFrame by age column in descending order and null values appearing first. + * df.sort(df("age").desc_nulls_first) + * + * // Java + * df.sort(df.col("age").desc_nulls_first()); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def desc_nulls_first: Column = + buildSortOrder(SortDirection.SORT_DIRECTION_DESCENDING, NullOrdering.SORT_NULLS_FIRST) + + /** + * Returns a sort expression based on the descending order of the column, and null values appear + * after non-null values. + * {{{ + * // Scala: sort a DataFrame by age column in descending order and null values appearing last. + * df.sort(df("age").desc_nulls_last) + * + * // Java + * df.sort(df.col("age").desc_nulls_last()); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def desc_nulls_last: Column = + buildSortOrder(SortDirection.SORT_DIRECTION_DESCENDING, NullOrdering.SORT_NULLS_LAST) + + /** + * Returns a sort expression based on ascending order of the column. + * {{{ + * // Scala: sort a DataFrame by age column in ascending order. + * df.sort(df("age").asc) + * + * // Java + * df.sort(df.col("age").asc()); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def asc: Column = asc_nulls_first + + /** + * Returns a sort expression based on ascending order of the column, and null values return + * before non-null values. + * {{{ + * // Scala: sort a DataFrame by age column in ascending order and null values appearing first. + * df.sort(df("age").asc_nulls_first) + * + * // Java + * df.sort(df.col("age").asc_nulls_first()); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def asc_nulls_first: Column = + buildSortOrder(SortDirection.SORT_DIRECTION_ASCENDING, NullOrdering.SORT_NULLS_FIRST) + + /** + * Returns a sort expression based on ascending order of the column, and null values appear + * after non-null values. + * {{{ + * // Scala: sort a DataFrame by age column in ascending order and null values appearing last. 
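Casting and the null-aware sort expressions combine naturally, for example (assuming a DataFrame `df` with a string column `age` and a nullable double column `score`):

{{{
import org.apache.spark.sql.functions.col

df.select(col("age").cast("int").as("age"), col("score"))
  .sort(col("score").desc_nulls_last, col("age").asc)
}}}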
+ * df.sort(df("age").asc_nulls_last) + * + * // Java + * df.sort(df.col("age").asc_nulls_last()); + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def asc_nulls_last: Column = + buildSortOrder(SortDirection.SORT_DIRECTION_ASCENDING, NullOrdering.SORT_NULLS_LAST) + + private def buildSortOrder(sortDirection: SortDirection, nullOrdering: NullOrdering): Column = { + Column { builder => + builder.getSortOrderBuilder + .setChild(expr) + .setDirection(sortDirection) + .setNullOrdering(nullOrdering) + } + } + + private[sql] def sortOrder: proto.Expression.SortOrder = { + val base = if (expr.hasSortOrder) { + expr + } else { + asc.expr + } + base.getSortOrder + } + + /** + * Prints the expression to the console for debugging purposes. + * + * @group df_ops + * @since 3.4.0 + */ + def explain(extended: Boolean): Unit = { + // scalastyle:off println + if (extended) { + println(expr) + } else { + println(toString) + } + // scalastyle:on println + } + + /** + * Compute bitwise OR of this expression with another expression. + * {{{ + * df.select($"colA".bitwiseOR($"colB")) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def bitwiseOR(other: Any): Column = fn("|", other) + + /** + * Compute bitwise AND of this expression with another expression. + * {{{ + * df.select($"colA".bitwiseAND($"colB")) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def bitwiseAND(other: Any): Column = fn("&", other) + + /** + * Compute bitwise XOR of this expression with another expression. + * {{{ + * df.select($"colA".bitwiseXOR($"colB")) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def bitwiseXOR(other: Any): Column = fn("^", other) + + /** + * Defines a windowing column. + * + * {{{ + * val w = Window.partitionBy("name").orderBy("id") + * df.select( + * sum("price").over(w.rangeBetween(Window.unboundedPreceding, 2)), + * avg("price").over(w.rowsBetween(Window.currentRow, 4)) + * ) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def over(window: expressions.WindowSpec): Column = window.withAggregate(this) + + /** + * Defines an empty analytic clause. In this case the analytic function is applied and presented + * for all rows in the result set. 
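A sketch of `over` with an explicit window, assuming a DataFrame `sales` with columns `name`, `id` and `price`, and that `Window.partitionBy`/`rowsBetween` behave as in the classic API:

{{{
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{avg, col, sum}

val w = Window.partitionBy("name").orderBy("id")
sales.select(
  sum(col("price"))
    .over(w.rowsBetween(Window.unboundedPreceding, Window.currentRow))
    .as("running_total"),
  avg(col("price")).over().as("overall_avg"))  // empty analytic clause: whole result set
}}}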
+ * + * {{{ + * df.select( + * sum("price").over(), + * avg("price").over() + * ) + * }}} + * + * @group expr_ops + * @since 3.4.0 + */ + def over(): Column = over(Window.spec) +} + +private[sql] object Column { + + def apply(name: String): Column = new Column(name) + + def apply(name: String, planId: Option[Long]): Column = new Column(name, planId) + + def nameToExpression(name: String, planId: Option[Long] = None): proto.Expression = { + val builder = proto.Expression.newBuilder() + name match { + case "*" => + builder.getUnresolvedStarBuilder + case _ if name.endsWith(".*") => + builder.getUnresolvedStarBuilder.setUnparsedTarget(name) + case _ => + val attributeBuilder = builder.getUnresolvedAttributeBuilder.setUnparsedIdentifier(name) + planId.foreach(attributeBuilder.setPlanId) + } + builder.build() + } + + private[sql] def apply(f: proto.Expression.Builder => Unit): Column = { + val builder = proto.Expression.newBuilder() + f(builder) + new Column(builder.build()) + } + + @DeveloperApi + def apply(extension: com.google.protobuf.Any): Column = { + apply(_.setExtension(extension)) + } + + private[sql] def fn(name: String, inputs: Column*): Column = { + fn(name, isDistinct = false, inputs: _*) + } + + private[sql] def fn(name: String, isDistinct: Boolean, inputs: Column*): Column = Column { + builder => + builder.getUnresolvedFunctionBuilder + .setFunctionName(name) + .setIsDistinct(isDistinct) + .addAllArguments(inputs.map(_.expr).asJava) + } +} + +/** + * A convenient class used for constructing schema. + * + * @since 3.4.0 + */ +class ColumnName(name: String) extends Column(name) { + + /** + * Creates a new `StructField` of type boolean. + * @since 3.4.0 + */ + def boolean: StructField = StructField(name, BooleanType) + + /** + * Creates a new `StructField` of type byte. + * @since 3.4.0 + */ + def byte: StructField = StructField(name, ByteType) + + /** + * Creates a new `StructField` of type short. + * @since 3.4.0 + */ + def short: StructField = StructField(name, ShortType) + + /** + * Creates a new `StructField` of type int. + * @since 3.4.0 + */ + def int: StructField = StructField(name, IntegerType) + + /** + * Creates a new `StructField` of type long. + * @since 3.4.0 + */ + def long: StructField = StructField(name, LongType) + + /** + * Creates a new `StructField` of type float. + * @since 3.4.0 + */ + def float: StructField = StructField(name, FloatType) + + /** + * Creates a new `StructField` of type double. + * @since 3.4.0 + */ + def double: StructField = StructField(name, DoubleType) + + /** + * Creates a new `StructField` of type string. + * @since 3.4.0 + */ + def string: StructField = StructField(name, StringType) + + /** + * Creates a new `StructField` of type date. + * @since 3.4.0 + */ + def date: StructField = StructField(name, DateType) + + /** + * Creates a new `StructField` of type decimal. + * @since 3.4.0 + */ + def decimal: StructField = StructField(name, DecimalType.USER_DEFAULT) + + /** + * Creates a new `StructField` of type decimal. + * @since 3.4.0 + */ + def decimal(precision: Int, scale: Int): StructField = + StructField(name, DecimalType(precision, scale)) + + /** + * Creates a new `StructField` of type timestamp. + * @since 3.4.0 + */ + def timestamp: StructField = StructField(name, TimestampType) + + /** + * Creates a new `StructField` of type binary. + * @since 3.4.0 + */ + def binary: StructField = StructField(name, BinaryType) + + /** + * Creates a new `StructField` of type array. 
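The `StructField` builders on `ColumnName` give a compact way to spell a schema. A sketch, assuming the `$` string interpolator from `spark.implicits._` is in scope and yields a `ColumnName`, as it does in the classic API:

{{{
import org.apache.spark.sql.types.{StringType, StructType}

// assumes `$` produces a ColumnName; column names are illustrative
val schema = StructType(Seq(
  $"id".long,
  $"name".string,
  $"score".decimal(10, 2),
  $"tags".array(StringType)))
}}}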
+ * @since 3.4.0 + */ + def array(dataType: DataType): StructField = StructField(name, ArrayType(dataType)) + + /** + * Creates a new `StructField` of type map. + * @since 3.4.0 + */ + def map(keyType: DataType, valueType: DataType): StructField = + map(MapType(keyType, valueType)) + + /** + * Creates a new `StructField` of type map. + * @since 3.4.0 + */ + def map(mapType: MapType): StructField = StructField(name, mapType) + + /** + * Creates a new `StructField` of type struct. + * @since 3.4.0 + */ + def struct(fields: StructField*): StructField = struct(StructType(fields)) + + /** + * Creates a new `StructField` of type struct. + * @since 3.4.0 + */ + def struct(structType: StructType): StructField = StructField(name, structType) +} + +/** + * A [[Column]] where an [[Encoder]] has been given for the expected input and return type. To + * create a [[TypedColumn]], use the `as` function on a [[Column]]. + * + * @tparam T + * The input type expected for this expression. Can be `Any` if the expression is type checked + * by the analyzer instead of the compiler (i.e. `expr("sum(...)")`). + * @tparam U + * The output type of this column. + * + * @since 3.4.0 + */ +class TypedColumn[-T, U] private[sql] ( + expr: proto.Expression, + private[sql] val encoder: AgnosticEncoder[U]) + extends Column(expr) { + + /** + * Gives the [[TypedColumn]] a name (alias). If the current `TypedColumn` has metadata + * associated with it, this metadata will be propagated to the new column. + * + * @group expr_ops + * @since 3.4.0 + */ + override def name(alias: String): TypedColumn[T, U] = + new TypedColumn[T, U](super.name(alias).expr, encoder) +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala new file mode 100644 index 0000000000000..17b95018f8986 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameNaFunctions.scala @@ -0,0 +1,441 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.Locale + +import scala.collection.JavaConverters._ + +import org.apache.spark.connect.proto.{NAReplace, Relation} +import org.apache.spark.connect.proto.Expression.{Literal => GLiteral} +import org.apache.spark.connect.proto.NAReplace.Replacement + +/** + * Functionality for working with missing data in `DataFrame`s. + * + * @since 3.4.0 + */ +final class DataFrameNaFunctions private[sql] (sparkSession: SparkSession, root: Relation) { + + /** + * Returns a new `DataFrame` that drops rows containing any null or NaN values. 
+ * + * @since 3.4.0 + */ + def drop(): DataFrame = buildDropDataFrame(None, None) + + /** + * Returns a new `DataFrame` that drops rows containing null or NaN values. + * + * If `how` is "any", then drop rows containing any null or NaN values. If `how` is "all", then + * drop rows only if every column is null or NaN for that row. + * + * @since 3.4.0 + */ + def drop(how: String): DataFrame = { + buildDropDataFrame(None, buildMinNonNulls(how)) + } + + /** + * Returns a new `DataFrame` that drops rows containing any null or NaN values in the specified + * columns. + * + * @since 3.4.0 + */ + def drop(cols: Array[String]): DataFrame = drop(cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that drops rows containing any null or NaN values + * in the specified columns. + * + * @since 3.4.0 + */ + def drop(cols: Seq[String]): DataFrame = buildDropDataFrame(Some(cols), None) + + /** + * Returns a new `DataFrame` that drops rows containing null or NaN values in the specified + * columns. + * + * If `how` is "any", then drop rows containing any null or NaN values in the specified columns. + * If `how` is "all", then drop rows only if every specified column is null or NaN for that row. + * + * @since 3.4.0 + */ + def drop(how: String, cols: Array[String]): DataFrame = drop(how, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that drops rows containing null or NaN values in + * the specified columns. + * + * If `how` is "any", then drop rows containing any null or NaN values in the specified columns. + * If `how` is "all", then drop rows only if every specified column is null or NaN for that row. + * + * @since 3.4.0 + */ + def drop(how: String, cols: Seq[String]): DataFrame = { + buildDropDataFrame(Some(cols), buildMinNonNulls(how)) + } + + /** + * Returns a new `DataFrame` that drops rows containing less than `minNonNulls` non-null and + * non-NaN values. + * + * @since 3.4.0 + */ + def drop(minNonNulls: Int): DataFrame = { + buildDropDataFrame(None, Some(minNonNulls)) + } + + /** + * Returns a new `DataFrame` that drops rows containing less than `minNonNulls` non-null and + * non-NaN values in the specified columns. + * + * @since 3.4.0 + */ + def drop(minNonNulls: Int, cols: Array[String]): DataFrame = drop(minNonNulls, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that drops rows containing less than `minNonNulls` + * non-null and non-NaN values in the specified columns. + * + * @since 3.4.0 + */ + def drop(minNonNulls: Int, cols: Seq[String]): DataFrame = { + buildDropDataFrame(Some(cols), Some(minNonNulls)) + } + + private def buildMinNonNulls(how: String): Option[Int] = { + how.toLowerCase(Locale.ROOT) match { + case "any" => None // No-Op. Do nothing. + case "all" => Some(1) + case _ => throw new IllegalArgumentException(s"how ($how) must be 'any' or 'all'") + } + } + + private def buildDropDataFrame( + cols: Option[Seq[String]], + minNonNulls: Option[Int]): DataFrame = { + sparkSession.newDataFrame { builder => + val dropNaBuilder = builder.getDropNaBuilder.setInput(root) + cols.foreach(c => dropNaBuilder.addAllCols(c.asJava)) + minNonNulls.foreach(dropNaBuilder.setMinNonNulls) + } + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: Long): DataFrame = { + buildFillDataFrame(None, GLiteral.newBuilder().setLong(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. 
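The `drop` overloads differ only in which columns they inspect and how many non-null values a row must keep. For example, assuming a DataFrame `df` with columns `a`, `b` and `c`:

{{{
df.na.drop()                        // drop rows containing any null or NaN value
df.na.drop("all")                   // drop rows only when every column is null or NaN
df.na.drop(Seq("a", "b"))           // restrict the check to columns a and b
df.na.drop(2, Seq("a", "b", "c"))   // keep rows with at least two non-null values among a, b, c
}}}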
If a + * specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Long, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified + * numeric columns. If a specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Long, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), GLiteral.newBuilder().setLong(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in numeric columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: Double): DataFrame = { + buildFillDataFrame(None, GLiteral.newBuilder().setDouble(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null or NaN values in specified numeric columns. If a + * specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Double, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null or NaN values in specified + * numeric columns. If a specified column is not a numeric column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Double, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), GLiteral.newBuilder().setDouble(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null values in string columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: String): DataFrame = { + buildFillDataFrame(None, GLiteral.newBuilder().setString(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null values in specified string columns. If a + * specified column is not a string column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: String, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null values in specified string + * columns. If a specified column is not a string column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: String, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), GLiteral.newBuilder().setString(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null values in boolean columns with `value`. + * + * @since 3.4.0 + */ + def fill(value: Boolean): DataFrame = { + buildFillDataFrame(None, GLiteral.newBuilder().setBoolean(value).build()) + } + + /** + * Returns a new `DataFrame` that replaces null values in specified boolean columns. If a + * specified column is not a boolean column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Boolean, cols: Array[String]): DataFrame = fill(value, cols.toSeq) + + /** + * (Scala-specific) Returns a new `DataFrame` that replaces null values in specified boolean + * columns. If a specified column is not a boolean column, it is ignored. + * + * @since 3.4.0 + */ + def fill(value: Boolean, cols: Seq[String]): DataFrame = { + buildFillDataFrame(Some(cols), GLiteral.newBuilder().setBoolean(value).build()) + } + + private def buildFillDataFrame(cols: Option[Seq[String]], value: GLiteral): DataFrame = { + sparkSession.newDataFrame { builder => + val fillNaBuilder = builder.getFillNaBuilder.setInput(root) + fillNaBuilder.addValues(value) + cols.foreach(c => fillNaBuilder.addAllCols(c.asJava)) + } + } + + /** + * Returns a new `DataFrame` that replaces null values. 
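The scalar `fill` overloads pick their target columns by type, so columns of a different type are simply skipped. A sketch, assuming a DataFrame `df` with a numeric `age`, a string `name` and a boolean `active` column:

{{{
df.na.fill(0L, Seq("age"))          // numeric columns only
df.na.fill("unknown", Seq("name"))  // string columns only
df.na.fill(false, Seq("active"))    // boolean columns only
}}}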
+ * + * The key of the map is the column name, and the value of the map is the replacement value. The + * value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`, + * `Boolean`. Replacement values are cast to the column data type. + * + * For example, the following replaces null values in column "A" with string "unknown", and null + * values in column "B" with numeric value 1.0. + * {{{ + * import com.google.common.collect.ImmutableMap; + * df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0)); + * }}} + * + * @since 3.4.0 + */ + def fill(valueMap: java.util.Map[String, Any]): DataFrame = fillMap(valueMap.asScala.toSeq) + + /** + * Returns a new `DataFrame` that replaces null values. + * + * The key of the map is the column name, and the value of the map is the replacement value. The + * value must be of the following type: `Integer`, `Long`, `Float`, `Double`, `String`, + * `Boolean`. Replacement values are cast to the column data type. + * + * For example, the following replaces null values in column "A" with string "unknown", and null + * values in column "B" with numeric value 1.0. + * {{{ + * import com.google.common.collect.ImmutableMap; + * df.na.fill(ImmutableMap.of("A", "unknown", "B", 1.0)); + * }}} + * + * @since 3.4.0 + */ + def fill(valueMap: Map[String, Any]): DataFrame = fillMap(valueMap.toSeq) + + private def fillMap(values: Seq[(String, Any)]): DataFrame = { + sparkSession.newDataFrame { builder => + val fillNaBuilder = builder.getFillNaBuilder.setInput(root) + values.map { case (colName, replaceValue) => + fillNaBuilder.addCols(colName).addValues(functions.lit(replaceValue).expr.getLiteral) + } + } + } + + /** + * Replaces values matching keys in `replacement` map with the corresponding values. + * + * {{{ + * import com.google.common.collect.ImmutableMap; + * + * // Replaces all occurrences of 1.0 with 2.0 in column "height". + * df.na.replace("height", ImmutableMap.of(1.0, 2.0)); + * + * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". + * df.na.replace("name", ImmutableMap.of("UNKNOWN", "unnamed")); + * + * // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. + * df.na.replace("*", ImmutableMap.of("UNKNOWN", "unnamed")); + * }}} + * + * @param col + * name of the column to apply the value replacement. If `col` is "*", replacement is applied + * on all string, numeric or boolean columns. + * @param replacement + * value replacement map. Key and value of `replacement` map must have the same type, and can + * only be doubles, strings or booleans. The map value can have nulls. + * @since 3.4.0 + */ + def replace[T](col: String, replacement: java.util.Map[T, T]): DataFrame = + replace(col, replacement.asScala.toMap) + + /** + * (Scala-specific) Replaces values matching keys in `replacement` map. + * + * {{{ + * // Replaces all occurrences of 1.0 with 2.0 in column "height". + * df.na.replace("height", Map(1.0 -> 2.0)); + * + * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "name". + * df.na.replace("name", Map("UNKNOWN" -> "unnamed")); + * + * // Replaces all occurrences of "UNKNOWN" with "unnamed" in all string columns. + * df.na.replace("*", Map("UNKNOWN" -> "unnamed")); + * }}} + * + * @param col + * name of the column to apply the value replacement. If `col` is "*", replacement is applied + * on all string, numeric or boolean columns. + * @param replacement + * value replacement map. 
Key and value of `replacement` map must have the same type, and can + * only be doubles, strings or booleans. The map value can have nulls. + * @since 3.4.0 + */ + def replace[T](col: String, replacement: Map[T, T]): DataFrame = { + val cols = if (col != "*") Some(Seq(col)) else None + buildReplaceDataFrame(cols, buildReplacement(replacement)) + } + + /** + * Replaces values matching keys in `replacement` map with the corresponding values. + * + * {{{ + * import com.google.common.collect.ImmutableMap; + * + * // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". + * df.na.replace(new String[] {"height", "weight"}, ImmutableMap.of(1.0, 2.0)); + * + * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". + * df.na.replace(new String[] {"firstname", "lastname"}, ImmutableMap.of("UNKNOWN", "unnamed")); + * }}} + * + * @param cols + * list of columns to apply the value replacement. If `col` is "*", replacement is applied on + * all string, numeric or boolean columns. + * @param replacement + * value replacement map. Key and value of `replacement` map must have the same type, and can + * only be doubles, strings or booleans. The map value can have nulls. + * @since 3.4.0 + */ + def replace[T](cols: Array[String], replacement: java.util.Map[T, T]): DataFrame = { + replace(cols.toSeq, replacement.asScala.toMap) + } + + /** + * (Scala-specific) Replaces values matching keys in `replacement` map. + * + * {{{ + * // Replaces all occurrences of 1.0 with 2.0 in column "height" and "weight". + * df.na.replace("height" :: "weight" :: Nil, Map(1.0 -> 2.0)); + * + * // Replaces all occurrences of "UNKNOWN" with "unnamed" in column "firstname" and "lastname". + * df.na.replace("firstname" :: "lastname" :: Nil, Map("UNKNOWN" -> "unnamed")); + * }}} + * + * @param cols + * list of columns to apply the value replacement. If `col` is "*", replacement is applied on + * all string, numeric or boolean columns. + * @param replacement + * value replacement map. Key and value of `replacement` map must have the same type, and can + * only be doubles, strings or booleans. The map value can have nulls. + * @since 3.4.0 + */ + def replace[T](cols: Seq[String], replacement: Map[T, T]): DataFrame = { + buildReplaceDataFrame(Some(cols), buildReplacement(replacement)) + } + + private def buildReplaceDataFrame( + cols: Option[Seq[String]], + replacements: Iterable[NAReplace.Replacement]): DataFrame = { + sparkSession.newDataFrame { builder => + val replaceBuilder = builder.getReplaceBuilder.setInput(root) + replaceBuilder.addAllReplacements(replacements.asJava) + cols.foreach(c => replaceBuilder.addAllCols(c.asJava)) + } + } + + private def buildReplacement[T](replacement: Map[T, T]): Iterable[NAReplace.Replacement] = { + // Convert the NumericType in replacement map to DoubleType, + // while leaving StringType, BooleanType and null untouched. 
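Putting the `replace` overloads together, assuming a DataFrame `df` with double columns `height` and `weight` plus some string columns (all illustrative):

{{{
df.na.replace("height", Map(1.0 -> 2.0))                 // single column
df.na.replace(Seq("height", "weight"), Map(0.0 -> -1.0)) // several columns at once
df.na.replace("*", Map("UNKNOWN" -> "unnamed"))          // all string columns, since the keys are strings
}}}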
+ val replacementMap: Map[_, _] = replacement.map { + case (k, v: String) => (k, v) + case (k, v: Boolean) => (k, v) + case (k: String, null) => (k, null) + case (k: Boolean, null) => (k, null) + case (k, null) => (convertToDouble(k), null) + case (k, v) => (convertToDouble(k), convertToDouble(v)) + } + replacementMap.map { case (oldValue, newValue) => + Replacement + .newBuilder() + .setOldValue(functions.lit(oldValue).expr.getLiteral) + .setNewValue(functions.lit(newValue).expr.getLiteral) + .build() + } + } + + private def convertToDouble(v: Any): Double = v match { + case v: Float => v.toDouble + case v: Double => v + case v: Long => v.toDouble + case v: Int => v.toDouble + case v => + throw new IllegalArgumentException(s"Unsupported value type ${v.getClass.getName} ($v).") + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala new file mode 100644 index 0000000000000..40f9ac1df2b22 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameReader.scala @@ -0,0 +1,580 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.Properties + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Stable +import org.apache.spark.connect.proto.Parse.ParseFormat +import org.apache.spark.internal.Logging +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.StringEncoder +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} +import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.types.StructType + +/** + * Interface used to load a [[Dataset]] from external storage systems (e.g. file systems, + * key-value stores, etc). Use `SparkSession.read` to access this. + * + * @since 3.4.0 + */ +@Stable +class DataFrameReader private[sql] (sparkSession: SparkSession) extends Logging { + + /** + * Specifies the input data source format. + * + * @since 3.4.0 + */ + def format(source: String): DataFrameReader = { + this.source = source + this + } + + /** + * Specifies the input schema. Some data sources (e.g. JSON) can infer the input schema + * automatically from data. By specifying the schema here, the underlying data source can skip + * the schema inference step, and thus speed up data loading. 
+ * + * @since 3.4.0 + */ + def schema(schema: StructType): DataFrameReader = { + if (schema != null) { + val replaced = CharVarcharUtils.failIfHasCharVarchar(schema).asInstanceOf[StructType] + this.userSpecifiedSchema = Option(replaced) + } + this + } + + /** + * Specifies the schema by using the input DDL-formatted string. Some data sources (e.g. JSON) + * can infer the input schema automatically from data. By specifying the schema here, the + * underlying data source can skip the schema inference step, and thus speed up data loading. + * + * {{{ + * spark.read.schema("a INT, b STRING, c DOUBLE").csv("test.csv") + * }}} + * + * @since 3.4.0 + */ + def schema(schemaString: String): DataFrameReader = { + schema(StructType.fromDDL(schemaString)) + } + + /** + * Adds an input option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: String): DataFrameReader = { + this.extraOptions = this.extraOptions + (key -> value) + this + } + + /** + * Adds an input option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: Boolean): DataFrameReader = option(key, value.toString) + + /** + * Adds an input option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: Long): DataFrameReader = option(key, value.toString) + + /** + * Adds an input option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: Double): DataFrameReader = option(key, value.toString) + + /** + * (Scala-specific) Adds input options for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def options(options: scala.collection.Map[String, String]): DataFrameReader = { + this.extraOptions ++= options + this + } + + /** + * Adds input options for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def options(options: java.util.Map[String, String]): DataFrameReader = { + this.options(options.asScala) + this + } + + /** + * Loads input in as a `DataFrame`, for data sources that don't require a path (e.g. external + * key-value stores). + * + * @since 3.4.0 + */ + def load(): DataFrame = { + load(Seq.empty: _*) // force invocation of `load(...varargs...)` + } + + /** + * Loads input in as a `DataFrame`, for data sources that require a path (e.g. data backed by a + * local or distributed file system). 
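A typical reader pipeline chains `format`, `schema`, `option` and `load`, for example (assuming a SparkSession `spark`; the path and option values are illustrative):

{{{
val df = spark.read
  .format("csv")
  .schema("a INT, b STRING, c DOUBLE")  // skips the schema-inference pass
  .option("header", "true")
  .option("sep", ";")
  .load("/data/input/*.csv")
}}}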
+ * + * @since 3.4.0 + */ + def load(path: String): DataFrame = { + // force invocation of `load(...varargs...)` + load(Seq(path): _*) + } + + /** + * Loads input in as a `DataFrame`, for data sources that support multiple paths. Only works if + * the source is a HadoopFsRelationProvider. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def load(paths: String*): DataFrame = { + sparkSession.newDataFrame { builder => + val dataSourceBuilder = builder.getReadBuilder.getDataSourceBuilder + assertSourceFormatSpecified() + dataSourceBuilder.setFormat(source) + userSpecifiedSchema.foreach(schema => dataSourceBuilder.setSchema(schema.toDDL)) + extraOptions.foreach { case (k, v) => + dataSourceBuilder.putOptions(k, v) + } + paths.foreach(path => dataSourceBuilder.addPaths(path)) + builder.build() + } + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL url named + * table and connection properties. + * + * You can find the JDBC-specific option and parameter documentation for reading tables via JDBC + * in + * Data Source Option in the version you use. + * + * @since 3.4.0 + */ + def jdbc(url: String, table: String, properties: Properties): DataFrame = { + // properties should override settings in extraOptions. + this.extraOptions ++= properties.asScala + // explicit url and dbtable should override all + this.extraOptions ++= Seq("url" -> url, "dbtable" -> table) + format("jdbc").load() + } + + // scalastyle:off line.size.limit + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL url named + * table. Partitions of the table will be retrieved in parallel based on the parameters passed + * to this function. + * + * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash + * your external database systems. + * + * You can find the JDBC-specific option and parameter documentation for reading tables via JDBC + * in + * Data Source Option in the version you use. + * + * @param table + * Name of the table in the external database. + * @param columnName + * Alias of `partitionColumn` option. Refer to `partitionColumn` in + * Data Source Option in the version you use. + * @param connectionProperties + * JDBC database connection arguments, a list of arbitrary string tag/value. Normally at least + * a "user" and "password" property should be included. "fetchsize" can be used to control the + * number of rows per fetch and "queryTimeout" can be used to wait for a Statement object to + * execute to the given number of seconds. + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def jdbc( + url: String, + table: String, + columnName: String, + lowerBound: Long, + upperBound: Long, + numPartitions: Int, + connectionProperties: Properties): DataFrame = { + // columnName, lowerBound, upperBound and numPartitions override settings in extraOptions. + this.extraOptions ++= Map( + "partitionColumn" -> columnName, + "lowerBound" -> lowerBound.toString, + "upperBound" -> upperBound.toString, + "numPartitions" -> numPartitions.toString) + jdbc(url, table, connectionProperties) + } + + /** + * Construct a `DataFrame` representing the database table accessible via JDBC URL url named + * table using connection properties. The `predicates` parameter gives a list expressions + * suitable for inclusion in WHERE clauses; each one defines one partition of the `DataFrame`. 
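For illustration, the two partitioned JDBC read variants might be used as follows (URL, table, bounds, and credentials are placeholders):

{{{
  import java.util.Properties

  val props = new Properties()
  props.setProperty("user", "sa")
  props.setProperty("password", "secret")

  // Eight partitions derived from the numeric column "id" and its bounds.
  val byColumn = spark.read.jdbc(
    "jdbc:postgresql://host/db", "people", "id", 0L, 1000000L, 8, props)

  // One partition per predicate.
  val byPredicates = spark.read.jdbc(
    "jdbc:postgresql://host/db", "people",
    Array("country = 'US'", "country <> 'US'"), props)
}}}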
+ * + * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash + * your external database systems. + * + * You can find the JDBC-specific option and parameter documentation for reading tables via JDBC + * in + * Data Source Option in the version you use. + * + * @param table + * Name of the table in the external database. + * @param predicates + * Condition in the where clause for each partition. + * @param connectionProperties + * JDBC database connection arguments, a list of arbitrary string tag/value. Normally at least + * a "user" and "password" property should be included. "fetchsize" can be used to control the + * number of rows per fetch. + * @since 3.4.0 + */ + def jdbc( + url: String, + table: String, + predicates: Array[String], + connectionProperties: Properties): DataFrame = { + sparkSession.newDataFrame { builder => + val dataSourceBuilder = builder.getReadBuilder.getDataSourceBuilder + format("jdbc") + dataSourceBuilder.setFormat(source) + predicates.foreach(predicate => dataSourceBuilder.addPredicates(predicate)) + this.extraOptions ++= Seq("url" -> url, "dbtable" -> table) + val params = extraOptions ++ connectionProperties.asScala + params.foreach { case (k, v) => + dataSourceBuilder.putOptions(k, v) + } + builder.build() + } + } + + /** + * Loads a JSON file and returns the results as a `DataFrame`. + * + * See the documentation on the overloaded `json()` method with varargs for more details. + * + * @since 3.4.0 + */ + def json(path: String): DataFrame = { + // This method ensures that calls that explicit need single argument works, see SPARK-16009 + json(Seq(path): _*) + } + + /** + * Loads JSON files and returns the results as a `DataFrame`. + * + * JSON Lines (newline-delimited JSON) is supported by + * default. For JSON (one record per file), set the `multiLine` option to true. + * + * This function goes through the input once to determine the input schema. If you know the + * schema in advance, use the version that specifies the schema to avoid the extra scan. + * + * You can find the JSON-specific options for reading JSON files in + * Data Source Option in the version you use. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def json(paths: String*): DataFrame = { + format("json").load(paths: _*) + } + + /** + * Loads a `Dataset[String]` storing JSON objects (JSON Lines + * text format or newline-delimited JSON) and returns the result as a `DataFrame`. + * + * Unless the schema is specified using `schema` function, this function goes through the input + * once to determine the input schema. + * + * @param jsonDataset + * input Dataset with one JSON object per record + * @since 3.4.0 + */ + def json(jsonDataset: Dataset[String]): DataFrame = + parse(jsonDataset, ParseFormat.PARSE_FORMAT_JSON) + + /** + * Loads a CSV file and returns the result as a `DataFrame`. See the documentation on the other + * overloaded `csv()` method for more details. + * + * @since 3.4.0 + */ + def csv(path: String): DataFrame = { + // This method ensures that calls that explicit need single argument works, see SPARK-16009 + csv(Seq(path): _*) + } + + /** + * Loads CSV files and returns the result as a `DataFrame`. + * + * This function will go through the input once to determine the input schema if `inferSchema` + * is enabled. To avoid going through the entire data once, disable `inferSchema` option or + * specify the schema explicitly using `schema`. 
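A short sketch of the trade-off described above (the path and DDL schema are illustrative):

{{{
  // One extra pass over the data to infer column types.
  val inferred = spark.read
    .option("header", true)
    .option("inferSchema", true)
    .csv("/tmp/data.csv")

  // No inference pass: the schema is declared up front as DDL.
  val declared = spark.read
    .option("header", true)
    .schema("a INT, b STRING, c DOUBLE")
    .csv("/tmp/data.csv")
}}}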
+ * + * You can find the CSV-specific options for reading CSV files in + * Data Source Option in the version you use. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def csv(paths: String*): DataFrame = format("csv").load(paths: _*) + + /** + * Loads an `Dataset[String]` storing CSV rows and returns the result as a `DataFrame`. + * + * If the schema is not specified using `schema` function and `inferSchema` option is enabled, + * this function goes through the input once to determine the input schema. + * + * If the schema is not specified using `schema` function and `inferSchema` option is disabled, + * it determines the columns as string types and it reads only the first line to determine the + * names and the number of fields. + * + * If the enforceSchema is set to `false`, only the CSV header in the first line is checked to + * conform specified or inferred schema. + * + * @note + * if `header` option is set to `true` when calling this API, all lines same with the header + * will be removed if exists. + * @param csvDataset + * input Dataset with one CSV row per record + * @since 3.4.0 + */ + def csv(csvDataset: Dataset[String]): DataFrame = + parse(csvDataset, ParseFormat.PARSE_FORMAT_CSV) + + /** + * Loads a Parquet file, returning the result as a `DataFrame`. See the documentation on the + * other overloaded `parquet()` method for more details. + * + * @since 3.4.0 + */ + def parquet(path: String): DataFrame = { + // This method ensures that calls that explicit need single argument works, see SPARK-16009 + parquet(Seq(path): _*) + } + + /** + * Loads a Parquet file, returning the result as a `DataFrame`. + * + * Parquet-specific option(s) for reading Parquet files can be found in Data + * Source Option in the version you use. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def parquet(paths: String*): DataFrame = { + format("parquet").load(paths: _*) + } + + /** + * Loads an ORC file and returns the result as a `DataFrame`. + * + * @param path + * input path + * @since 3.4.0 + */ + def orc(path: String): DataFrame = { + // This method ensures that calls that explicit need single argument works, see SPARK-16009 + orc(Seq(path): _*) + } + + /** + * Loads ORC files and returns the result as a `DataFrame`. + * + * ORC-specific option(s) for reading ORC files can be found in Data + * Source Option in the version you use. + * + * @param paths + * input paths + * @since 3.4.0 + */ + @scala.annotation.varargs + def orc(paths: String*): DataFrame = format("orc").load(paths: _*) + + /** + * Returns the specified table/view as a `DataFrame`. If it's a table, it must support batch + * reading and the returned DataFrame is the batch scan query plan of this table. If it's a + * view, the returned DataFrame is simply the query plan of the view, which can either be a + * batch or streaming query plan. + * + * @param tableName + * is either a qualified or unqualified name that designates a table or view. If a database is + * specified, it identifies the table/view from the database. Otherwise, it first attempts to + * find a temporary view with the given name and then match the table/view from the current + * database. Note that, the global temporary view database is also valid here. 
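For illustration, the columnar readers and `table` might be used as follows (paths and table name are illustrative):

{{{
  val events = spark.read.parquet("/tmp/events.parquet")
  val logs = spark.read.orc("/tmp/logs.orc")

  // Batch scan of a catalog table (or the query plan of a view).
  val users = spark.read.table("mydb.users")
}}}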
+ * @since 3.4.0 + */ + def table(tableName: String): DataFrame = { + sparkSession.newDataFrame { builder => + builder.getReadBuilder.getNamedTableBuilder + .setUnparsedIdentifier(tableName) + .putAllOptions(extraOptions.toMap.asJava) + } + } + + /** + * Loads text files and returns a `DataFrame` whose schema starts with a string column named + * "value", and followed by partitioned columns if there are any. See the documentation on the + * other overloaded `text()` method for more details. + * + * @since 3.4.0 + */ + def text(path: String): DataFrame = { + // This method ensures that calls that explicit need single argument works, see SPARK-16009 + text(Seq(path): _*) + } + + /** + * Loads text files and returns a `DataFrame` whose schema starts with a string column named + * "value", and followed by partitioned columns if there are any. The text files must be encoded + * as UTF-8. + * + * By default, each line in the text files is a new row in the resulting DataFrame. For example: + * {{{ + * // Scala: + * spark.read.text("/path/to/spark/README.md") + * + * // Java: + * spark.read().text("/path/to/spark/README.md") + * }}} + * + * You can find the text-specific options for reading text files in + * Data Source Option in the version you use. + * + * @param paths + * input paths + * @since 3.4.0 + */ + @scala.annotation.varargs + def text(paths: String*): DataFrame = format("text").load(paths: _*) + + /** + * Loads text files and returns a [[Dataset]] of String. See the documentation on the other + * overloaded `textFile()` method for more details. + * @since 3.4.0 + */ + def textFile(path: String): Dataset[String] = { + // This method ensures that calls that explicit need single argument works, see SPARK-16009 + textFile(Seq(path): _*) + } + + /** + * Loads text files and returns a [[Dataset]] of String. The underlying schema of the Dataset + * contains a single string column named "value". The text files must be encoded as UTF-8. + * + * If the directory structure of the text files contains partitioning information, those are + * ignored in the resulting Dataset. To include partitioning information as columns, use `text`. + * + * By default, each line in the text files is a new row in the resulting DataFrame. For example: + * {{{ + * // Scala: + * spark.read.textFile("/path/to/spark/README.md") + * + * // Java: + * spark.read().textFile("/path/to/spark/README.md") + * }}} + * + * You can set the text-specific options as specified in `DataFrameReader.text`. + * + * @param paths + * input path + * @since 3.4.0 + */ + @scala.annotation.varargs + def textFile(paths: String*): Dataset[String] = { + assertNoSpecifiedSchema("textFile") + text(paths: _*).select("value").as(StringEncoder) + } + + private def assertSourceFormatSpecified(): Unit = { + if (source == null) { + throw new IllegalArgumentException("The source format must be specified.") + } + } + + private def parse(ds: Dataset[String], format: ParseFormat): DataFrame = { + sparkSession.newDataFrame { builder => + val parseBuilder = builder.getParseBuilder + .setInput(ds.plan.getRoot) + .setFormat(format) + userSpecifiedSchema.foreach(schema => + parseBuilder.setSchema(DataTypeProtoConverter.toConnectProtoType(schema))) + extraOptions.foreach { case (k, v) => + parseBuilder.putOptions(k, v) + } + } + } + + /** + * A convenient function for schema validation in APIs. 
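For illustration, the `text` and `textFile` variants described above differ as follows (the path is illustrative; `textFile` relies on the schema check below):

{{{
  // DataFrame with a single string column named "value" (plus partition columns, if any).
  val lines = spark.read.text("/tmp/notes.txt")

  // Dataset[String]; no user-specified schema may be set beforehand.
  val strings = spark.read.textFile("/tmp/notes.txt")
}}}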
+ */ + private def assertNoSpecifiedSchema(operation: String): Unit = { + if (userSpecifiedSchema.nonEmpty) { + throw QueryCompilationErrors.userSpecifiedSchemaUnsupportedError(operation) + } + } + + /////////////////////////////////////////////////////////////////////////////////////// + // Builder pattern config options + /////////////////////////////////////////////////////////////////////////////////////// + + private var source: String = _ + + private var userSpecifiedSchema: Option[StructType] = None + + private var extraOptions = CaseInsensitiveMap[String](Map.empty) + +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala new file mode 100644 index 0000000000000..0d4372b8738ee --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameStatFunctions.scala @@ -0,0 +1,592 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.{lang => jl, util => ju} + +import scala.collection.JavaConverters._ + +import org.apache.spark.connect.proto.{Relation, StatSampleBy} +import org.apache.spark.sql.DataFrameStatFunctions.approxQuantileResultEncoder +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{ArrayEncoder, BinaryEncoder, PrimitiveDoubleEncoder} +import org.apache.spark.sql.functions.lit +import org.apache.spark.util.sketch.CountMinSketch + +/** + * Statistic functions for `DataFrame`s. + * + * @since 3.4.0 + */ +final class DataFrameStatFunctions private[sql] (sparkSession: SparkSession, root: Relation) { + + /** + * Calculates the approximate quantiles of a numerical column of a DataFrame. + * + * The result of this algorithm has the following deterministic bound: If the DataFrame has N + * elements and if we request the quantile at probability `p` up to error `err`, then the + * algorithm will return a sample `x` from the DataFrame so that the *exact* rank of `x` is + * close to (p * N). More precisely, + * + * {{{ + * floor((p - err) * N) <= rank(x) <= ceil((p + err) * N) + * }}} + * + * This method implements a variation of the Greenwald-Khanna algorithm (with some speed + * optimizations). The algorithm was first present in Space-efficient Online Computation of Quantile + * Summaries by Greenwald and Khanna. + * + * @param col + * the name of the numerical column + * @param probabilities + * a list of quantile probabilities Each number must belong to [0, 1]. For example 0 is the + * minimum, 0.5 is the median, 1 is the maximum. + * @param relativeError + * The relative target precision to achieve (greater than or equal to 0). 
If set to zero, the + * exact quantiles are computed, which could be very expensive. Note that values greater than + * 1 are accepted but give the same result as 1. + * @return + * the approximate quantiles at the given probabilities + * + * @note + * null and NaN values will be removed from the numerical column before calculation. If the + * dataframe is empty or the column only contains null or NaN, an empty array is returned. + * + * @since 3.4.0 + */ + def approxQuantile( + col: String, + probabilities: Array[Double], + relativeError: Double): Array[Double] = { + approxQuantile(Array(col), probabilities, relativeError).head + } + + /** + * Calculates the approximate quantiles of numerical columns of a DataFrame. + * @see + * `approxQuantile(col:Str* approxQuantile)` for detailed description. + * + * @param cols + * the names of the numerical columns + * @param probabilities + * a list of quantile probabilities Each number must belong to [0, 1]. For example 0 is the + * minimum, 0.5 is the median, 1 is the maximum. + * @param relativeError + * The relative target precision to achieve (greater than or equal to 0). If set to zero, the + * exact quantiles are computed, which could be very expensive. Note that values greater than + * 1 are accepted but give the same result as 1. + * @return + * the approximate quantiles at the given probabilities of each column + * + * @note + * null and NaN values will be ignored in numerical columns before calculation. For columns + * only containing null or NaN values, an empty array is returned. + * + * @since 3.4.0 + */ + def approxQuantile( + cols: Array[String], + probabilities: Array[Double], + relativeError: Double): Array[Array[Double]] = { + require( + probabilities.forall(p => p >= 0.0 && p <= 1.0), + "percentile should be in the range [0.0, 1.0]") + require(relativeError >= 0, s"Relative Error must be non-negative but got $relativeError") + sparkSession + .newDataset(approxQuantileResultEncoder) { builder => + val approxQuantileBuilder = builder.getApproxQuantileBuilder + .setInput(root) + .setRelativeError(relativeError) + cols.foreach(approxQuantileBuilder.addCols) + probabilities.foreach(approxQuantileBuilder.addProbabilities) + } + .head() + } + + /** + * Calculate the sample covariance of two numerical columns of a DataFrame. + * @param col1 + * the name of the first column + * @param col2 + * the name of the second column + * @return + * the covariance of the two columns. + * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.cov("rand1", "rand2") + * res1: Double = 0.065... + * }}} + * + * @since 3.4.0 + */ + def cov(col1: String, col2: String): Double = { + sparkSession + .newDataset(PrimitiveDoubleEncoder) { builder => + builder.getCovBuilder.setInput(root).setCol1(col1).setCol2(col2) + } + .head() + } + + /** + * Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson + * Correlation Coefficient. For Spearman Correlation, consider using RDD methods found in + * MLlib's Statistics. + * + * @param col1 + * the name of the column + * @param col2 + * the name of the column to calculate the correlation against + * @return + * The Pearson Correlation Coefficient as a Double. + * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2") + * res1: Double = 0.613... 
+ * }}} + * + * @since 3.4.0 + */ + def corr(col1: String, col2: String, method: String): Double = { + require( + method == "pearson", + "Currently only the calculation of the Pearson Correlation " + + "coefficient is supported.") + sparkSession + .newDataset(PrimitiveDoubleEncoder) { builder => + builder.getCorrBuilder.setInput(root).setCol1(col1).setCol2(col2) + } + .head() + } + + /** + * Calculates the Pearson Correlation Coefficient of two columns of a DataFrame. + * + * @param col1 + * the name of the column + * @param col2 + * the name of the column to calculate the correlation against + * @return + * The Pearson Correlation Coefficient as a Double. + * + * {{{ + * val df = sc.parallelize(0 until 10).toDF("id").withColumn("rand1", rand(seed=10)) + * .withColumn("rand2", rand(seed=27)) + * df.stat.corr("rand1", "rand2", "pearson") + * res1: Double = 0.613... + * }}} + * + * @since 3.4.0 + */ + def corr(col1: String, col2: String): Double = { + corr(col1, col2, "pearson") + } + + /** + * Computes a pair-wise frequency table of the given columns. Also known as a contingency table. + * The first column of each row will be the distinct values of `col1` and the column names will + * be the distinct values of `col2`. The name of the first column will be `col1_col2`. Counts + * will be returned as `Long`s. Pairs that have no occurrences will have zero as their counts. + * Null elements will be replaced by "null", and back ticks will be dropped from elements if + * they exist. + * + * @param col1 + * The name of the first column. Distinct items will make the first item of each row. + * @param col2 + * The name of the second column. Distinct items will make the column names of the DataFrame. + * @return + * A DataFrame containing for the contingency table. + * + * {{{ + * val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), (3, 3))) + * .toDF("key", "value") + * val ct = df.stat.crosstab("key", "value") + * ct.show() + * +---------+---+---+---+ + * |key_value| 1| 2| 3| + * +---------+---+---+---+ + * | 2| 2| 0| 1| + * | 1| 1| 1| 0| + * | 3| 0| 1| 1| + * +---------+---+---+---+ + * }}} + * + * @since 3.4.0 + */ + def crosstab(col1: String, col2: String): DataFrame = { + sparkSession.newDataFrame { builder => + builder.getCrosstabBuilder.setInput(root).setCol1(col1).setCol2(col2) + } + } + + /** + * Finding frequent items for columns, possibly with false positives. Using the frequent element + * count algorithm described in here, + * proposed by Karp, Schenker, and Papadimitriou. The `support` should be greater than 1e-4. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting `DataFrame`. + * + * @param cols + * the names of the columns to search frequent items in. + * @param support + * The minimum frequency for an item to be considered `frequent`. Should be greater than 1e-4. + * @return + * A Local DataFrame with the Array of frequent items for each column. 
+ * + * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = spark.createDataFrame(rows).toDF("a", "b") + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Array("a", "b"), 0.4) + * freqSingles.show() + * +-----------+-------------+ + * |a_freqItems| b_freqItems| + * +-----------+-------------+ + * | [1, 99]|[-1.0, -99.0]| + * +-----------+-------------+ + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Array("a-b"), 0.1) + * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show() + * +----------+ + * | freq_ab| + * +----------+ + * | [1,-1.0]| + * | ... | + * +----------+ + * }}} + * + * @since 3.4.0 + */ + def freqItems(cols: Array[String], support: Double): DataFrame = { + sparkSession.newDataFrame { builder => + val freqItemsBuilder = builder.getFreqItemsBuilder.setInput(root).setSupport(support) + cols.foreach(freqItemsBuilder.addCols) + } + } + + /** + * Finding frequent items for columns, possibly with false positives. Using the frequent element + * count algorithm described in here, + * proposed by Karp, Schenker, and Papadimitriou. Uses a `default` support of 1%. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting `DataFrame`. + * + * @param cols + * the names of the columns to search frequent items in. + * @return + * A Local DataFrame with the Array of frequent items for each column. + * + * @since 3.4.0 + */ + def freqItems(cols: Array[String]): DataFrame = { + freqItems(cols, 0.01) + } + + /** + * (Scala-specific) Finding frequent items for columns, possibly with false positives. Using the + * frequent element count algorithm described in here, proposed by Karp, Schenker, and + * Papadimitriou. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting `DataFrame`. + * + * @param cols + * the names of the columns to search frequent items in. + * @return + * A Local DataFrame with the Array of frequent items for each column. + * + * {{{ + * val rows = Seq.tabulate(100) { i => + * if (i % 2 == 0) (1, -1.0) else (i, i * -1.0) + * } + * val df = spark.createDataFrame(rows).toDF("a", "b") + * // find the items with a frequency greater than 0.4 (observed 40% of the time) for columns + * // "a" and "b" + * val freqSingles = df.stat.freqItems(Seq("a", "b"), 0.4) + * freqSingles.show() + * +-----------+-------------+ + * |a_freqItems| b_freqItems| + * +-----------+-------------+ + * | [1, 99]|[-1.0, -99.0]| + * +-----------+-------------+ + * // find the pair of items with a frequency greater than 0.1 in columns "a" and "b" + * val pairDf = df.select(struct("a", "b").as("a-b")) + * val freqPairs = pairDf.stat.freqItems(Seq("a-b"), 0.1) + * freqPairs.select(explode($"a-b_freqItems").as("freq_ab")).show() + * +----------+ + * | freq_ab| + * +----------+ + * | [1,-1.0]| + * | ... | + * +----------+ + * }}} + * + * @since 3.4.0 + */ + def freqItems(cols: Seq[String], support: Double): DataFrame = { + freqItems(cols.toArray, support) + } + + /** + * (Scala-specific) Finding frequent items for columns, possibly with false positives. 
Using the + * frequent element count algorithm described in here, proposed by Karp, Schenker, and + * Papadimitriou. Uses a `default` support of 1%. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting `DataFrame`. + * + * @param cols + * the names of the columns to search frequent items in. + * @return + * A Local DataFrame with the Array of frequent items for each column. + * + * @since 3.4.0 + */ + def freqItems(cols: Seq[String]): DataFrame = { + freqItems(cols.toArray, 0.01) + } + + /** + * Returns a stratified sample without replacement based on the fraction given on each stratum. + * @param col + * column that defines strata + * @param fractions + * sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as + * zero. + * @param seed + * random seed + * @tparam T + * stratum type + * @return + * a new `DataFrame` that represents the stratified sample + * + * {{{ + * val df = spark.createDataFrame(Seq((1, 1), (1, 2), (2, 1), (2, 1), (2, 3), (3, 2), + * (3, 3))).toDF("key", "value") + * val fractions = Map(1 -> 1.0, 3 -> 0.5) + * df.stat.sampleBy("key", fractions, 36L).show() + * +---+-----+ + * |key|value| + * +---+-----+ + * | 1| 1| + * | 1| 2| + * | 3| 2| + * +---+-----+ + * }}} + * + * @since 3.4.0 + */ + def sampleBy[T](col: String, fractions: Map[T, Double], seed: Long): DataFrame = { + sampleBy(Column(col), fractions, seed) + } + + /** + * Returns a stratified sample without replacement based on the fraction given on each stratum. + * @param col + * column that defines strata + * @param fractions + * sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as + * zero. + * @param seed + * random seed + * @tparam T + * stratum type + * @return + * a new `DataFrame` that represents the stratified sample + * + * @since 3.4.0 + */ + def sampleBy[T](col: String, fractions: ju.Map[T, jl.Double], seed: Long): DataFrame = { + sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed) + } + + /** + * Returns a stratified sample without replacement based on the fraction given on each stratum. + * @param col + * column that defines strata + * @param fractions + * sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as + * zero. 
+ * @param seed + * random seed + * @tparam T + * stratum type + * @return + * a new `DataFrame` that represents the stratified sample + * + * The stratified sample can be performed over multiple columns: + * {{{ + * import org.apache.spark.sql.Row + * import org.apache.spark.sql.functions.struct + * + * val df = spark.createDataFrame(Seq(("Bob", 17), ("Alice", 10), ("Nico", 8), ("Bob", 17), + * ("Alice", 10))).toDF("name", "age") + * val fractions = Map(Row("Alice", 10) -> 0.3, Row("Nico", 8) -> 1.0) + * df.stat.sampleBy(struct($"name", $"age"), fractions, 36L).show() + * +-----+---+ + * | name|age| + * +-----+---+ + * | Nico| 8| + * |Alice| 10| + * +-----+---+ + * }}} + * + * @since 3.4.0 + */ + def sampleBy[T](col: Column, fractions: Map[T, Double], seed: Long): DataFrame = { + require( + fractions.values.forall(p => p >= 0.0 && p <= 1.0), + s"Fractions must be in [0, 1], but got $fractions.") + sparkSession.newDataFrame { builder => + val sampleByBuilder = builder.getSampleByBuilder + .setInput(root) + .setCol(col.expr) + .setSeed(seed) + fractions.foreach { case (k, v) => + sampleByBuilder.addFractions( + StatSampleBy.Fraction + .newBuilder() + .setStratum(lit(k).expr.getLiteral) + .setFraction(v)) + } + } + } + + /** + * (Java-specific) Returns a stratified sample without replacement based on the fraction given + * on each stratum. + * @param col + * column that defines strata + * @param fractions + * sampling fraction for each stratum. If a stratum is not specified, we treat its fraction as + * zero. + * @param seed + * random seed + * @tparam T + * stratum type + * @return + * a new `DataFrame` that represents the stratified sample + * + * @since 3.4.0 + */ + def sampleBy[T](col: Column, fractions: ju.Map[T, jl.Double], seed: Long): DataFrame = { + sampleBy(col, fractions.asScala.toMap.asInstanceOf[Map[T, Double]], seed) + } + + /** + * Builds a Count-min Sketch over a specified column. + * + * @param colName + * name of the column over which the sketch is built + * @param depth + * depth of the sketch + * @param width + * width of the sketch + * @param seed + * random seed + * @return + * a `CountMinSketch` over column `colName` + * @since 3.4.0 + */ + def countMinSketch(colName: String, depth: Int, width: Int, seed: Int): CountMinSketch = { + countMinSketch(Column(colName), depth, width, seed) + } + + /** + * Builds a Count-min Sketch over a specified column. + * + * @param colName + * name of the column over which the sketch is built + * @param eps + * relative error of the sketch + * @param confidence + * confidence of the sketch + * @param seed + * random seed + * @return + * a `CountMinSketch` over column `colName` + * @since 3.4.0 + */ + def countMinSketch( + colName: String, + eps: Double, + confidence: Double, + seed: Int): CountMinSketch = { + countMinSketch(Column(colName), eps, confidence, seed) + } + + /** + * Builds a Count-min Sketch over a specified column. + * + * @param col + * the column over which the sketch is built + * @param depth + * depth of the sketch + * @param width + * width of the sketch + * @param seed + * random seed + * @return + * a `CountMinSketch` over column `colName` + * @since 3.4.0 + */ + def countMinSketch(col: Column, depth: Int, width: Int, seed: Int): CountMinSketch = { + countMinSketch(col, eps = 2.0 / width, confidence = 1 - 1 / Math.pow(2, depth), seed) + } + + /** + * Builds a Count-min Sketch over a specified column. 
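A brief usage sketch of these sketch builders, where `df` is any DataFrame and the column name and parameters are illustrative:

{{{
  // depth/width variant (depth = 10, width = 2000, seed = 42);
  // eps = 2.0 / width, confidence = 1 - 1 / 2^depth.
  val sketch = df.stat.countMinSketch("id", 10, 2000, 42)
  val estimate = sketch.estimateCount(1L)

  // eps/confidence variant (eps = 0.001, confidence = 0.99, seed = 42).
  val sketch2 = df.stat.countMinSketch("id", 0.001, 0.99, 42)
}}}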
+ * + * @param col + * the column over which the sketch is built + * @param eps + * relative error of the sketch + * @param confidence + * confidence of the sketch + * @param seed + * random seed + * @return + * a `CountMinSketch` over column `colName` + * @since 3.4.0 + */ + def countMinSketch(col: Column, eps: Double, confidence: Double, seed: Int): CountMinSketch = { + val agg = Column.fn("count_min_sketch", col, lit(eps), lit(confidence), lit(seed)) + val ds = sparkSession.newDataset(BinaryEncoder) { builder => + builder.getProjectBuilder + .setInput(root) + .addExpressions(agg.expr) + } + CountMinSketch.readFrom(ds.head()) + } +} + +private object DataFrameStatFunctions { + private val approxQuantileResultEncoder: ArrayEncoder[Array[Double]] = + ArrayEncoder(ArrayEncoder(PrimitiveDoubleEncoder, containsNull = false), containsNull = false) +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala new file mode 100644 index 0000000000000..b9d1fefb105e8 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriter.scala @@ -0,0 +1,490 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.{Locale, Properties} + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Stable +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap + +/** + * Interface used to write a [[Dataset]] to external storage systems (e.g. file systems, key-value + * stores, etc). Use `Dataset.write` to access this. + * + * @since 3.4.0 + */ +@Stable +final class DataFrameWriter[T] private[sql] (ds: Dataset[T]) { + + /** + * Specifies the behavior when data or table already exists. Options include:
  • `SaveMode.Overwrite`: overwrite the existing data.
  • `SaveMode.Append`: append the data.
  • `SaveMode.Ignore`: ignore the operation (i.e. no-op).
  • `SaveMode.ErrorIfExists`: throw an exception at runtime.
The default + * option is `ErrorIfExists`. + * + * @since 3.4.0 + */ + def mode(saveMode: SaveMode): DataFrameWriter[T] = { + this.mode = saveMode + this + } + + /** + * Specifies the behavior when data or table already exists. Options include:
  • `overwrite`: overwrite the existing data.
  • `append`: append the data.
  • `ignore`: ignore the operation (i.e. no-op).
  • `error` or `errorifexists`: the default option, throw an exception at runtime.
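For illustration, a typical write with an explicit mode might look like this (the format, option, and path are placeholders):

{{{
  df.write
    .format("parquet")
    .mode("overwrite") // equivalent to .mode(SaveMode.Overwrite)
    .option("compression", "snappy")
    .save("/tmp/out/events")
}}}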
+ * + * @since 3.4.0 + */ + def mode(saveMode: String): DataFrameWriter[T] = { + saveMode.toLowerCase(Locale.ROOT) match { + case "overwrite" => mode(SaveMode.Overwrite) + case "append" => mode(SaveMode.Append) + case "ignore" => mode(SaveMode.Ignore) + case "error" | "errorifexists" | "default" => mode(SaveMode.ErrorIfExists) + case _ => + throw new IllegalArgumentException(s"Unknown save mode: $saveMode. Accepted " + + "save modes are 'overwrite', 'append', 'ignore', 'error', 'errorifexists', 'default'.") + } + } + + /** + * Specifies the underlying output data source. Built-in options include "parquet", "json", etc. + * + * @since 3.4.0 + */ + def format(source: String): DataFrameWriter[T] = { + this.source = Some(source) + this + } + + /** + * Adds an output option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: String): DataFrameWriter[T] = { + this.extraOptions = this.extraOptions + (key -> value) + this + } + + /** + * Adds an output option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: Boolean): DataFrameWriter[T] = option(key, value.toString) + + /** + * Adds an output option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: Long): DataFrameWriter[T] = option(key, value.toString) + + /** + * Adds an output option for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def option(key: String, value: Double): DataFrameWriter[T] = option(key, value.toString) + + /** + * (Scala-specific) Adds output options for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def options(options: scala.collection.Map[String, String]): DataFrameWriter[T] = { + this.extraOptions ++= options + this + } + + /** + * Adds output options for the underlying data source. + * + * All options are maintained in a case-insensitive way in terms of key names. If a new option + * has the same key case-insensitively, it will override the existing option. + * + * @since 3.4.0 + */ + def options(options: java.util.Map[String, String]): DataFrameWriter[T] = { + this.options(options.asScala) + this + } + + /** + * Partitions the output by the given columns on the file system. If specified, the output is + * laid out on the file system similar to Hive's partitioning scheme. As an example, when we + * partition a dataset by year and then month, the directory layout would look like:
  • year=2016/month=01/
  • year=2016/month=02/
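As a sketch of how partitioning, bucketing, and sorting combine (column and table names are illustrative; as in the existing writer API, bucketed output is only expected to work with `saveAsTable`):

{{{
  df.write
    .partitionBy("year", "month")
    .bucketBy(16, "user_id")
    .sortBy("user_id")
    .mode("append")
    .saveAsTable("events")
}}}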
+ * + * Partitioning is one of the most widely used techniques to optimize physical data layout. It + * provides a coarse-grained index for skipping unnecessary data reads when queries have + * predicates on the partitioned columns. In order for partitioning to work well, the number of + * distinct values in each column should typically be less than tens of thousands. + * + * This is applicable for all file-based data sources (e.g. Parquet, JSON) starting with Spark + * 2.1.0. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def partitionBy(colNames: String*): DataFrameWriter[T] = { + this.partitioningColumns = Option(colNames) + this + } + + /** + * Buckets the output by the given columns. If specified, the output is laid out on the file + * system similar to Hive's bucketing scheme, but with a different bucket hash function and is + * not compatible with Hive's bucketing. + * + * This is applicable for all file-based data sources (e.g. Parquet, JSON) starting with Spark + * 2.1.0. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def bucketBy(numBuckets: Int, colName: String, colNames: String*): DataFrameWriter[T] = { + require(numBuckets > 0, "The numBuckets should be > 0.") + this.numBuckets = Option(numBuckets) + this.bucketColumnNames = Option(colName +: colNames) + this + } + + /** + * Sorts the output in each bucket by the given columns. + * + * This is applicable for all file-based data sources (e.g. Parquet, JSON) starting with Spark + * 2.1.0. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def sortBy(colName: String, colNames: String*): DataFrameWriter[T] = { + this.sortColumnNames = Option(colName +: colNames) + this + } + + /** + * Saves the content of the `DataFrame` at the specified path. + * + * @since 3.4.0 + */ + def save(path: String): Unit = { + saveInternal(Some(path)) + } + + /** + * Saves the content of the `DataFrame` as the specified table. + * + * @since 3.4.0 + */ + def save(): Unit = saveInternal(None) + + private def saveInternal(path: Option[String]): Unit = { + executeWriteOperation(builder => path.foreach(builder.setPath)) + } + + private def executeWriteOperation(f: proto.WriteOperation.Builder => Unit): Unit = { + val builder = proto.WriteOperation.newBuilder() + + builder.setInput(ds.plan.getRoot) + + // Set path or table + f(builder) + + // Cannot both be set + require(!(builder.hasPath && builder.hasTable)) + + builder.setMode(mode match { + case SaveMode.Append => proto.WriteOperation.SaveMode.SAVE_MODE_APPEND + case SaveMode.Overwrite => proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE + case SaveMode.Ignore => proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE + case SaveMode.ErrorIfExists => proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS + }) + + source.foreach(builder.setSource) + sortColumnNames.foreach(names => builder.addAllSortColumnNames(names.asJava)) + partitioningColumns.foreach(cols => builder.addAllPartitioningColumns(cols.asJava)) + + numBuckets.foreach(n => { + val bucketBuilder = proto.WriteOperation.BucketBy.newBuilder() + bucketBuilder.setNumBuckets(n) + bucketColumnNames.foreach(names => bucketBuilder.addAllBucketColumnNames(names.asJava)) + builder.setBucketBy(bucketBuilder) + }) + + extraOptions.foreach { case (k, v) => + builder.putOptions(k, v) + } + + ds.sparkSession.execute(proto.Command.newBuilder().setWriteOperation(builder).build()) + } + + /** + * Inserts the content of the `DataFrame` to the specified table. 
It requires that the schema of + * the `DataFrame` is the same as the schema of the table. + * + * @note + * Unlike `saveAsTable`, `insertInto` ignores the column names and just uses position-based + * resolution. For example: + * + * @note + * SaveMode.ErrorIfExists and SaveMode.Ignore behave as SaveMode.Append in `insertInto` as + * `insertInto` is not a table creating operation. + * + * {{{ + * scala> Seq((1, 2)).toDF("i", "j").write.mode("overwrite").saveAsTable("t1") + * scala> Seq((3, 4)).toDF("j", "i").write.insertInto("t1") + * scala> Seq((5, 6)).toDF("a", "b").write.insertInto("t1") + * scala> sql("select * from t1").show + * +---+---+ + * | i| j| + * +---+---+ + * | 5| 6| + * | 3| 4| + * | 1| 2| + * +---+---+ + * }}} + * + * Because it inserts data to an existing table, format or options will be ignored. + * + * @since 3.4.0 + */ + def insertInto(tableName: String): Unit = { + executeWriteOperation(builder => { + builder.setTable( + proto.WriteOperation.SaveTable + .newBuilder() + .setTableName(tableName) + .setSaveMethod( + proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO)) + }) + } + + /** + * Saves the content of the `DataFrame` as the specified table. + * + * In the case the table already exists, behavior of this function depends on the save mode, + * specified by the `mode` function (default to throwing an exception). When `mode` is + * `Overwrite`, the schema of the `DataFrame` does not need to be the same as that of the + * existing table. + * + * When `mode` is `Append`, if there is an existing table, we will use the format and options of + * the existing table. The column order in the schema of the `DataFrame` doesn't need to be same + * as that of the existing table. Unlike `insertInto`, `saveAsTable` will use the column names + * to find the correct column positions. For example: + * + * {{{ + * scala> Seq((1, 2)).toDF("i", "j").write.mode("overwrite").saveAsTable("t1") + * scala> Seq((3, 4)).toDF("j", "i").write.mode("append").saveAsTable("t1") + * scala> sql("select * from t1").show + * +---+---+ + * | i| j| + * +---+---+ + * | 1| 2| + * | 4| 3| + * +---+---+ + * }}} + * + * In this method, save mode is used to determine the behavior if the data source table exists + * in Spark catalog. We will always overwrite the underlying data of data source (e.g. a table + * in JDBC data source) if the table doesn't exist in Spark catalog, and will always append to + * the underlying data of data source if the table already exists. + * + * When the DataFrame is created from a non-partitioned `HadoopFsRelation` with a single input + * path, and the data source provider can be mapped to an existing Hive builtin SerDe (i.e. ORC + * and Parquet), the table is persisted in a Hive compatible format, which means other systems + * like Hive will be able to read this table. Otherwise, the table is persisted in a Spark SQL + * specific format. + * + * @since 3.4.0 + */ + def saveAsTable(tableName: String): Unit = { + executeWriteOperation(builder => { + builder.setTable( + proto.WriteOperation.SaveTable + .newBuilder() + .setTableName(tableName) + .setSaveMethod( + proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE)) + }) + } + + /** + * Saves the content of the `DataFrame` to an external database table via JDBC. In the case the + * table already exists in the external database, behavior of this function depends on the save + * mode, specified by the `mode` function (default to throwing an exception). 
+ * + * Don't create too many partitions in parallel on a large cluster; otherwise Spark might crash + * your external database systems. + * + * JDBC-specific option and parameter documentation for storing tables via JDBC in + * Data Source Option in the version you use. + * + * @param table + * Name of the table in the external database. + * @param connectionProperties + * JDBC database connection arguments, a list of arbitrary string tag/value. Normally at least + * a "user" and "password" property should be included. "batchsize" can be used to control the + * number of rows per insert. "isolationLevel" can be one of "NONE", "READ_COMMITTED", + * "READ_UNCOMMITTED", "REPEATABLE_READ", or "SERIALIZABLE", corresponding to standard + * transaction isolation levels defined by JDBC's Connection object, with default of + * "READ_UNCOMMITTED". + * @since 3.4.0 + */ + def jdbc(url: String, table: String, connectionProperties: Properties): Unit = { + // connectionProperties should override settings in extraOptions. + this.extraOptions ++= connectionProperties.asScala + // explicit url and dbtable should override all + this.extraOptions ++= Seq("url" -> url, "dbtable" -> table) + format("jdbc").save() + } + + /** + * Saves the content of the `DataFrame` in JSON format ( JSON + * Lines text format or newline-delimited JSON) at the specified path. This is equivalent + * to: + * {{{ + * format("json").save(path) + * }}} + * + * You can find the JSON-specific options for writing JSON files in + * Data Source Option in the version you use. + * + * @since 3.4.0 + */ + def json(path: String): Unit = { + format("json").save(path) + } + + /** + * Saves the content of the `DataFrame` in Parquet format at the specified path. This is + * equivalent to: + * {{{ + * format("parquet").save(path) + * }}} + * + * Parquet-specific option(s) for writing Parquet files can be found in Data + * Source Option in the version you use. + * + * @since 3.4.0 + */ + def parquet(path: String): Unit = { + format("parquet").save(path) + } + + /** + * Saves the content of the `DataFrame` in ORC format at the specified path. This is equivalent + * to: + * {{{ + * format("orc").save(path) + * }}} + * + * ORC-specific option(s) for writing ORC files can be found in Data + * Source Option in the version you use. + * + * @since 3.4.0 + */ + def orc(path: String): Unit = { + format("orc").save(path) + } + + /** + * Saves the content of the `DataFrame` in a text file at the specified path. The DataFrame must + * have only one column that is of string type. Each row becomes a new line in the output file. + * For example: + * {{{ + * // Scala: + * df.write.text("/path/to/output") + * + * // Java: + * df.write().text("/path/to/output") + * }}} + * The text files will be encoded as UTF-8. + * + * You can find the text-specific options for writing text files in + * Data Source Option in the version you use. + * + * @since 3.4.0 + */ + def text(path: String): Unit = { + format("text").save(path) + } + + /** + * Saves the content of the `DataFrame` in CSV format at the specified path. This is equivalent + * to: + * {{{ + * format("csv").save(path) + * }}} + * + * You can find the CSV-specific options for writing CSV files in + * Data Source Option in the version you use. 
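For illustration, the format shortcuts and the JDBC writer above might be used as follows (URL, table, paths, and credentials are placeholders):

{{{
  import java.util.Properties

  df.write.mode("overwrite").json("/tmp/out/json")
  df.write.mode("overwrite").csv("/tmp/out/csv")

  val props = new Properties()
  props.setProperty("user", "sa")
  props.setProperty("password", "secret")
  df.write.mode("append").jdbc("jdbc:postgresql://host/db", "people", props)
}}}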
+ * + * @since 3.4.0 + */ + def csv(path: String): Unit = { + format("csv").save(path) + } + + /////////////////////////////////////////////////////////////////////////////////////// + // Builder pattern config options + /////////////////////////////////////////////////////////////////////////////////////// + + private var source: Option[String] = None + + private var mode: SaveMode = SaveMode.ErrorIfExists + + private var extraOptions = CaseInsensitiveMap[String](Map.empty) + + private var partitioningColumns: Option[Seq[String]] = None + + private var bucketColumnNames: Option[Seq[String]] = None + + private var numBuckets: Option[Int] = None + + private var sortColumnNames: Option[Seq[String]] = None +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala new file mode 100644 index 0000000000000..b698e1dfaa1c9 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DataFrameWriterV2.scala @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.spark.annotation.Experimental +import org.apache.spark.connect.proto + +/** + * Interface used to write a [[org.apache.spark.sql.Dataset]] to external storage using the v2 + * API. 
+ * + * @since 3.4.0 + */ +@Experimental +final class DataFrameWriterV2[T] private[sql] (table: String, ds: Dataset[T]) + extends CreateTableWriter[T] { + + private var provider: Option[String] = None + + private val options = new mutable.HashMap[String, String]() + + private val properties = new mutable.HashMap[String, String]() + + private var partitioning: Option[Seq[proto.Expression]] = None + + private var overwriteCondition: Option[proto.Expression] = None + + override def using(provider: String): CreateTableWriter[T] = { + this.provider = Some(provider) + this + } + + override def option(key: String, value: String): DataFrameWriterV2[T] = { + this.options.put(key, value) + this + } + + override def options(options: scala.collection.Map[String, String]): DataFrameWriterV2[T] = { + options.foreach { case (key, value) => + this.options.put(key, value) + } + this + } + + override def options(options: java.util.Map[String, String]): DataFrameWriterV2[T] = { + this.options(options.asScala) + this + } + + override def tableProperty(property: String, value: String): CreateTableWriter[T] = { + this.properties.put(property, value) + this + } + + @scala.annotation.varargs + override def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T] = { + val asTransforms = (column +: columns).map(_.expr) + this.partitioning = Some(asTransforms) + this + } + + override def create(): Unit = { + executeWriteOperation(proto.WriteOperationV2.Mode.MODE_CREATE) + } + + override def replace(): Unit = { + executeWriteOperation(proto.WriteOperationV2.Mode.MODE_REPLACE) + } + + override def createOrReplace(): Unit = { + executeWriteOperation(proto.WriteOperationV2.Mode.MODE_CREATE_OR_REPLACE) + } + + /** + * Append the contents of the data frame to the output table. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.NoSuchTableException]]. The data frame will be + * validated to ensure it is compatible with the existing table. + * + * @throws org.apache.spark.sql.catalyst.analysis.NoSuchTableException + * If the table does not exist + */ + def append(): Unit = { + executeWriteOperation(proto.WriteOperationV2.Mode.MODE_APPEND) + } + + /** + * Overwrite rows matching the given filter condition with the contents of the data frame in the + * output table. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.NoSuchTableException]]. The data frame will be + * validated to ensure it is compatible with the existing table. + * + * @throws org.apache.spark.sql.catalyst.analysis.NoSuchTableException + * If the table does not exist + */ + def overwrite(condition: Column): Unit = { + overwriteCondition = Some(condition.expr) + executeWriteOperation(proto.WriteOperationV2.Mode.MODE_OVERWRITE) + } + + /** + * Overwrite all partition for which the data frame contains at least one row with the contents + * of the data frame in the output table. + * + * This operation is equivalent to Hive's `INSERT OVERWRITE ... PARTITION`, which replaces + * partitions dynamically depending on the contents of the data frame. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.NoSuchTableException]]. The data frame will be + * validated to ensure it is compatible with the existing table. 
+ * + * @throws org.apache.spark.sql.catalyst.analysis.NoSuchTableException + * If the table does not exist + */ + def overwritePartitions(): Unit = { + executeWriteOperation(proto.WriteOperationV2.Mode.MODE_OVERWRITE_PARTITIONS) + } + + private def executeWriteOperation(mode: proto.WriteOperationV2.Mode): Unit = { + val builder = proto.WriteOperationV2.newBuilder() + + builder.setInput(ds.plan.getRoot) + builder.setTableName(table) + provider.foreach(builder.setProvider) + + partitioning.foreach(columns => builder.addAllPartitioningColumns(columns.asJava)) + + options.foreach { case (k, v) => + builder.putOptions(k, v) + } + properties.foreach { case (k, v) => + builder.putTableProperties(k, v) + } + + builder.setMode(mode) + + overwriteCondition.foreach(builder.setOverwriteCondition) + + ds.sparkSession.execute(proto.Command.newBuilder().setWriteOperationV2(builder).build()) + } +} + +/** + * Configuration methods common to create/replace operations and insert/overwrite operations. + * @tparam R + * builder type to return + * @since 3.4.0 + */ +trait WriteConfigMethods[R] { + + /** + * Add a write option. + * + * @since 3.4.0 + */ + def option(key: String, value: String): R + + /** + * Add a boolean output option. + * + * @since 3.4.0 + */ + def option(key: String, value: Boolean): R = option(key, value.toString) + + /** + * Add a long output option. + * + * @since 3.4.0 + */ + def option(key: String, value: Long): R = option(key, value.toString) + + /** + * Add a double output option. + * + * @since 3.4.0 + */ + def option(key: String, value: Double): R = option(key, value.toString) + + /** + * Add write options from a Scala Map. + * + * @since 3.4.0 + */ + def options(options: scala.collection.Map[String, String]): R + + /** + * Add write options from a Java Map. + * + * @since 3.4.0 + */ + def options(options: java.util.Map[String, String]): R +} + +/** + * Trait to restrict calls to create and replace operations. + * + * @since 3.4.0 + */ +trait CreateTableWriter[T] extends WriteConfigMethods[CreateTableWriter[T]] { + + /** + * Create a new table from the contents of the data frame. + * + * The new table's schema, partition layout, properties, and other configuration will be based + * on the configuration set on this writer. + * + * If the output table exists, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException]]. + * + * @throws org.apache.spark.sql.catalyst.analysis.TableAlreadyExistsException + * If the table already exists + */ + def create(): Unit + + /** + * Replace an existing table with the contents of the data frame. + * + * The existing table's schema, partition layout, properties, and other configuration will be + * replaced with the contents of the data frame and the configuration set on this writer. + * + * If the output table does not exist, this operation will fail with + * [[org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException]]. + * + * @throws org.apache.spark.sql.catalyst.analysis.CannotReplaceMissingTableException + * If the table does not exist + */ + def replace(): Unit + + /** + * Create a new table or replace an existing table with the contents of the data frame. + * + * The output table's schema, partition layout, properties, and other configuration will be + * based on the contents of the data frame and the configuration set on this writer. If the + * table exists, its configuration and data will be replaced. 
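A usage sketch of the v2 writer, assuming the `Dataset.writeTo(table)` entry point that returns this class; the catalog, table, and column names are illustrative:

{{{
  import org.apache.spark.sql.functions.col

  df.writeTo("catalog.db.events")
    .using("parquet")
    .partitionedBy(col("year"), col("month"))
    .tableProperty("owner", "data-eng")
    .createOrReplace()

  // Later writes can append to the same table; the data is validated against its schema.
  df.writeTo("catalog.db.events").append()
}}}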
+ */ + def createOrReplace(): Unit + + /** + * Partition the output table created by `create`, `createOrReplace`, or `replace` using the + * given columns or transforms. + * + * When specified, the table data will be stored by these values for efficient reads. + * + * For example, when a table is partitioned by day, it may be stored in a directory layout like: + *
+ * <ul>
+ * <li>`table/day=2019-06-01/`</li>
+ * <li>`table/day=2019-06-02/`</li>
+ * </ul>
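As an aside, a hedged usage sketch of the `CreateTableWriter`/`DataFrameWriterV2` API described above. The `spark` session, connection string, table name `catalog.db.events`, and input paths are hypothetical placeholders, and the sketch assumes `Dataset.writeTo` is exposed by the client just like the existing sql/core API:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

// Hypothetical Spark Connect session; endpoint and data paths are placeholders.
val spark = SparkSession.builder().remote("sc://localhost").getOrCreate()
val events = spark.read.parquet("/data/events")

// Create (or replace) a parquet table partitioned by `day`.
events.writeTo("catalog.db.events")
  .using("parquet")
  .partitionedBy(col("day"))
  .tableProperty("owner", "data-eng")
  .createOrReplace()

// Later batches can be appended, or the partitions they touch overwritten dynamically.
val delta = spark.read.parquet("/data/events_delta")
delta.writeTo("catalog.db.events").append()
delta.writeTo("catalog.db.events").overwritePartitions()
```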
+ * + * Partitioning is one of the most widely used techniques to optimize physical data layout. It + * provides a coarse-grained index for skipping unnecessary data reads when queries have + * predicates on the partitioned columns. In order for partitioning to work well, the number of + * distinct values in each column should typically be less than tens of thousands. + * + * @since 3.4.0 + */ + def partitionedBy(column: Column, columns: Column*): CreateTableWriter[T] + + /** + * Specifies a provider for the underlying output data source. Spark's default catalog supports + * "parquet", "json", etc. + * + * @since 3.4.0 + */ + def using(provider: String): CreateTableWriter[T] + + /** + * Add a table property. + */ + def tableProperty(property: String, value: String): CreateTableWriter[T] +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala new file mode 100644 index 0000000000000..ca90afa14cf3f --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/Dataset.scala @@ -0,0 +1,2870 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.util.{Collections, Locale} + +import scala.collection.JavaConverters._ +import scala.collection.mutable +import scala.util.control.NonFatal + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{PrimitiveLongEncoder, StringEncoder, UnboundRowEncoder} +import org.apache.spark.sql.catalyst.expressions.RowOrdering +import org.apache.spark.sql.connect.client.SparkResult +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, StorageLevelProtoConverter} +import org.apache.spark.sql.functions.{struct, to_json} +import org.apache.spark.sql.types.{Metadata, StructType} +import org.apache.spark.storage.StorageLevel +import org.apache.spark.util.Utils + +/** + * A Dataset is a strongly typed collection of domain-specific objects that can be transformed in + * parallel using functional or relational operations. Each Dataset also has an untyped view + * called a `DataFrame`, which is a Dataset of [[Row]]. + * + * Operations available on Datasets are divided into transformations and actions. Transformations + * are the ones that produce new Datasets, and actions are the ones that trigger computation and + * return results. Example transformations include map, filter, select, and aggregate (`groupBy`). + * Example actions count, show, or writing data out to file systems. + * + * Datasets are "lazy", i.e. computations are only triggered when an action is invoked. 
+ * Internally, a Dataset represents a logical plan that describes the computation required to + * produce the data. When an action is invoked, Spark's query optimizer optimizes the logical plan + * and generates a physical plan for efficient execution in a parallel and distributed manner. To + * explore the logical plan as well as optimized physical plan, use the `explain` function. + * + * To efficiently support domain-specific objects, an [[Encoder]] is required. The encoder maps + * the domain specific type `T` to Spark's internal type system. For example, given a class + * `Person` with two fields, `name` (string) and `age` (int), an encoder is used to tell Spark to + * generate code at runtime to serialize the `Person` object into a binary structure. This binary + * structure often has much lower memory footprint as well as are optimized for efficiency in data + * processing (e.g. in a columnar format). To understand the internal binary representation for + * data, use the `schema` function. + * + * There are typically two ways to create a Dataset. The most common way is by pointing Spark to + * some files on storage systems, using the `read` function available on a `SparkSession`. + * {{{ + * val people = spark.read.parquet("...").as[Person] // Scala + * Dataset people = spark.read().parquet("...").as(Encoders.bean(Person.class)); // Java + * }}} + * + * Datasets can also be created through transformations available on existing Datasets. For + * example, the following creates a new Dataset by applying a filter on the existing one: + * {{{ + * val names = people.map(_.name) // in Scala; names is a Dataset[String] + * Dataset names = people.map((Person p) -> p.name, Encoders.STRING)); + * }}} + * + * Dataset operations can also be untyped, through various domain-specific-language (DSL) + * functions defined in: Dataset (this class), [[Column]], and [[functions]]. These operations are + * very similar to the operations available in the data frame abstraction in R or Python. + * + * To select a column from the Dataset, use `apply` method in Scala and `col` in Java. + * {{{ + * val ageCol = people("age") // in Scala + * Column ageCol = people.col("age"); // in Java + * }}} + * + * Note that the [[Column]] type can also be manipulated through its various functions. + * {{{ + * // The following creates a new column that increases everybody's age by 10. 
+ * people("age") + 10 // in Scala + * people.col("age").plus(10); // in Java + * }}} + * + * A more concrete example in Scala: + * {{{ + * // To create Dataset[Row] using SparkSession + * val people = spark.read.parquet("...") + * val department = spark.read.parquet("...") + * + * people.filter("age > 30") + * .join(department, people("deptId") === department("id")) + * .groupBy(department("name"), people("gender")) + * .agg(avg(people("salary")), max(people("age"))) + * }}} + * + * and in Java: + * {{{ + * // To create Dataset using SparkSession + * Dataset people = spark.read().parquet("..."); + * Dataset department = spark.read().parquet("..."); + * + * people.filter(people.col("age").gt(30)) + * .join(department, people.col("deptId").equalTo(department.col("id"))) + * .groupBy(department.col("name"), people.col("gender")) + * .agg(avg(people.col("salary")), max(people.col("age"))); + * }}} + * + * @groupname basic Basic Dataset functions + * @groupname action Actions + * @groupname untypedrel Untyped transformations + * @groupname typedrel Typed transformations + * + * @since 3.4.0 + */ +class Dataset[T] private[sql] ( + val sparkSession: SparkSession, + @DeveloperApi val plan: proto.Plan, + val encoder: AgnosticEncoder[T]) + extends Serializable { + // Make sure we don't forget to set plan id. + assert(plan.getRoot.getCommon.hasPlanId) + + override def toString: String = { + try { + val builder = new mutable.StringBuilder + val fields = schema.take(2).map { f => + s"${f.name}: ${f.dataType.simpleString(2)}" + } + builder.append("[") + builder.append(fields.mkString(", ")) + if (schema.length > 2) { + if (schema.length - fields.size == 1) { + builder.append(" ... 1 more field") + } else { + builder.append(" ... " + (schema.length - 2) + " more fields") + } + } + builder.append("]").toString() + } catch { + case NonFatal(e) => + s"Invalid Dataframe; ${e.getMessage}" + } + } + + /** + * Converts this strongly typed collection of data to generic Dataframe. In contrast to the + * strongly typed objects that Dataset operations work on, a Dataframe returns generic [[Row]] + * objects that allow fields to be accessed by ordinal or name. + * + * @group basic + * @since 3.4.0 + */ + def toDF(): DataFrame = new Dataset(sparkSession, plan, UnboundRowEncoder) + + /** + * Returns a new Dataset where each record has been mapped on to the specified type. The method + * used to map columns depend on the type of `U`:
+ * <ul>
+ * <li>When `U` is a class, fields for the class will be mapped to columns of the same name (case sensitivity is determined by `spark.sql.caseSensitive`).</li>
+ * <li>When `U` is a tuple, the columns will be mapped by ordinal (i.e. the first column will be assigned to `_1`).</li>
+ * <li>When `U` is a primitive type (i.e. String, Int, etc), then the first column of the `DataFrame` will be used.</li>
+ * </ul>
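A brief, hedged sketch of these `as[U]` mappings. The `Person` class, the JSON path, and the `spark` session are hypothetical, and the sketch assumes the client provides the usual implicit encoders via `spark.implicits._`:

```scala
case class Person(name: String, age: Long)

import spark.implicits._

val df = spark.read.json("/data/people.json") // DataFrame, i.e. Dataset[Row]

val people = df.as[Person]                                 // class: columns matched to fields by name
val pairs  = df.select($"name", $"age").as[(String, Long)] // tuple: columns matched by ordinal
val names  = df.select($"name").as[String]                 // primitive: the first (and only) column is used
```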
+ * + * If the schema of the Dataset does not match the desired `U` type, you can use `select` along + * with `alias` or `as` to rearrange or rename as required. + * + * Note that `as[]` only changes the view of the data that is passed into typed operations, such + * as `map()`, and does not eagerly project away any columns that are not present in the + * specified class. + * + * @group basic + * @since 3.4.0 + */ + def as[U: Encoder]: Dataset[U] = { + val encoder = implicitly[Encoder[U]].asInstanceOf[AgnosticEncoder[U]] + // We should add some validation/coercion here. We cannot use `to` + // because that does not work with positional arguments. + new Dataset[U](sparkSession, plan, encoder) + } + + /** + * Converts this strongly typed collection of data to generic `DataFrame` with columns renamed. + * This can be quite convenient in conversion from an RDD of tuples into a `DataFrame` with + * meaningful names. For example: + * {{{ + * val rdd: RDD[(Int, String)] = ... + * rdd.toDF() // this implicit conversion creates a DataFrame with column name `_1` and `_2` + * rdd.toDF("id", "name") // this creates a DataFrame with column name "id" and "name" + * }}} + * + * @group basic + * @since 3.4.0 + */ + @scala.annotation.varargs + def toDF(colNames: String*): DataFrame = sparkSession.newDataFrame { builder => + builder.getToDfBuilder + .setInput(plan.getRoot) + .addAllColumnNames(colNames.asJava) + } + + /** + * Returns a new DataFrame where each row is reconciled to match the specified schema. Spark + * will:
+ * <ul>
+ * <li>Reorder columns and/or inner fields by name to match the specified schema.</li>
+ * <li>Project away columns and/or inner fields that are not needed by the specified schema. Missing columns and/or inner fields (present in the specified schema but not in the input DataFrame) lead to failures.</li>
+ * <li>Cast the columns and/or inner fields to match the data types in the specified schema, if the types are compatible, e.g., numeric to numeric (error if overflows), but not string to int.</li>
+ * <li>Carry over the metadata from the specified schema, while the columns and/or inner fields still keep their own metadata if not overwritten by the specified schema.</li>
+ * <li>Fail if the nullability is not compatible. For example, the column and/or inner field is nullable but the specified schema requires them to be not nullable.</li>
+ * </ul>
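A hedged sketch of `to(schema)` reconciling an input DataFrame to a target schema; the column names and the `spark` session are hypothetical:

```scala
import org.apache.spark.sql.types.{LongType, StringType, StructField, StructType}

val target = StructType(Seq(
  StructField("id", LongType),      // input `id` is INT; compatible numeric cast to LONG
  StructField("name", StringType))) // columns not in the target (e.g. `extra`) are projected away

val input = spark.range(3)
  .selectExpr("CAST(id AS INT) AS id", "CAST(id AS STRING) AS name", "id AS extra")

val reconciled = input.to(target)
reconciled.printSchema()
```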
+ * + * @group basic + * @since 3.4.0 + */ + def to(schema: StructType): DataFrame = sparkSession.newDataFrame { builder => + builder.getToSchemaBuilder + .setInput(plan.getRoot) + .setSchema(DataTypeProtoConverter.toConnectProtoType(schema)) + } + + /** + * Returns the schema of this Dataset. + * + * @group basic + * @since 3.4.0 + */ + def schema: StructType = { + if (encoder == UnboundRowEncoder) { + DataTypeProtoConverter + .toCatalystType( + sparkSession + .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA) + .getSchema + .getSchema) + .asInstanceOf[StructType] + } else { + encoder.schema + } + } + + /** + * Prints the schema to the console in a nice tree format. + * + * @group basic + * @since 3.4.0 + */ + def printSchema(): Unit = printSchema(Int.MaxValue) + + // scalastyle:off println + /** + * Prints the schema up to the given level to the console in a nice tree format. + * + * @group basic + * @since 3.4.0 + */ + def printSchema(level: Int): Unit = println(schema.treeString(level)) + // scalastyle:on println + + /** + * Prints the plans (logical and physical) with a format specified by a given explain mode. + * + * @param mode + * specifies the expected output format of plans.
+ * <ul>
+ * <li>`simple`: Print only a physical plan.</li>
+ * <li>`extended`: Print both logical and physical plans.</li>
+ * <li>`codegen`: Print a physical plan and generated code if they are available.</li>
+ * <li>`cost`: Print a logical plan and statistics if they are available.</li>
+ * <li>`formatted`: Split explain output into two sections: a physical plan outline and node details.</li>
+ * </ul>
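For instance, a short hedged sketch of the explain modes above, assuming a `spark` session in scope and throwaway data:

```scala
val df = spark.range(1000).filter("id % 2 = 0")

df.explain()            // equivalent to explain("simple"): physical plan only
df.explain(true)        // equivalent to explain("extended"): logical and physical plans
df.explain("formatted") // physical plan outline plus node details
```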
+ * @group basic + * @since 3.4.0 + */ + def explain(mode: String): Unit = { + val protoMode = mode.trim.toLowerCase(Locale.ROOT) match { + case "simple" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_SIMPLE + case "extended" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED + case "codegen" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_CODEGEN + case "cost" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_COST + case "formatted" => proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_FORMATTED + case _ => throw new IllegalArgumentException("Unsupported explain mode: " + mode) + } + explain(protoMode) + } + + /** + * Prints the plans (logical and physical) to the console for debugging purposes. + * + * @param extended + * default `false`. If `false`, prints only the physical plan. + * + * @group basic + * @since 3.4.0 + */ + def explain(extended: Boolean): Unit = { + val mode = if (extended) { + proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED + } else { + proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_SIMPLE + } + explain(mode) + } + + /** + * Prints the physical plan to the console for debugging purposes. + * + * @group basic + * @since 3.4.0 + */ + def explain(): Unit = explain(proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_SIMPLE) + + private def explain(mode: proto.AnalyzePlanRequest.Explain.ExplainMode): Unit = { + // scalastyle:off println + println( + sparkSession + .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN, Some(mode)) + .getExplain + .getExplainString) + // scalastyle:on println + } + + /** + * Returns all column names and their data types as an array. + * + * @group basic + * @since 3.4.0 + */ + def dtypes: Array[(String, String)] = schema.fields.map { field => + (field.name, field.dataType.toString) + } + + /** + * Returns all column names as an array. + * + * @group basic + * @since 3.4.0 + */ + def columns: Array[String] = schema.fields.map(_.name) + + /** + * Returns true if the `collect` and `take` methods can be run locally (without any Spark + * executors). + * + * @group basic + * @since 3.4.0 + */ + def isLocal: Boolean = sparkSession + .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL) + .getIsLocal + .getIsLocal + + /** + * Returns true if the `Dataset` is empty. + * + * @group basic + * @since 3.4.0 + */ + def isEmpty: Boolean = select().limit(1).withResult { result => + result.length == 0 + } + + /** + * Returns true if this Dataset contains one or more sources that continuously return data as it + * arrives. A Dataset that reads data from a streaming source must be executed as a + * `StreamingQuery` using the `start()` method in `DataStreamWriter`. + * + * @group streaming + * @since 3.4.0 + */ + def isStreaming: Boolean = sparkSession + .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.IS_STREAMING) + .getIsStreaming + .getIsStreaming + + /** + * Displays the Dataset in a tabular form. Strings more than 20 characters will be truncated, + * and all cells will be aligned right. For example: + * {{{ + * year month AVG('Adj Close) MAX('Adj Close) + * 1980 12 0.503218 0.595103 + * 1981 01 0.523289 0.570307 + * 1982 02 0.436504 0.475256 + * 1983 03 0.410516 0.442194 + * 1984 04 0.450090 0.483521 + * }}} + * + * @param numRows + * Number of rows to show + * + * @group action + * @since 3.4.0 + */ + def show(numRows: Int): Unit = show(numRows, truncate = true) + + /** + * Displays the top 20 rows of Dataset in a tabular form. 
Strings more than 20 characters will + * be truncated, and all cells will be aligned right. + * + * @group action + * @since 3.4.0 + */ + def show(): Unit = show(20) + + /** + * Displays the top 20 rows of Dataset in a tabular form. + * + * @param truncate + * Whether truncate long strings. If true, strings more than 20 characters will be truncated + * and all cells will be aligned right + * + * @group action + * @since 3.4.0 + */ + def show(truncate: Boolean): Unit = show(20, truncate) + + /** + * Displays the Dataset in a tabular form. For example: + * {{{ + * year month AVG('Adj Close) MAX('Adj Close) + * 1980 12 0.503218 0.595103 + * 1981 01 0.523289 0.570307 + * 1982 02 0.436504 0.475256 + * 1983 03 0.410516 0.442194 + * 1984 04 0.450090 0.483521 + * }}} + * @param numRows + * Number of rows to show + * @param truncate + * Whether truncate long strings. If true, strings more than 20 characters will be truncated + * and all cells will be aligned right + * + * @group action + * @since 3.4.0 + */ + // scalastyle:off println + def show(numRows: Int, truncate: Boolean): Unit = { + val truncateValue = if (truncate) 20 else 0 + show(numRows, truncateValue, vertical = false) + } + + /** + * Displays the Dataset in a tabular form. For example: + * {{{ + * year month AVG('Adj Close) MAX('Adj Close) + * 1980 12 0.503218 0.595103 + * 1981 01 0.523289 0.570307 + * 1982 02 0.436504 0.475256 + * 1983 03 0.410516 0.442194 + * 1984 04 0.450090 0.483521 + * }}} + * + * @param numRows + * Number of rows to show + * @param truncate + * If set to more than 0, truncates strings to `truncate` characters and all cells will be + * aligned right. + * @group action + * @since 3.4.0 + */ + def show(numRows: Int, truncate: Int): Unit = show(numRows, truncate, vertical = false) + + /** + * Displays the Dataset in a tabular form. For example: + * {{{ + * year month AVG('Adj Close) MAX('Adj Close) + * 1980 12 0.503218 0.595103 + * 1981 01 0.523289 0.570307 + * 1982 02 0.436504 0.475256 + * 1983 03 0.410516 0.442194 + * 1984 04 0.450090 0.483521 + * }}} + * + * If `vertical` enabled, this command prints output rows vertically (one line per column + * value)? + * + * {{{ + * -RECORD 0------------------- + * year | 1980 + * month | 12 + * AVG('Adj Close) | 0.503218 + * AVG('Adj Close) | 0.595103 + * -RECORD 1------------------- + * year | 1981 + * month | 01 + * AVG('Adj Close) | 0.523289 + * AVG('Adj Close) | 0.570307 + * -RECORD 2------------------- + * year | 1982 + * month | 02 + * AVG('Adj Close) | 0.436504 + * AVG('Adj Close) | 0.475256 + * -RECORD 3------------------- + * year | 1983 + * month | 03 + * AVG('Adj Close) | 0.410516 + * AVG('Adj Close) | 0.442194 + * -RECORD 4------------------- + * year | 1984 + * month | 04 + * AVG('Adj Close) | 0.450090 + * AVG('Adj Close) | 0.483521 + * }}} + * + * @param numRows + * Number of rows to show + * @param truncate + * If set to more than 0, truncates strings to `truncate` characters and all cells will be + * aligned right. + * @param vertical + * If set to true, prints output rows vertically (one line per column value). 
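A hedged sketch of the `show` variants described above; the data and the `spark` session are placeholders:

```scala
val df = spark.range(100).selectExpr("id", "repeat('x', 50) AS long_str")

df.show()                      // first 20 rows, strings truncated to 20 characters
df.show(5, truncate = false)   // 5 rows, no truncation
df.show(5, 10)                 // 5 rows, strings truncated to 10 characters
df.show(3, 0, vertical = true) // 3 rows, one line per column value
```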
+ * @group action + * @since 3.4.0 + */ + def show(numRows: Int, truncate: Int, vertical: Boolean): Unit = { + val df = sparkSession.newDataset(StringEncoder) { builder => + builder.getShowStringBuilder + .setInput(plan.getRoot) + .setNumRows(numRows) + .setTruncate(truncate) + .setVertical(vertical) + } + df.withResult { result => + assert(result.length == 1) + assert(result.schema.size == 1) + // scalastyle:off println + println(result.toArray.head) + // scalastyle:on println + } + } + + /** + * Returns a [[DataFrameNaFunctions]] for working with missing data. + * {{{ + * // Dropping rows containing any null values. + * ds.na.drop() + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + def na: DataFrameNaFunctions = new DataFrameNaFunctions(sparkSession, plan.getRoot) + + /** + * Returns a [[DataFrameStatFunctions]] for working statistic functions support. + * {{{ + * // Finding frequent items in column with name 'a'. + * ds.stat.freqItems(Seq("a")) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + def stat: DataFrameStatFunctions = new DataFrameStatFunctions(sparkSession, plan.getRoot) + + private def buildJoin(right: Dataset[_])(f: proto.Join.Builder => Unit): DataFrame = { + sparkSession.newDataFrame { builder => + val joinBuilder = builder.getJoinBuilder + joinBuilder.setLeft(plan.getRoot).setRight(right.plan.getRoot) + f(joinBuilder) + } + } + + private def toJoinType(name: String): proto.Join.JoinType = { + name.trim.toLowerCase(Locale.ROOT) match { + case "inner" => + proto.Join.JoinType.JOIN_TYPE_INNER + case "cross" => + proto.Join.JoinType.JOIN_TYPE_CROSS + case "outer" | "full" | "fullouter" | "full_outer" => + proto.Join.JoinType.JOIN_TYPE_FULL_OUTER + case "left" | "leftouter" | "left_outer" => + proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER + case "right" | "rightouter" | "right_outer" => + proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER + case "semi" | "leftsemi" | "left_semi" => + proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI + case "anti" | "leftanti" | "left_anti" => + proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI + case _ => + throw new IllegalArgumentException(s"Unsupported join type `joinType`.") + } + } + + /** + * Join with another `DataFrame`. + * + * Behaves as an INNER JOIN and requires a subsequent join predicate. + * + * @param right + * Right side of the join operation. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_]): DataFrame = buildJoin(right) { builder => + builder.setJoinType(proto.Join.JoinType.JOIN_TYPE_INNER) + } + + /** + * Inner equi-join with another `DataFrame` using the given column. + * + * Different from other join functions, the join column will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. + * + * {{{ + * // Joining df1 and df2 using the column "user_id" + * df1.join(df2, "user_id") + * }}} + * + * @param right + * Right side of the join operation. + * @param usingColumn + * Name of the column to join on. This column must exist on both sides. + * + * @note + * If you perform a self-join using this function without aliasing the input `DataFrame`s, you + * will NOT be able to reference any columns after the join, since there is no way to + * disambiguate which side of the join you would like to reference. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], usingColumn: String): DataFrame = { + join(right, Seq(usingColumn)) + } + + /** + * (Java-specific) Inner equi-join with another `DataFrame` using the given columns. 
See the + * Scala-specific overload for more details. + * + * @param right + * Right side of the join operation. + * @param usingColumns + * Names of the columns to join on. This columns must exist on both sides. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], usingColumns: Array[String]): DataFrame = { + join(right, usingColumns.toSeq) + } + + /** + * (Scala-specific) Inner equi-join with another `DataFrame` using the given columns. + * + * Different from other join functions, the join columns will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. + * + * {{{ + * // Joining df1 and df2 using the columns "user_id" and "user_name" + * df1.join(df2, Seq("user_id", "user_name")) + * }}} + * + * @param right + * Right side of the join operation. + * @param usingColumns + * Names of the columns to join on. This columns must exist on both sides. + * + * @note + * If you perform a self-join using this function without aliasing the input `DataFrame`s, you + * will NOT be able to reference any columns after the join, since there is no way to + * disambiguate which side of the join you would like to reference. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], usingColumns: Seq[String]): DataFrame = { + join(right, usingColumns, "inner") + } + + /** + * Equi-join with another `DataFrame` using the given column. A cross join with a predicate is + * specified as an inner join. If you would explicitly like to perform a cross join use the + * `crossJoin` method. + * + * Different from other join functions, the join column will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. + * + * @param right + * Right side of the join operation. + * @param usingColumn + * Name of the column to join on. This column must exist on both sides. + * @param joinType + * Type of join to perform. Default `inner`. Must be one of: `inner`, `cross`, `outer`, + * `full`, `fullouter`, `full_outer`, `left`, `leftouter`, `left_outer`, `right`, + * `rightouter`, `right_outer`, `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, + * `left_anti`. + * + * @note + * If you perform a self-join using this function without aliasing the input `DataFrame`s, you + * will NOT be able to reference any columns after the join, since there is no way to + * disambiguate which side of the join you would like to reference. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], usingColumn: String, joinType: String): DataFrame = { + join(right, Seq(usingColumn), joinType) + } + + /** + * (Java-specific) Equi-join with another `DataFrame` using the given columns. See the + * Scala-specific overload for more details. + * + * @param right + * Right side of the join operation. + * @param usingColumns + * Names of the columns to join on. This columns must exist on both sides. + * @param joinType + * Type of join to perform. Default `inner`. Must be one of: `inner`, `cross`, `outer`, + * `full`, `fullouter`, `full_outer`, `left`, `leftouter`, `left_outer`, `right`, + * `rightouter`, `right_outer`, `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, + * `left_anti`. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], usingColumns: Array[String], joinType: String): DataFrame = { + join(right, usingColumns.toSeq, joinType) + } + + /** + * (Scala-specific) Equi-join with another `DataFrame` using the given columns. A cross join + * with a predicate is specified as an inner join. 
If you would explicitly like to perform a + * cross join use the `crossJoin` method. + * + * Different from other join functions, the join columns will only appear once in the output, + * i.e. similar to SQL's `JOIN USING` syntax. + * + * @param right + * Right side of the join operation. + * @param usingColumns + * Names of the columns to join on. This columns must exist on both sides. + * @param joinType + * Type of join to perform. Default `inner`. Must be one of: `inner`, `cross`, `outer`, + * `full`, `fullouter`, `full_outer`, `left`, `leftouter`, `left_outer`, `right`, + * `rightouter`, `right_outer`, `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, + * `left_anti`. + * + * @note + * If you perform a self-join using this function without aliasing the input `DataFrame`s, you + * will NOT be able to reference any columns after the join, since there is no way to + * disambiguate which side of the join you would like to reference. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], usingColumns: Seq[String], joinType: String): DataFrame = { + buildJoin(right) { builder => + builder + .setJoinType(toJoinType(joinType)) + .addAllUsingColumns(usingColumns.asJava) + } + } + + /** + * Inner join with another `DataFrame`, using the given join expression. + * + * {{{ + * // The following two are equivalent: + * df1.join(df2, $"df1Key" === $"df2Key") + * df1.join(df2).where($"df1Key" === $"df2Key") + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], joinExprs: Column): DataFrame = join(right, joinExprs, "inner") + + /** + * Join with another `DataFrame`, using the given join expression. The following performs a full + * outer join between `df1` and `df2`. + * + * {{{ + * // Scala: + * import org.apache.spark.sql.functions._ + * df1.join(df2, $"df1Key" === $"df2Key", "outer") + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df1.join(df2, col("df1Key").equalTo(col("df2Key")), "outer"); + * }}} + * + * @param right + * Right side of the join. + * @param joinExprs + * Join expression. + * @param joinType + * Type of join to perform. Default `inner`. Must be one of: `inner`, `cross`, `outer`, + * `full`, `fullouter`, `full_outer`, `left`, `leftouter`, `left_outer`, `right`, + * `rightouter`, `right_outer`, `semi`, `leftsemi`, `left_semi`, `anti`, `leftanti`, + * `left_anti`. + * + * @group untypedrel + * @since 3.4.0 + */ + def join(right: Dataset[_], joinExprs: Column, joinType: String): DataFrame = { + buildJoin(right) { builder => + builder + .setJoinType(toJoinType(joinType)) + .setJoinCondition(joinExprs.expr) + } + } + + /** + * Explicit cartesian join with another `DataFrame`. + * + * @param right + * Right side of the join operation. + * + * @note + * Cartesian joins are very expensive without an extra filter that can be pushed down. + * + * @group untypedrel + * @since 3.4.0 + */ + def crossJoin(right: Dataset[_]): DataFrame = buildJoin(right) { builder => + builder.setJoinType(proto.Join.JoinType.JOIN_TYPE_CROSS) + } + + private def buildSort(global: Boolean, sortExprs: Seq[Column]): Dataset[T] = { + sparkSession.newDataset(encoder) { builder => + builder.getSortBuilder + .setInput(plan.getRoot) + .setIsGlobal(global) + .addAllOrder(sortExprs.map(_.sortOrder).asJava) + } + } + + /** + * Returns a new Dataset with each partition sorted by the given expressions. + * + * This is the same operation as "SORT BY" in SQL (Hive QL). 
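A hedged sketch of the join variants and the partition-local sort described above; the table paths and column names are hypothetical and `spark` is assumed in scope:

```scala
import org.apache.spark.sql.functions.col

val employees   = spark.read.parquet("/data/employees")   // id, dept_id, name, salary
val departments = spark.read.parquet("/data/departments") // dept_id, dept_name

// Equi-join on a shared column; `dept_id` appears only once in the output.
val joined = employees.join(departments, Seq("dept_id"), "left_outer")

// Join on an arbitrary expression; both key columns are kept.
val byExpr = employees.join(departments, employees("dept_id") === departments("dept_id"), "inner")

joined.sortWithinPartitions("dept_id") // SORT BY: sorts rows within each partition
joined.sort(col("salary").desc)        // ORDER BY: global sort
```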
+ * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def sortWithinPartitions(sortCol: String, sortCols: String*): Dataset[T] = { + sortWithinPartitions((sortCol +: sortCols).map(Column(_)): _*) + } + + /** + * Returns a new Dataset with each partition sorted by the given expressions. + * + * This is the same operation as "SORT BY" in SQL (Hive QL). + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def sortWithinPartitions(sortExprs: Column*): Dataset[T] = { + buildSort(global = false, sortExprs) + } + + /** + * Returns a new Dataset sorted by the specified column, all in ascending order. + * {{{ + * // The following 3 are equivalent + * ds.sort("sortcol") + * ds.sort($"sortcol") + * ds.sort($"sortcol".asc) + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def sort(sortCol: String, sortCols: String*): Dataset[T] = { + sort((sortCol +: sortCols).map(Column(_)): _*) + } + + /** + * Returns a new Dataset sorted by the given expressions. For example: + * {{{ + * ds.sort($"col1", $"col2".desc) + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def sort(sortExprs: Column*): Dataset[T] = { + buildSort(global = true, sortExprs) + } + + /** + * Returns a new Dataset sorted by the given expressions. This is an alias of the `sort` + * function. + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def orderBy(sortCol: String, sortCols: String*): Dataset[T] = sort(sortCol, sortCols: _*) + + /** + * Returns a new Dataset sorted by the given expressions. This is an alias of the `sort` + * function. + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def orderBy(sortExprs: Column*): Dataset[T] = sort(sortExprs: _*) + + /** + * Selects column based on the column name and returns it as a [[Column]]. + * + * @note + * The column name can also reference to a nested column like `a.b`. + * + * @group untypedrel + * @since 3.4.0 + */ + def apply(colName: String): Column = col(colName) + + /** + * Specifies some hint on the current Dataset. As an example, the following code specifies that + * one of the plan can be broadcasted: + * + * {{{ + * df1.join(df2.hint("broadcast")) + * }}} + * + * @group basic + * @since 3.4.0 + */ + @scala.annotation.varargs + def hint(name: String, parameters: Any*): Dataset[T] = sparkSession.newDataset(encoder) { + builder => + builder.getHintBuilder + .setInput(plan.getRoot) + .setName(name) + .addAllParameters(parameters.map(p => functions.lit(p).expr).asJava) + } + + private def getPlanId: Option[Long] = + if (plan.getRoot.hasCommon && plan.getRoot.getCommon.hasPlanId) { + Option(plan.getRoot.getCommon.getPlanId) + } else { + None + } + + /** + * Selects column based on the column name and returns it as a [[Column]]. + * + * @note + * The column name can also reference to a nested column like `a.b`. + * + * @group untypedrel + * @since 3.4.0 + */ + def col(colName: String): Column = { + Column.apply(colName, getPlanId) + } + + /** + * Selects column based on the column name specified as a regex and returns it as [[Column]]. + * @group untypedrel + * @since 3.4.0 + */ + def colRegex(colName: String): Column = { + Column { builder => + val unresolvedRegexBuilder = builder.getUnresolvedRegexBuilder.setColName(colName) + getPlanId.foreach(unresolvedRegexBuilder.setPlanId) + } + } + + /** + * Returns a new Dataset with an alias set. 
+ * + * @group typedrel + * @since 3.4.0 + */ + def as(alias: String): Dataset[T] = sparkSession.newDataset(encoder) { builder => + builder.getSubqueryAliasBuilder + .setInput(plan.getRoot) + .setAlias(alias) + } + + /** + * (Scala-specific) Returns a new Dataset with an alias set. + * + * @group typedrel + * @since 3.4.0 + */ + def as(alias: Symbol): Dataset[T] = as(alias.name) + + /** + * Returns a new Dataset with an alias set. Same as `as`. + * + * @group typedrel + * @since 3.4.0 + */ + def alias(alias: String): Dataset[T] = as(alias) + + /** + * (Scala-specific) Returns a new Dataset with an alias set. Same as `as`. + * + * @group typedrel + * @since 3.4.0 + */ + def alias(alias: Symbol): Dataset[T] = as(alias) + + /** + * Selects a set of column based expressions. + * {{{ + * ds.select($"colA", $"colB" + 1) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def select(cols: Column*): DataFrame = sparkSession.newDataFrame { builder => + builder.getProjectBuilder + .setInput(plan.getRoot) + .addAllExpressions(cols.map(_.expr).asJava) + } + + /** + * Selects a set of columns. This is a variant of `select` that can only select existing columns + * using column names (i.e. cannot construct expressions). + * + * {{{ + * // The following two are equivalent: + * ds.select("colA", "colB") + * ds.select($"colA", $"colB") + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def select(col: String, cols: String*): DataFrame = select((col +: cols).map(Column(_)): _*) + + /** + * Selects a set of SQL expressions. This is a variant of `select` that accepts SQL expressions. + * + * {{{ + * // The following are equivalent: + * ds.selectExpr("colA", "colB as newName", "abs(colC)") + * ds.select(expr("colA"), expr("colB as newName"), expr("abs(colC)")) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def selectExpr(exprs: String*): DataFrame = { + select(exprs.map(functions.expr): _*) + } + + /** + * Returns a new Dataset by computing the given [[Column]] expression for each element. + * + * {{{ + * val ds = Seq(1, 2, 3).toDS() + * val newDS = ds.select(expr("value + 1").as[Int]) + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + def select[U1](c1: TypedColumn[T, U1]): Dataset[U1] = { + val encoder = c1.encoder + val expr = if (encoder.schema == encoder.dataType) { + functions.inline(functions.array(c1)).expr + } else { + c1.expr + } + sparkSession.newDataset(encoder) { builder => + builder.getProjectBuilder + .setInput(plan.getRoot) + .addExpressions(expr) + } + } + + /** + * Filters rows using the given condition. + * {{{ + * // The following are equivalent: + * peopleDs.filter($"age" > 15) + * peopleDs.where($"age" > 15) + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + def filter(condition: Column): Dataset[T] = sparkSession.newDataset(encoder) { builder => + builder.getFilterBuilder.setInput(plan.getRoot).setCondition(condition.expr) + } + + /** + * Filters rows using the given SQL expression. + * {{{ + * peopleDs.filter("age > 15") + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + def filter(conditionExpr: String): Dataset[T] = filter(functions.expr(conditionExpr)) + + /** + * Filters rows using the given condition. This is an alias for `filter`. 
+ * {{{ + * // The following are equivalent: + * peopleDs.filter($"age" > 15) + * peopleDs.where($"age" > 15) + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + def where(condition: Column): Dataset[T] = filter(condition) + + /** + * Filters rows using the given SQL expression. + * {{{ + * peopleDs.where("age > 15") + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + def where(conditionExpr: String): Dataset[T] = filter(conditionExpr) + + private def buildUnpivot( + ids: Array[Column], + valuesOption: Option[Array[Column]], + variableColumnName: String, + valueColumnName: String): DataFrame = sparkSession.newDataFrame { builder => + val unpivot = builder.getUnpivotBuilder + .setInput(plan.getRoot) + .addAllIds(ids.toSeq.map(_.expr).asJava) + .setValueColumnName(variableColumnName) + .setValueColumnName(valueColumnName) + valuesOption.foreach { values => + unpivot.getValuesBuilder + .addAllValues(values.toSeq.map(_.expr).asJava) + } + } + + /** + * Groups the Dataset using the specified columns, so we can run aggregation on them. See + * [[RelationalGroupedDataset]] for all the available aggregate functions. + * + * {{{ + * // Compute the average for all numeric columns grouped by department. + * ds.groupBy($"department").avg() + * + * // Compute the max age and average salary, grouped by department and gender. + * ds.groupBy($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def groupBy(cols: Column*): RelationalGroupedDataset = { + new RelationalGroupedDataset( + toDF(), + cols.map(_.expr), + proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY) + } + + /** + * Groups the Dataset using the specified columns, so that we can run aggregation on them. See + * [[RelationalGroupedDataset]] for all the available aggregate functions. + * + * This is a variant of groupBy that can only group by existing columns using column names (i.e. + * cannot construct expressions). + * + * {{{ + * // Compute the average for all numeric columns grouped by department. + * ds.groupBy("department").avg() + * + * // Compute the max age and average salary, grouped by department and gender. + * ds.groupBy($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def groupBy(col1: String, cols: String*): RelationalGroupedDataset = { + val colNames: Seq[String] = col1 +: cols + new RelationalGroupedDataset( + toDF(), + colNames.map(colName => Column(colName).expr), + proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY) + } + + /** + * Create a multi-dimensional rollup for the current Dataset using the specified columns, so we + * can run aggregation on them. See [[RelationalGroupedDataset]] for all the available aggregate + * functions. + * + * {{{ + * // Compute the average for all numeric columns rolled up by department and group. + * ds.rollup($"department", $"group").avg() + * + * // Compute the max age and average salary, rolled up by department and gender. 
+ * ds.rollup($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def rollup(cols: Column*): RelationalGroupedDataset = { + new RelationalGroupedDataset( + toDF(), + cols.map(_.expr), + proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP) + } + + /** + * Create a multi-dimensional rollup for the current Dataset using the specified columns, so we + * can run aggregation on them. See [[RelationalGroupedDataset]] for all the available aggregate + * functions. + * + * This is a variant of rollup that can only group by existing columns using column names (i.e. + * cannot construct expressions). + * + * {{{ + * // Compute the average for all numeric columns rolled up by department and group. + * ds.rollup("department", "group").avg() + * + * // Compute the max age and average salary, rolled up by department and gender. + * ds.rollup($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def rollup(col1: String, cols: String*): RelationalGroupedDataset = { + val colNames: Seq[String] = col1 +: cols + new RelationalGroupedDataset( + toDF(), + colNames.map(colName => Column(colName).expr), + proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP) + } + + /** + * Create a multi-dimensional cube for the current Dataset using the specified columns, so we + * can run aggregation on them. See [[RelationalGroupedDataset]] for all the available aggregate + * functions. + * + * {{{ + * // Compute the average for all numeric columns cubed by department and group. + * ds.cube($"department", $"group").avg() + * + * // Compute the max age and average salary, cubed by department and gender. + * ds.cube($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def cube(cols: Column*): RelationalGroupedDataset = { + new RelationalGroupedDataset( + toDF(), + cols.map(_.expr), + proto.Aggregate.GroupType.GROUP_TYPE_CUBE) + } + + /** + * Create a multi-dimensional cube for the current Dataset using the specified columns, so we + * can run aggregation on them. See [[RelationalGroupedDataset]] for all the available aggregate + * functions. + * + * This is a variant of cube that can only group by existing columns using column names (i.e. + * cannot construct expressions). + * + * {{{ + * // Compute the average for all numeric columns cubed by department and group. + * ds.cube("department", "group").avg() + * + * // Compute the max age and average salary, cubed by department and gender. + * ds.cube($"department", $"gender").agg(Map( + * "salary" -> "avg", + * "age" -> "max" + * )) + * }}} + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def cube(col1: String, cols: String*): RelationalGroupedDataset = { + val colNames: Seq[String] = col1 +: cols + new RelationalGroupedDataset( + toDF(), + colNames.map(colName => Column(colName).expr), + proto.Aggregate.GroupType.GROUP_TYPE_CUBE) + } + + /** + * (Scala-specific) Aggregates on the entire Dataset without groups. + * {{{ + * // ds.agg(...) is a shorthand for ds.groupBy().agg(...) 
+ * ds.agg("age" -> "max", "salary" -> "avg") + * ds.groupBy().agg("age" -> "max", "salary" -> "avg") + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = { + groupBy().agg(aggExpr, aggExprs: _*) + } + + /** + * (Scala-specific) Aggregates on the entire Dataset without groups. + * {{{ + * // ds.agg(...) is a shorthand for ds.groupBy().agg(...) + * ds.agg(Map("age" -> "max", "salary" -> "avg")) + * ds.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + def agg(exprs: Map[String, String]): DataFrame = groupBy().agg(exprs) + + /** + * (Java-specific) Aggregates on the entire Dataset without groups. + * {{{ + * // ds.agg(...) is a shorthand for ds.groupBy().agg(...) + * ds.agg(Map("age" -> "max", "salary" -> "avg")) + * ds.groupBy().agg(Map("age" -> "max", "salary" -> "avg")) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + def agg(exprs: java.util.Map[String, String]): DataFrame = groupBy().agg(exprs) + + /** + * Aggregates on the entire Dataset without groups. + * {{{ + * // ds.agg(...) is a shorthand for ds.groupBy().agg(...) + * ds.agg(max($"age"), avg($"salary")) + * ds.groupBy().agg(max($"age"), avg($"salary")) + * }}} + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def agg(expr: Column, exprs: Column*): DataFrame = groupBy().agg(expr, exprs: _*) + + /** + * Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns + * set. This is the reverse to `groupBy(...).pivot(...).agg(...)`, except for the aggregation, + * which cannot be reversed. + * + * This function is useful to massage a DataFrame into a format where some columns are + * identifier columns ("ids"), while all other columns ("values") are "unpivoted" to the rows, + * leaving just two non-id columns, named as given by `variableColumnName` and + * `valueColumnName`. + * + * {{{ + * val df = Seq((1, 11, 12L), (2, 21, 22L)).toDF("id", "int", "long") + * df.show() + * // output: + * // +---+---+----+ + * // | id|int|long| + * // +---+---+----+ + * // | 1| 11| 12| + * // | 2| 21| 22| + * // +---+---+----+ + * + * df.unpivot(Array($"id"), Array($"int", $"long"), "variable", "value").show() + * // output: + * // +---+--------+-----+ + * // | id|variable|value| + * // +---+--------+-----+ + * // | 1| int| 11| + * // | 1| long| 12| + * // | 2| int| 21| + * // | 2| long| 22| + * // +---+--------+-----+ + * // schema: + * //root + * // |-- id: integer (nullable = false) + * // |-- variable: string (nullable = false) + * // |-- value: long (nullable = true) + * }}} + * + * When no "id" columns are given, the unpivoted DataFrame consists of only the "variable" and + * "value" columns. + * + * All "value" columns must share a least common data type. Unless they are the same data type, + * all "value" columns are cast to the nearest common data type. For instance, types + * `IntegerType` and `LongType` are cast to `LongType`, while `IntegerType` and `StringType` do + * not have a common data type and `unpivot` fails with an `AnalysisException`. 
+ * + * @param ids + * Id columns + * @param values + * Value columns to unpivot + * @param variableColumnName + * Name of the variable column + * @param valueColumnName + * Name of the value column + * + * @group untypedrel + * @since 3.4.0 + */ + def unpivot( + ids: Array[Column], + values: Array[Column], + variableColumnName: String, + valueColumnName: String): DataFrame = { + buildUnpivot(ids, Option(values), variableColumnName, valueColumnName) + } + + /** + * Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns + * set. This is the reverse to `groupBy(...).pivot(...).agg(...)`, except for the aggregation, + * which cannot be reversed. + * + * @see + * `org.apache.spark.sql.Dataset.unpivot(Array, Array, String, String)` + * + * This is equivalent to calling `Dataset#unpivot(Array, Array, String, String)` where `values` + * is set to all non-id columns that exist in the DataFrame. + * + * @param ids + * Id columns + * @param variableColumnName + * Name of the variable column + * @param valueColumnName + * Name of the value column + * + * @group untypedrel + * @since 3.4.0 + */ + def unpivot( + ids: Array[Column], + variableColumnName: String, + valueColumnName: String): DataFrame = { + buildUnpivot(ids, None, variableColumnName, valueColumnName) + } + + /** + * Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns + * set. This is the reverse to `groupBy(...).pivot(...).agg(...)`, except for the aggregation, + * which cannot be reversed. This is an alias for `unpivot`. + * + * @see + * `org.apache.spark.sql.Dataset.unpivot(Array, Array, String, String)` + * + * @param ids + * Id columns + * @param values + * Value columns to unpivot + * @param variableColumnName + * Name of the variable column + * @param valueColumnName + * Name of the value column + * + * @group untypedrel + * @since 3.4.0 + */ + def melt( + ids: Array[Column], + values: Array[Column], + variableColumnName: String, + valueColumnName: String): DataFrame = + unpivot(ids, values, variableColumnName, valueColumnName) + + /** + * Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns + * set. This is the reverse to `groupBy(...).pivot(...).agg(...)`, except for the aggregation, + * which cannot be reversed. This is an alias for `unpivot`. + * + * @see + * `org.apache.spark.sql.Dataset.unpivot(Array, Array, String, String)` + * + * This is equivalent to calling `Dataset#unpivot(Array, Array, String, String)` where `values` + * is set to all non-id columns that exist in the DataFrame. + * + * @param ids + * Id columns + * @param variableColumnName + * Name of the variable column + * @param valueColumnName + * Name of the value column + * + * @group untypedrel + * @since 3.4.0 + */ + def melt(ids: Array[Column], variableColumnName: String, valueColumnName: String): DataFrame = + unpivot(ids, variableColumnName, valueColumnName) + + /** + * Returns a new Dataset by taking the first `n` rows. The difference between this function and + * `head` is that `head` is an action and returns an array (by triggering query execution) while + * `limit` returns a new Dataset. + * + * @group typedrel + * @since 3.4.0 + */ + def limit(n: Int): Dataset[T] = sparkSession.newDataset(encoder) { builder => + builder.getLimitBuilder + .setInput(plan.getRoot) + .setLimit(n) + } + + /** + * Returns a new Dataset by skipping the first `n` rows. 
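As a hedged sketch, `offset` composes with `limit` for simple pagination; the data and ordering column are hypothetical:

```scala
val page2 = spark.range(100)
  .sort("id")  // make the row order deterministic before paging
  .offset(20)  // skip the first 20 rows
  .limit(20)   // then take the next 20

page2.show()
```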
+ * + * @group typedrel + * @since 3.4.0 + */ + def offset(n: Int): Dataset[T] = sparkSession.newDataset(encoder) { builder => + builder.getOffsetBuilder + .setInput(plan.getRoot) + .setOffset(n) + } + + private def buildSetOp(right: Dataset[T], setOpType: proto.SetOperation.SetOpType)( + f: proto.SetOperation.Builder => Unit): Dataset[T] = { + sparkSession.newDataset(encoder) { builder => + f( + builder.getSetOpBuilder + .setSetOpType(setOpType) + .setLeftInput(plan.getRoot) + .setRightInput(right.plan.getRoot)) + } + } + + /** + * Returns a new Dataset containing union of rows in this Dataset and another Dataset. + * + * This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does + * deduplication of elements), use this function followed by a [[distinct]]. + * + * Also as standard in SQL, this function resolves columns by position (not by name): + * + * {{{ + * val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2") + * val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0") + * df1.union(df2).show + * + * // output: + * // +----+----+----+ + * // |col0|col1|col2| + * // +----+----+----+ + * // | 1| 2| 3| + * // | 4| 5| 6| + * // +----+----+----+ + * }}} + * + * Notice that the column positions in the schema aren't necessarily matched with the fields in + * the strongly typed objects in a Dataset. This function resolves columns by their positions in + * the schema, not the fields in the strongly typed objects. Use [[unionByName]] to resolve + * columns by field name in the typed objects. + * + * @group typedrel + * @since 3.4.0 + */ + def union(other: Dataset[T]): Dataset[T] = { + buildSetOp(other, proto.SetOperation.SetOpType.SET_OP_TYPE_UNION) { builder => + builder.setIsAll(true) + } + } + + /** + * Returns a new Dataset containing union of rows in this Dataset and another Dataset. This is + * an alias for `union`. + * + * This is equivalent to `UNION ALL` in SQL. To do a SQL-style set union (that does + * deduplication of elements), use this function followed by a [[distinct]]. + * + * Also as standard in SQL, this function resolves columns by position (not by name). + * + * @group typedrel + * @since 3.4.0 + */ + def unionAll(other: Dataset[T]): Dataset[T] = union(other) + + /** + * Returns a new Dataset containing union of rows in this Dataset and another Dataset. + * + * This is different from both `UNION ALL` and `UNION DISTINCT` in SQL. To do a SQL-style set + * union (that does deduplication of elements), use this function followed by a [[distinct]]. + * + * The difference between this function and [[union]] is that this function resolves columns by + * name (not by position): + * + * {{{ + * val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2") + * val df2 = Seq((4, 5, 6)).toDF("col1", "col2", "col0") + * df1.unionByName(df2).show + * + * // output: + * // +----+----+----+ + * // |col0|col1|col2| + * // +----+----+----+ + * // | 1| 2| 3| + * // | 6| 4| 5| + * // +----+----+----+ + * }}} + * + * Note that this supports nested columns in struct and array types. Nested columns in map types + * are not currently supported. + * + * @group typedrel + * @since 3.4.0 + */ + def unionByName(other: Dataset[T]): Dataset[T] = unionByName(other, allowMissingColumns = false) + + /** + * Returns a new Dataset containing union of rows in this Dataset and another Dataset. + * + * The difference between this function and [[union]] is that this function resolves columns by + * name (not by position). 
+ * + * When the parameter `allowMissingColumns` is `true`, the set of column names in this and other + * `Dataset` can differ; missing columns will be filled with null. Further, the missing columns + * of this `Dataset` will be added at the end in the schema of the union result: + * + * {{{ + * val df1 = Seq((1, 2, 3)).toDF("col0", "col1", "col2") + * val df2 = Seq((4, 5, 6)).toDF("col1", "col0", "col3") + * df1.unionByName(df2, true).show + * + * // output: "col3" is missing at left df1 and added at the end of schema. + * // +----+----+----+----+ + * // |col0|col1|col2|col3| + * // +----+----+----+----+ + * // | 1| 2| 3|null| + * // | 5| 4|null| 6| + * // +----+----+----+----+ + * + * df2.unionByName(df1, true).show + * + * // output: "col2" is missing at left df2 and added at the end of schema. + * // +----+----+----+----+ + * // |col1|col0|col3|col2| + * // +----+----+----+----+ + * // | 4| 5| 6|null| + * // | 2| 1|null| 3| + * // +----+----+----+----+ + * }}} + * + * Note that this supports nested columns in struct and array types. With `allowMissingColumns`, + * missing nested columns of struct columns with the same name will also be filled with null + * values and added to the end of struct. Nested columns in map types are not currently + * supported. + * + * @group typedrel + * @since 3.4.0 + */ + def unionByName(other: Dataset[T], allowMissingColumns: Boolean): Dataset[T] = { + buildSetOp(other, proto.SetOperation.SetOpType.SET_OP_TYPE_UNION) { builder => + builder.setByName(true).setIsAll(true).setAllowMissingColumns(allowMissingColumns) + } + } + + /** + * Returns a new Dataset containing rows only in both this Dataset and another Dataset. This is + * equivalent to `INTERSECT` in SQL. + * + * @note + * Equality checking is performed directly on the encoded representation of the data and thus + * is not affected by a custom `equals` function defined on `T`. + * + * @group typedrel + * @since 3.4.0 + */ + def intersect(other: Dataset[T]): Dataset[T] = { + buildSetOp(other, proto.SetOperation.SetOpType.SET_OP_TYPE_INTERSECT) { builder => + builder.setIsAll(false) + } + } + + /** + * Returns a new Dataset containing rows only in both this Dataset and another Dataset while + * preserving the duplicates. This is equivalent to `INTERSECT ALL` in SQL. + * + * @note + * Equality checking is performed directly on the encoded representation of the data and thus + * is not affected by a custom `equals` function defined on `T`. Also as standard in SQL, this + * function resolves columns by position (not by name). + * + * @group typedrel + * @since 3.4.0 + */ + def intersectAll(other: Dataset[T]): Dataset[T] = { + buildSetOp(other, proto.SetOperation.SetOpType.SET_OP_TYPE_INTERSECT) { builder => + builder.setIsAll(true) + } + } + + /** + * Returns a new Dataset containing rows in this Dataset but not in another Dataset. This is + * equivalent to `EXCEPT DISTINCT` in SQL. + * + * @note + * Equality checking is performed directly on the encoded representation of the data and thus + * is not affected by a custom `equals` function defined on `T`. + * + * @group typedrel + * @since 3.4.0 + */ + def except(other: Dataset[T]): Dataset[T] = { + buildSetOp(other, proto.SetOperation.SetOpType.SET_OP_TYPE_EXCEPT) { builder => + builder.setIsAll(false) + } + } + + /** + * Returns a new Dataset containing rows in this Dataset but not in another Dataset while + * preserving the duplicates. This is equivalent to `EXCEPT ALL` in SQL. 
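A hedged sketch contrasting the distinct and ALL variants of the set operations above, using toy data and a `spark` session assumed in scope:

```scala
val left  = spark.range(5).union(spark.range(5)) // ids 0..4, two copies of each
val right = spark.range(3)                       // ids 0, 1, 2

left.except(right).show()       // EXCEPT DISTINCT: 3, 4
left.exceptAll(right).show()    // EXCEPT ALL: 3, 3, 4, 4 plus one remaining copy each of 0, 1, 2
left.intersect(right).show()    // INTERSECT: 0, 1, 2
left.intersectAll(right).show() // INTERSECT ALL: 0, 1, 2 (one copy each, bounded by `right`)
```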
+ * + * @note + * Equality checking is performed directly on the encoded representation of the data and thus + * is not affected by a custom `equals` function defined on `T`. Also as standard in SQL, this + * function resolves columns by position (not by name). + * + * @group typedrel + * @since 3.4.0 + */ + def exceptAll(other: Dataset[T]): Dataset[T] = { + buildSetOp(other, proto.SetOperation.SetOpType.SET_OP_TYPE_EXCEPT) { builder => + builder.setIsAll(true) + } + } + + /** + * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement), using a + * user-supplied seed. + * + * @param fraction + * Fraction of rows to generate, range [0.0, 1.0]. + * @param seed + * Seed for sampling. + * + * @note + * This is NOT guaranteed to provide exactly the fraction of the count of the given + * [[Dataset]]. + * + * @group typedrel + * @since 3.4.0 + */ + def sample(fraction: Double, seed: Long): Dataset[T] = { + sample(withReplacement = false, fraction = fraction, seed = seed) + } + + /** + * Returns a new [[Dataset]] by sampling a fraction of rows (without replacement), using a + * random seed. + * + * @param fraction + * Fraction of rows to generate, range [0.0, 1.0]. + * + * @note + * This is NOT guaranteed to provide exactly the fraction of the count of the given + * [[Dataset]]. + * + * @group typedrel + * @since 3.4.0 + */ + def sample(fraction: Double): Dataset[T] = { + sample(withReplacement = false, fraction = fraction) + } + + /** + * Returns a new [[Dataset]] by sampling a fraction of rows, using a user-supplied seed. + * + * @param withReplacement + * Sample with replacement or not. + * @param fraction + * Fraction of rows to generate, range [0.0, 1.0]. + * @param seed + * Seed for sampling. + * + * @note + * This is NOT guaranteed to provide exactly the fraction of the count of the given + * [[Dataset]]. + * + * @group typedrel + * @since 3.4.0 + */ + def sample(withReplacement: Boolean, fraction: Double, seed: Long): Dataset[T] = { + sparkSession.newDataset(encoder) { builder => + builder.getSampleBuilder + .setInput(plan.getRoot) + .setWithReplacement(withReplacement) + .setLowerBound(0.0d) + .setUpperBound(fraction) + .setSeed(seed) + } + } + + /** + * Returns a new [[Dataset]] by sampling a fraction of rows, using a random seed. + * + * @param withReplacement + * Sample with replacement or not. + * @param fraction + * Fraction of rows to generate, range [0.0, 1.0]. + * + * @note + * This is NOT guaranteed to provide exactly the fraction of the total count of the given + * [[Dataset]]. + * + * @group typedrel + * @since 3.4.0 + */ + def sample(withReplacement: Boolean, fraction: Double): Dataset[T] = { + sample(withReplacement, fraction, Utils.random.nextLong) + } + + /** + * Randomly splits this Dataset with the provided weights. + * + * @param weights + * weights for splits, will be normalized if they don't sum to 1. + * @param seed + * Seed for sampling. + * + * For Java API, use [[randomSplitAsList]]. + * + * @group typedrel + * @since 3.4.0 + */ + def randomSplit(weights: Array[Double], seed: Long): Array[Dataset[T]] = { + require( + weights.forall(_ >= 0), + s"Weights must be nonnegative, but got ${weights.mkString("[", ",", "]")}") + require( + weights.sum > 0, + s"Sum of weights must be positive, but got ${weights.mkString("[", ",", "]")}") + + // It is possible that the underlying dataframe doesn't guarantee the ordering of rows in its + // constituent partitions each time a split is materialized which could result in + // overlapping splits. 
To prevent this, we explicitly sort each input partition to make the + // ordering deterministic. Note that MapTypes cannot be sorted and are explicitly pruned out + // from the sort order. + // TODO we need to have a proper way of stabilizing the input data. The current approach does + // not work well with spark connects' extremely lazy nature. When the schema is modified + // between construction and execution the query might fail or produce wrong results. Another + // problem can come from data that arrives between the execution of the returned datasets. + val sortOrder = schema.collect { + case f if RowOrdering.isOrderable(f.dataType) => col(f.name).asc + } + val sortedInput = sortWithinPartitions(sortOrder: _*).plan.getRoot + val sum = weights.sum + val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _) + normalizedCumWeights + .sliding(2) + .map { case Array(low, high) => + sparkSession.newDataset(encoder) { builder => + builder.getSampleBuilder + .setInput(sortedInput) + .setWithReplacement(false) + .setLowerBound(low) + .setUpperBound(high) + .setSeed(seed) + } + } + .toArray + } + + /** + * Returns a Java list that contains randomly split Dataset with the provided weights. + * + * @param weights + * weights for splits, will be normalized if they don't sum to 1. + * @param seed + * Seed for sampling. + * + * @group typedrel + * @since 3.4.0 + */ + def randomSplitAsList(weights: Array[Double], seed: Long): java.util.List[Dataset[T]] = { + val values = randomSplit(weights, seed) + java.util.Arrays.asList(values: _*) + } + + /** + * Randomly splits this Dataset with the provided weights. + * + * @param weights + * weights for splits, will be normalized if they don't sum to 1. + * @group typedrel + * @since 3.4.0 + */ + def randomSplit(weights: Array[Double]): Array[Dataset[T]] = { + randomSplit(weights, Utils.random.nextLong) + } + + private def withColumns(names: Seq[String], values: Seq[Column]): DataFrame = { + val aliases = values.zip(names).map { case (value, name) => + value.name(name).expr.getAlias + } + sparkSession.newDataFrame { builder => + builder.getWithColumnsBuilder + .setInput(plan.getRoot) + .addAllAliases(aliases.asJava) + } + } + + /** + * Returns a new Dataset by adding a column or replacing the existing column that has the same + * name. + * + * `column`'s expression must only refer to attributes supplied by this Dataset. It is an error + * to add a column that refers to some other Dataset. + * + * @note + * this method introduces a projection internally. Therefore, calling it multiple times, for + * instance, via loops in order to add multiple columns can generate big plans which can cause + * performance issues and even `StackOverflowException`. To avoid this, use `select` with the + * multiple columns at once. + * + * @group untypedrel + * @since 3.4.0 + */ + def withColumn(colName: String, col: Column): DataFrame = withColumns(Seq(colName), Seq(col)) + + /** + * (Scala-specific) Returns a new Dataset by adding columns or replacing the existing columns + * that has the same names. + * + * `colsMap` is a map of column name and column, the column must only refer to attributes + * supplied by this Dataset. It is an error to add columns that refers to some other Dataset. 
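+ *
+ * A short sketch of the map-based variant (column names and expressions are illustrative;
+ * `col` comes from `org.apache.spark.sql.functions`):
+ * {{{
+ *   df.withColumns(Map(
+ *     "price_with_tax" -> col("price") * 1.1,
+ *     "is_expensive"   -> col("price") > 100))
+ * }}}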
+ * + * @group untypedrel + * @since 3.4.0 + */ + def withColumns(colsMap: Map[String, Column]): DataFrame = { + val (colNames, newCols) = colsMap.toSeq.unzip + withColumns(colNames, newCols) + } + + /** + * (Java-specific) Returns a new Dataset by adding columns or replacing the existing columns + * that has the same names. + * + * `colsMap` is a map of column name and column, the column must only refer to attribute + * supplied by this Dataset. It is an error to add columns that refers to some other Dataset. + * + * @group untypedrel + * @since 3.4.0 + */ + def withColumns(colsMap: java.util.Map[String, Column]): DataFrame = withColumns( + colsMap.asScala.toMap) + + /** + * Returns a new Dataset with a column renamed. This is a no-op if schema doesn't contain + * existingName. + * + * @group untypedrel + * @since 3.4.0 + */ + def withColumnRenamed(existingName: String, newName: String): DataFrame = { + withColumnsRenamed(Collections.singletonMap(existingName, newName)) + } + + /** + * (Scala-specific) Returns a new Dataset with a columns renamed. This is a no-op if schema + * doesn't contain existingName. + * + * `colsMap` is a map of existing column name and new column name. + * + * @throws AnalysisException + * if there are duplicate names in resulting projection + * + * @group untypedrel + * @since 3.4.0 + */ + @throws[AnalysisException] + def withColumnsRenamed(colsMap: Map[String, String]): DataFrame = { + withColumnsRenamed(colsMap.asJava) + } + + /** + * (Java-specific) Returns a new Dataset with a columns renamed. This is a no-op if schema + * doesn't contain existingName. + * + * `colsMap` is a map of existing column name and new column name. + * + * @group untypedrel + * @since 3.4.0 + */ + def withColumnsRenamed(colsMap: java.util.Map[String, String]): DataFrame = { + sparkSession.newDataFrame { builder => + builder.getWithColumnsRenamedBuilder + .setInput(plan.getRoot) + .putAllRenameColumnsMap(colsMap) + } + } + + /** + * Returns a new Dataset by updating an existing column with metadata. + * + * @group untypedrel + * @since 3.4.0 + */ + def withMetadata(columnName: String, metadata: Metadata): DataFrame = { + val newAlias = proto.Expression.Alias + .newBuilder() + .setExpr(col(columnName).expr) + .addName(columnName) + .setMetadata(metadata.json) + sparkSession.newDataFrame { builder => + builder.getWithColumnsBuilder + .setInput(plan.getRoot) + .addAliases(newAlias) + } + } + + /** + * Registers this Dataset as a temporary table using the given name. The lifetime of this + * temporary table is tied to the [[SparkSession]] that was used to create this Dataset. + * + * @group basic + * @since 3.4.0 + */ + @deprecated("Use createOrReplaceTempView(viewName) instead.", "3.4.0") + def registerTempTable(tableName: String): Unit = { + createOrReplaceTempView(tableName) + } + + /** + * Creates a local temporary view using the given name. The lifetime of this temporary view is + * tied to the [[SparkSession]] that was used to create this Dataset. + * + * Local temporary view is session-scoped. Its lifetime is the lifetime of the session that + * created it, i.e. it will be automatically dropped when the session terminates. It's not tied + * to any databases, i.e. we can't use `db1.view1` to reference a local temporary view. 
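+ *
+ * A minimal sketch (the view name, column names, and the `spark` session value are
+ * illustrative):
+ * {{{
+ *   df.createTempView("people")
+ *   spark.sql("SELECT name FROM people WHERE age >= 21")
+ * }}}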
+ * + * @throws AnalysisException + * if the view name is invalid or already exists + * + * @group basic + * @since 3.4.0 + */ + @throws[AnalysisException] + def createTempView(viewName: String): Unit = { + buildAndExecuteTempView(viewName, replace = false, global = false) + } + + /** + * Creates a local temporary view using the given name. The lifetime of this temporary view is + * tied to the [[SparkSession]] that was used to create this Dataset. + * + * @group basic + * @since 3.4.0 + */ + def createOrReplaceTempView(viewName: String): Unit = { + buildAndExecuteTempView(viewName, replace = true, global = false) + } + + /** + * Creates a global temporary view using the given name. The lifetime of this temporary view is + * tied to this Spark application. + * + * Global temporary view is cross-session. Its lifetime is the lifetime of the Spark + * application, + * i.e. it will be automatically dropped when the application terminates. It's tied to a system + * preserved database `global_temp`, and we must use the qualified name to refer a global temp + * view, e.g. `SELECT * FROM global_temp.view1`. + * + * @throws AnalysisException + * if the view name is invalid or already exists + * + * @group basic + * @since 3.4.0 + */ + @throws[AnalysisException] + def createGlobalTempView(viewName: String): Unit = { + buildAndExecuteTempView(viewName, replace = false, global = true) + } + + /** + * Creates or replaces a global temporary view using the given name. The lifetime of this + * temporary view is tied to this Spark application. + * + * Global temporary view is cross-session. Its lifetime is the lifetime of the Spark + * application, + * i.e. it will be automatically dropped when the application terminates. It's tied to a system + * preserved database `global_temp`, and we must use the qualified name to refer a global temp + * view, e.g. `SELECT * FROM global_temp.view1`. + * + * @group basic + * @since 3.4.0 + */ + def createOrReplaceGlobalTempView(viewName: String): Unit = { + buildAndExecuteTempView(viewName, replace = true, global = true) + } + + private def buildAndExecuteTempView( + viewName: String, + replace: Boolean, + global: Boolean): Unit = { + val command = sparkSession.newCommand { builder => + builder.getCreateDataframeViewBuilder + .setInput(plan.getRoot) + .setName(viewName) + .setIsGlobal(global) + .setReplace(replace) + } + sparkSession.execute(command) + } + + /** + * Returns a new Dataset with a column dropped. This is a no-op if schema doesn't contain column + * name. + * + * This method can only be used to drop top level columns. the colName string is treated + * literally without further interpretation. + * + * @group untypedrel + * @since 3.4.0 + */ + def drop(colName: String): DataFrame = { + drop(Seq(colName): _*) + } + + /** + * Returns a new Dataset with columns dropped. This is a no-op if schema doesn't contain column + * name(s). + * + * This method can only be used to drop top level columns. the colName string is treated + * literally without further interpretation. + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def drop(colNames: String*): DataFrame = buildDropByNames(colNames) + + /** + * Returns a new Dataset with column dropped. + * + * This method can only be used to drop top level column. This version of drop accepts a + * [[Column]] rather than a name. This is a no-op if the Dataset doesn't have a column with an + * equivalent expression. 
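+ *
+ * For example (the column name is illustrative):
+ * {{{
+ *   df.drop(df("debug_payload"))   // by Column
+ *   df.drop("debug_payload")       // equivalent, by name
+ * }}}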
+ * + * @group untypedrel + * @since 3.4.0 + */ + def drop(col: Column): DataFrame = { + buildDrop(col :: Nil) + } + + /** + * Returns a new Dataset with columns dropped. + * + * This method can only be used to drop top level columns. This is a no-op if the Dataset + * doesn't have a columns with an equivalent expression. + * + * @group untypedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def drop(col: Column, cols: Column*): DataFrame = buildDrop(col +: cols) + + private def buildDrop(cols: Seq[Column]): DataFrame = sparkSession.newDataFrame { builder => + builder.getDropBuilder + .setInput(plan.getRoot) + .addAllColumns(cols.map(_.expr).asJava) + } + + private def buildDropByNames(cols: Seq[String]): DataFrame = sparkSession.newDataFrame { + builder => + builder.getDropBuilder + .setInput(plan.getRoot) + .addAllColumnNames(cols.asJava) + } + + /** + * Returns a new Dataset that contains only the unique rows from this Dataset. This is an alias + * for `distinct`. + * + * @group typedrel + * @since 3.4.0 + */ + def dropDuplicates(): Dataset[T] = sparkSession.newDataset(encoder) { builder => + builder.getDeduplicateBuilder + .setInput(plan.getRoot) + .setAllColumnsAsKeys(true) + } + + /** + * (Scala-specific) Returns a new Dataset with duplicate rows removed, considering only the + * subset of columns. + * + * @group typedrel + * @since 3.4.0 + */ + def dropDuplicates(colNames: Seq[String]): Dataset[T] = sparkSession.newDataset(encoder) { + builder => + builder.getDeduplicateBuilder + .setInput(plan.getRoot) + .addAllColumnNames(colNames.asJava) + } + + /** + * Returns a new Dataset with duplicate rows removed, considering only the subset of columns. + * + * @group typedrel + * @since 3.4.0 + */ + def dropDuplicates(colNames: Array[String]): Dataset[T] = dropDuplicates(colNames.toSeq) + + /** + * Returns a new [[Dataset]] with duplicate rows removed, considering only the subset of + * columns. + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def dropDuplicates(col1: String, cols: String*): Dataset[T] = { + val colNames: Seq[String] = col1 +: cols + dropDuplicates(colNames) + } + + /** + * Computes basic statistics for numeric and string columns, including count, mean, stddev, min, + * and max. If no columns are given, this function computes statistics for all numerical or + * string columns. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting Dataset. If you want to + * programmatically compute summary statistics, use the `agg` function instead. + * + * {{{ + * ds.describe("age", "height").show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // mean 53.3 178.05 + * // stddev 11.6 15.7 + * // min 18.0 163.0 + * // max 92.0 192.0 + * }}} + * + * Use [[summary]] for expanded statistics and control over which statistics to compute. + * + * @param cols + * Columns to compute statistics on. + * + * @group action + * @since 3.4.0 + */ + @scala.annotation.varargs + def describe(cols: String*): DataFrame = sparkSession.newDataFrame { builder => + builder.getDescribeBuilder + .setInput(plan.getRoot) + .addAllCols(cols.asJava) + } + + /** + * Computes specified statistics for numeric and string columns. Available statistics are:
+ * <ul>
+ *   <li>count</li>
+ *   <li>mean</li>
+ *   <li>stddev</li>
+ *   <li>min</li>
+ *   <li>max</li>
+ *   <li>arbitrary approximate percentiles specified as a percentage (e.g. 75%)</li>
+ *   <li>count_distinct</li>
+ *   <li>approx_count_distinct</li>
+ * </ul>
+ * + * If no statistics are given, this function computes count, mean, stddev, min, approximate + * quartiles (percentiles at 25%, 50%, and 75%), and max. + * + * This function is meant for exploratory data analysis, as we make no guarantee about the + * backward compatibility of the schema of the resulting Dataset. If you want to + * programmatically compute summary statistics, use the `agg` function instead. + * + * {{{ + * ds.summary().show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // mean 53.3 178.05 + * // stddev 11.6 15.7 + * // min 18.0 163.0 + * // 25% 24.0 176.0 + * // 50% 24.0 176.0 + * // 75% 32.0 180.0 + * // max 92.0 192.0 + * }}} + * + * {{{ + * ds.summary("count", "min", "25%", "75%", "max").show() + * + * // output: + * // summary age height + * // count 10.0 10.0 + * // min 18.0 163.0 + * // 25% 24.0 176.0 + * // 75% 32.0 180.0 + * // max 92.0 192.0 + * }}} + * + * To do a summary for specific columns first select them: + * + * {{{ + * ds.select("age", "height").summary().show() + * }}} + * + * Specify statistics to output custom summaries: + * + * {{{ + * ds.summary("count", "count_distinct").show() + * }}} + * + * The distinct count isn't included by default. + * + * You can also run approximate distinct counts which are faster: + * + * {{{ + * ds.summary("count", "approx_count_distinct").show() + * }}} + * + * See also [[describe]] for basic statistics. + * + * @param statistics + * Statistics from above list to be computed. + * + * @group action + * @since 3.4.0 + */ + @scala.annotation.varargs + def summary(statistics: String*): DataFrame = sparkSession.newDataFrame { builder => + builder.getSummaryBuilder + .setInput(plan.getRoot) + .addAllStatistics(statistics.asJava) + } + + /** + * Returns the first `n` rows. + * + * @note + * this method should only be used if the resulting array is expected to be small, as all the + * data is loaded into the driver's memory. + * + * @group action + * @since 3.4.0 + */ + def head(n: Int): Array[T] = limit(n).collect() + + /** + * Returns the first row. + * @group action + * @since 3.4.0 + */ + def head(): T = head(1).head + + /** + * Returns the first row. Alias for head(). + * @group action + * @since 3.4.0 + */ + def first(): T = head() + + /** + * Concise syntax for chaining custom transformations. + * {{{ + * def featurize(ds: Dataset[T]): Dataset[U] = ... + * + * ds + * .transform(featurize) + * .transform(...) + * }}} + * + * @group typedrel + * @since 3.4.0 + */ + def transform[U](t: Dataset[T] => Dataset[U]): Dataset[U] = t(this) + + /** + * Returns the first `n` rows in the Dataset. + * + * Running take requires moving data into the application's driver process, and doing so with a + * very large `n` can crash the driver process with OutOfMemoryError. + * + * @group action + * @since 3.4.0 + */ + def take(n: Int): Array[T] = head(n) + + /** + * Returns the last `n` rows in the Dataset. + * + * Running tail requires moving data into the application's driver process, and doing so with a + * very large `n` can crash the driver process with OutOfMemoryError. + * + * @group action + * @since 3.4.0 + */ + def tail(n: Int): Array[T] = { + val lastN = sparkSession.newDataset(encoder) { builder => + builder.getTailBuilder + .setInput(plan.getRoot) + .setLimit(n) + } + lastN.collect() + } + + /** + * Returns the first `n` rows in the Dataset as a list. 
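+ *
+ * For example (illustrative; for a `DataFrame` the element type is `Row`):
+ * {{{
+ *   val firstTen = df.takeAsList(10)   // java.util.List of the first 10 rows
+ * }}}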
+ * + * Running take requires moving data into the application's driver process, and doing so with a + * very large `n` can crash the driver process with OutOfMemoryError. + * + * @group action + * @since 3.4.0 + */ + def takeAsList(n: Int): java.util.List[T] = java.util.Arrays.asList(take(n): _*) + + /** + * Returns an array that contains all rows in this Dataset. + * + * Running collect requires moving all the data into the application's driver process, and doing + * so on a very large dataset can crash the driver process with OutOfMemoryError. + * + * For Java API, use [[collectAsList]]. + * + * @group action + * @since 3.4.0 + */ + def collect(): Array[T] = withResult { result => + result.toArray + } + + /** + * Returns a Java list that contains all rows in this Dataset. + * + * Running collect requires moving all the data into the application's driver process, and doing + * so on a very large dataset can crash the driver process with OutOfMemoryError. + * + * @group action + * @since 3.4.0 + */ + def collectAsList(): java.util.List[T] = { + java.util.Arrays.asList(collect(): _*) + } + + /** + * Returns an iterator that contains all rows in this Dataset. + * + * The returned iterator implements [[AutoCloseable]]. For memory management it is better to + * close it once you are done. If you don't close it, it and the underlying data will be cleaned + * up once the iterator is garbage collected. + * + * @group action + * @since 3.4.0 + */ + def toLocalIterator(): java.util.Iterator[T] = { + // TODO make this a destructive iterator. + collectResult().iterator + } + + /** + * Returns the number of rows in the Dataset. + * @group action + * @since 3.4.0 + */ + def count(): Long = { + groupBy().count().as(PrimitiveLongEncoder).collect().head + } + + private def buildRepartition(numPartitions: Int, shuffle: Boolean): Dataset[T] = { + sparkSession.newDataset(encoder) { builder => + builder.getRepartitionBuilder + .setInput(plan.getRoot) + .setNumPartitions(numPartitions) + .setShuffle(shuffle) + } + } + + private def buildRepartitionByExpression( + numPartitions: Option[Int], + partitionExprs: Seq[Column]): Dataset[T] = sparkSession.newDataset(encoder) { builder => + val repartitionBuilder = builder.getRepartitionByExpressionBuilder + .setInput(plan.getRoot) + .addAllPartitionExprs(partitionExprs.map(_.expr).asJava) + numPartitions.foreach(repartitionBuilder.setNumPartitions) + } + + /** + * Returns a new Dataset that has exactly `numPartitions` partitions. + * + * @group typedrel + * @since 3.4.0 + */ + def repartition(numPartitions: Int): Dataset[T] = { + buildRepartition(numPartitions, shuffle = true) + } + + private def repartitionByExpression( + numPartitions: Option[Int], + partitionExprs: Seq[Column]): Dataset[T] = { + // The underlying `LogicalPlan` operator special-cases all-`SortOrder` arguments. + // However, we don't want to complicate the semantics of this API method. + // Instead, let's give users a friendly error message, pointing them to the new method. + val sortOrders = partitionExprs.filter(_.expr.hasSortOrder) + if (sortOrders.nonEmpty) { + throw new IllegalArgumentException( + s"Invalid partitionExprs specified: $sortOrders\n" + + s"For range partitioning use repartitionByRange(...) instead.") + } + buildRepartitionByExpression(numPartitions, partitionExprs) + } + + /** + * Returns a new Dataset partitioned by the given partitioning expressions into `numPartitions`. + * The resulting Dataset is hash partitioned. 
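+ *
+ * For example (the partition count and column name are illustrative; `col` is from
+ * `org.apache.spark.sql.functions`):
+ * {{{
+ *   df.repartition(200, col("customer_id"))
+ * }}}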
+ * + * This is the same operation as "DISTRIBUTE BY" in SQL (Hive QL). + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def repartition(numPartitions: Int, partitionExprs: Column*): Dataset[T] = { + repartitionByExpression(Some(numPartitions), partitionExprs) + } + + /** + * Returns a new Dataset partitioned by the given partitioning expressions, using + * `spark.sql.shuffle.partitions` as number of partitions. The resulting Dataset is hash + * partitioned. + * + * This is the same operation as "DISTRIBUTE BY" in SQL (Hive QL). + * + * @group typedrel + * @since 2.0.0 + */ + @scala.annotation.varargs + def repartition(partitionExprs: Column*): Dataset[T] = { + repartitionByExpression(None, partitionExprs) + } + + private def repartitionByRange( + numPartitions: Option[Int], + partitionExprs: Seq[Column]): Dataset[T] = { + require(partitionExprs.nonEmpty, "At least one partition-by expression must be specified.") + val sortExprs = partitionExprs.map { + case e if e.expr.hasSortOrder => e + case e => e.asc + } + buildRepartitionByExpression(numPartitions, sortExprs) + } + + /** + * Returns a new Dataset partitioned by the given partitioning expressions into `numPartitions`. + * The resulting Dataset is range partitioned. + * + * At least one partition-by expression must be specified. When no explicit sort order is + * specified, "ascending nulls first" is assumed. Note, the rows are not sorted in each + * partition of the resulting Dataset. + * + * Note that due to performance reasons this method uses sampling to estimate the ranges. Hence, + * the output may not be consistent, since sampling can return different values. The sample size + * can be controlled by the config `spark.sql.execution.rangeExchange.sampleSizePerPartition`. + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def repartitionByRange(numPartitions: Int, partitionExprs: Column*): Dataset[T] = { + repartitionByRange(Some(numPartitions), partitionExprs) + } + + /** + * Returns a new Dataset partitioned by the given partitioning expressions, using + * `spark.sql.shuffle.partitions` as number of partitions. The resulting Dataset is range + * partitioned. + * + * At least one partition-by expression must be specified. When no explicit sort order is + * specified, "ascending nulls first" is assumed. Note, the rows are not sorted in each + * partition of the resulting Dataset. + * + * Note that due to performance reasons this method uses sampling to estimate the ranges. Hence, + * the output may not be consistent, since sampling can return different values. The sample size + * can be controlled by the config `spark.sql.execution.rangeExchange.sampleSizePerPartition`. + * + * @group typedrel + * @since 3.4.0 + */ + @scala.annotation.varargs + def repartitionByRange(partitionExprs: Column*): Dataset[T] = { + repartitionByRange(None, partitionExprs) + } + + /** + * Returns a new Dataset that has exactly `numPartitions` partitions, when the fewer partitions + * are requested. If a larger number of partitions is requested, it will stay at the current + * number of partitions. Similar to coalesce defined on an `RDD`, this operation results in a + * narrow dependency, e.g. if you go from 1000 partitions to 100 partitions, there will not be a + * shuffle, instead each of the 100 new partitions will claim 10 of the current partitions. + * + * However, if you're doing a drastic coalesce, e.g. 
to numPartitions = 1, this may result in + * your computation taking place on fewer nodes than you like (e.g. one node in the case of + * numPartitions = 1). To avoid this, you can call repartition. This will add a shuffle step, + * but means the current upstream partitions will be executed in parallel (per whatever the + * current partitioning is). + * + * @group typedrel + * @since 3.4.0 + */ + def coalesce(numPartitions: Int): Dataset[T] = { + buildRepartition(numPartitions, shuffle = false) + } + + /** + * Returns a new Dataset that contains only the unique rows from this Dataset. This is an alias + * for `dropDuplicates`. + * + * Note that for a streaming [[Dataset]], this method returns distinct rows only once regardless + * of the output mode, which the behavior may not be same with `DISTINCT` in SQL against + * streaming [[Dataset]]. + * + * @note + * Equality checking is performed directly on the encoded representation of the data and thus + * is not affected by a custom `equals` function defined on `T`. + * + * @group typedrel + * @since 3.4.0 + */ + def distinct(): Dataset[T] = dropDuplicates() + + /** + * Returns a best-effort snapshot of the files that compose this Dataset. This method simply + * asks each constituent BaseRelation for its respective files and takes the union of all + * results. Depending on the source relations, this may not find all input files. Duplicates are + * removed. + * + * @group basic + * @since 3.4.0 + */ + def inputFiles: Array[String] = + sparkSession + .analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.INPUT_FILES) + .getInputFiles + .getFilesList + .asScala + .toArray + + /** + * Interface for saving the content of the non-streaming Dataset out into external storage. + * + * @group basic + * @since 3.4.0 + */ + def write: DataFrameWriter[T] = { + new DataFrameWriter[T](this) + } + + /** + * Create a write configuration builder for v2 sources. + * + * This builder is used to configure and execute write operations. For example, to append to an + * existing table, run: + * + * {{{ + * df.writeTo("catalog.db.table").append() + * }}} + * + * This can also be used to create or replace existing tables: + * + * {{{ + * df.writeTo("catalog.db.table").partitionedBy($"col").createOrReplace() + * }}} + * + * @group basic + * @since 3.4.0 + */ + def writeTo(table: String): DataFrameWriterV2[T] = { + new DataFrameWriterV2[T](table, this) + } + + /** + * Persist this Dataset with the default storage level (`MEMORY_AND_DISK`). + * + * @group basic + * @since 3.4.0 + */ + def persist(): this.type = { + sparkSession.analyze { builder => + builder.getPersistBuilder.setRelation(plan.getRoot) + } + this + } + + /** + * Persist this Dataset with the given storage level. + * + * @param newLevel + * One of: `MEMORY_ONLY`, `MEMORY_AND_DISK`, `MEMORY_ONLY_SER`, `MEMORY_AND_DISK_SER`, + * `DISK_ONLY`, `MEMORY_ONLY_2`, `MEMORY_AND_DISK_2`, etc. + * @group basic + * @since 3.4.0 + */ + def persist(newLevel: StorageLevel): this.type = { + sparkSession.analyze { builder => + builder.getPersistBuilder + .setRelation(plan.getRoot) + .setStorageLevel(StorageLevelProtoConverter.toConnectProtoType(newLevel)) + } + this + } + + /** + * Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk. This + * will not un-persist any cached data that is built upon this Dataset. + * + * @param blocking + * Whether to block until all blocks are deleted. 
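+ *
+ * A typical sequence (illustrative only):
+ * {{{
+ *   df.persist()
+ *   df.count()                      // materializes and caches the result
+ *   df.unpersist(blocking = true)   // waits until the cached blocks are removed
+ * }}}
+ *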
+ * @group basic + * @since 3.4.0 + */ + def unpersist(blocking: Boolean): this.type = { + sparkSession.analyze { builder => + builder.getUnpersistBuilder + .setRelation(plan.getRoot) + .setBlocking(blocking) + } + this + } + + /** + * Mark the Dataset as non-persistent, and remove all blocks for it from memory and disk. This + * will not un-persist any cached data that is built upon this Dataset. + * + * @group basic + * @since 3.4.0 + */ + def unpersist(): this.type = unpersist(blocking = false) + + /** + * Persist this Dataset with the default storage level (`MEMORY_AND_DISK`). + * + * @group basic + * @since 3.4.0 + */ + def cache(): this.type = persist() + + /** + * Get the Dataset's current storage level, or StorageLevel.NONE if not persisted. + * + * @group basic + * @since 3.4.0 + */ + def storageLevel: StorageLevel = { + StorageLevelProtoConverter.toStorageLevel( + sparkSession + .analyze { builder => + builder.getGetStorageLevelBuilder.setRelation(plan.getRoot) + } + .getGetStorageLevel + .getStorageLevel) + } + + def withWatermark(eventTime: String, delayThreshold: String): Dataset[T] = { + throw new UnsupportedOperationException("withWatermark is not implemented.") + } + + def observe(name: String, expr: Column, exprs: Column*): Dataset[T] = { + throw new UnsupportedOperationException("observe is not implemented.") + } + + def foreach(f: T => Unit): Unit = { + throw new UnsupportedOperationException("foreach is not implemented.") + } + + def foreachPartition(f: Iterator[T] => Unit): Unit = { + throw new UnsupportedOperationException("foreach is not implemented.") + } + + def checkpoint(): Dataset[T] = { + throw new UnsupportedOperationException("checkpoint is not implemented.") + } + + def checkpoint(eager: Boolean): Dataset[T] = { + throw new UnsupportedOperationException("checkpoint is not implemented.") + } + + def localCheckpoint(): Dataset[T] = { + throw new UnsupportedOperationException("localCheckpoint is not implemented.") + } + + def localCheckpoint(eager: Boolean): Dataset[T] = { + throw new UnsupportedOperationException("localCheckpoint is not implemented.") + } + + /** + * Returns `true` when the logical query plans inside both [[Dataset]]s are equal and therefore + * return same results. + * + * @note + * The equality comparison here is simplified by tolerating the cosmetic differences such as + * attribute names. + * @note + * This API can compare both [[Dataset]]s but can still return `false` on the [[Dataset]] that + * return the same results, for instance, from different plans. Such false negative semantic + * can be useful when caching as an example. This comparison may not be fast because it will + * execute a RPC call. + * @since 3.4.0 + */ + @DeveloperApi + def sameSemantics(other: Dataset[T]): Boolean = { + sparkSession.sameSemantics(this.plan, other.plan) + } + + /** + * Returns a `hashCode` of the logical query plan against this [[Dataset]]. + * + * @note + * Unlike the standard `hashCode`, the hash is calculated against the query plan simplified by + * tolerating the cosmetic differences such as attribute names. 
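+ *
+ * For example (a sketch using two identical, trivially small plans):
+ * {{{
+ *   val a = df.limit(10)
+ *   val b = df.limit(10)
+ *   a.semanticHash() == b.semanticHash()   // expected to be true for equivalent plans
+ * }}}
+ *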
+ * @since 3.4.0 + */ + @DeveloperApi + def semanticHash(): Int = { + sparkSession.semanticHash(this.plan) + } + + def toJSON: Dataset[String] = { + select(to_json(struct(col("*")))).as(StringEncoder) + } + + private[sql] def analyze: proto.AnalyzePlanResponse = { + sparkSession.analyze(plan, proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA) + } + + def collectResult(): SparkResult[T] = sparkSession.execute(plan, encoder) + + private[sql] def withResult[E](f: SparkResult[T] => E): E = { + val result = collectResult() + try f(result) + finally { + result.close() + } + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DatasetHolder.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DatasetHolder.scala new file mode 100644 index 0000000000000..66f591bf1fb99 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/DatasetHolder.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +/** + * A container for a [[Dataset]], used for implicit conversions in Scala. + * + * To use this, import implicit conversions in SQL: + * {{{ + * val spark: SparkSession = ... + * import spark.implicits._ + * }}} + * + * @since 3.4.0 + */ +case class DatasetHolder[T] private[sql] (private val ds: Dataset[T]) { + + // This is declared with parentheses to prevent the Scala compiler from treating + // `rdd.toDS("1")` as invoking this toDS and then apply on the returned Dataset. + def toDS(): Dataset[T] = ds + + // This is declared with parentheses to prevent the Scala compiler from treating + // `rdd.toDF("1")` as invoking this toDF and then apply on the returned DataFrame. + def toDF(): DataFrame = ds.toDF() + + def toDF(colNames: String*): DataFrame = ds.toDF(colNames: _*) +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala new file mode 100644 index 0000000000000..5a10e1d52eb39 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RelationalGroupedDataset.scala @@ -0,0 +1,417 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.Locale + +import scala.collection.JavaConverters._ + +import org.apache.spark.connect.proto + +/** + * A set of methods for aggregations on a `DataFrame`, created by [[Dataset#groupBy groupBy]], + * [[Dataset#cube cube]] or [[Dataset#rollup rollup]] (and also `pivot`). + * + * The main method is the `agg` function, which has multiple variants. This class also contains + * some first-order statistics such as `mean`, `sum` for convenience. + * + * @note + * This class was named `GroupedData` in Spark 1.x. + * + * @since 3.4.0 + */ +class RelationalGroupedDataset private[sql] ( + private[sql] val df: DataFrame, + private[sql] val groupingExprs: Seq[proto.Expression], + groupType: proto.Aggregate.GroupType, + pivot: Option[proto.Aggregate.Pivot] = None) { + + private[this] def toDF(aggExprs: Seq[Column]): DataFrame = { + df.sparkSession.newDataFrame { builder => + builder.getAggregateBuilder + .setInput(df.plan.getRoot) + .addAllGroupingExpressions(groupingExprs.asJava) + .addAllAggregateExpressions(aggExprs.map(e => e.expr).asJava) + + groupType match { + case proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP => + builder.getAggregateBuilder.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP) + case proto.Aggregate.GroupType.GROUP_TYPE_CUBE => + builder.getAggregateBuilder.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_CUBE) + case proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY => + builder.getAggregateBuilder.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY) + case proto.Aggregate.GroupType.GROUP_TYPE_PIVOT => + assert(pivot.isDefined) + builder.getAggregateBuilder + .setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_PIVOT) + .setPivot(pivot.get) + case g => throw new UnsupportedOperationException(g.toString) + } + } + } + + /** + * (Scala-specific) Compute aggregates by specifying the column names and aggregate methods. The + * resulting `DataFrame` will also contain the grouping columns. + * + * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. + * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * df.groupBy("department").agg( + * "age" -> "max", + * "expense" -> "sum" + * ) + * }}} + * + * @since 3.4.0 + */ + def agg(aggExpr: (String, String), aggExprs: (String, String)*): DataFrame = { + toDF((aggExpr +: aggExprs).map { case (colName, expr) => + strToColumn(expr, df(colName)) + }) + } + + /** + * (Scala-specific) Compute aggregates by specifying a map from column name to aggregate + * methods. The resulting `DataFrame` will also contain the grouping columns. + * + * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. 
+ * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * df.groupBy("department").agg(Map( + * "age" -> "max", + * "expense" -> "sum" + * )) + * }}} + * + * @since 3.4.0 + */ + def agg(exprs: Map[String, String]): DataFrame = { + toDF(exprs.map { case (colName, expr) => + strToColumn(expr, df(colName)) + }.toSeq) + } + + /** + * (Java-specific) Compute aggregates by specifying a map from column name to aggregate methods. + * The resulting `DataFrame` will also contain the grouping columns. + * + * The available aggregate methods are `avg`, `max`, `min`, `sum`, `count`. + * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * import com.google.common.collect.ImmutableMap; + * df.groupBy("department").agg(ImmutableMap.of("age", "max", "expense", "sum")); + * }}} + * + * @since 3.4.0 + */ + def agg(exprs: java.util.Map[String, String]): DataFrame = { + agg(exprs.asScala.toMap) + } + + private[this] def strToColumn(expr: String, inputExpr: Column): Column = { + expr.toLowerCase(Locale.ROOT) match { + case "avg" | "average" | "mean" => functions.avg(inputExpr) + case "stddev" | "std" => functions.stddev(inputExpr) + case "count" | "size" => functions.count(inputExpr) + case name => Column.fn(name, inputExpr) + } + } + + /** + * Compute aggregates by specifying a series of aggregate columns. Note that this function by + * default retains the grouping columns in its output. To not retain grouping columns, set + * `spark.sql.retainGroupColumns` to false. + * + * The available aggregate methods are defined in [[org.apache.spark.sql.functions]]. + * + * {{{ + * // Selects the age of the oldest employee and the aggregate expense for each department + * + * // Scala: + * import org.apache.spark.sql.functions._ + * df.groupBy("department").agg(max("age"), sum("expense")) + * + * // Java: + * import static org.apache.spark.sql.functions.*; + * df.groupBy("department").agg(max("age"), sum("expense")); + * }}} + * + * Note that before Spark 1.4, the default behavior is to NOT retain grouping columns. To change + * to that behavior, set config variable `spark.sql.retainGroupColumns` to `false`. + * {{{ + * // Scala, 1.3.x: + * df.groupBy("department").agg($"department", max("age"), sum("expense")) + * + * // Java, 1.3.x: + * df.groupBy("department").agg(col("department"), max("age"), sum("expense")); + * }}} + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def agg(expr: Column, exprs: Column*): DataFrame = { + toDF((expr +: exprs).map { case c => + c + // TODO: deal with typed columns. + }) + } + + /** + * Count the number of rows for each group. The resulting `DataFrame` will also contain the + * grouping columns. + * + * @since 3.4.0 + */ + def count(): DataFrame = toDF(Seq(functions.count(functions.lit(1)).alias("count"))) + + /** + * Compute the average value for each numeric columns for each group. This is an alias for + * `avg`. The resulting `DataFrame` will also contain the grouping columns. When specified + * columns are given, only compute the average values for them. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def mean(colNames: String*): DataFrame = { + toDF(colNames.map(colName => functions.mean(colName))) + } + + /** + * Compute the max value for each numeric columns for each group. The resulting `DataFrame` will + * also contain the grouping columns. When specified columns are given, only compute the max + * values for them. 
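+ *
+ * For example (column names are illustrative):
+ * {{{
+ *   df.groupBy("department").max("age", "salary")
+ * }}}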
+ * + * @since 3.4.0 + */ + @scala.annotation.varargs + def max(colNames: String*): DataFrame = { + toDF(colNames.map(colName => functions.max(colName))) + } + + /** + * Compute the mean value for each numeric columns for each group. The resulting `DataFrame` + * will also contain the grouping columns. When specified columns are given, only compute the + * mean values for them. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def avg(colNames: String*): DataFrame = { + toDF(colNames.map(colName => functions.avg(colName))) + } + + /** + * Compute the min value for each numeric column for each group. The resulting `DataFrame` will + * also contain the grouping columns. When specified columns are given, only compute the min + * values for them. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def min(colNames: String*): DataFrame = { + toDF(colNames.map(colName => functions.min(colName))) + } + + /** + * Compute the sum for each numeric columns for each group. The resulting `DataFrame` will also + * contain the grouping columns. When specified columns are given, only compute the sum for + * them. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def sum(colNames: String*): DataFrame = { + toDF(colNames.map(colName => functions.sum(colName))) + } + + /** + * Pivots a column of the current `DataFrame` and performs the specified aggregation. + * + * There are two versions of `pivot` function: one that requires the caller to specify the list + * of distinct values to pivot on, and one that does not. The latter is more concise but less + * efficient, because Spark needs to first compute the list of distinct values internally. + * + * {{{ + * // Compute the sum of earnings for each year by course with each course as a separate column + * df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings") + * + * // Or without specifying column values (less efficient) + * df.groupBy("year").pivot("course").sum("earnings") + * }}} + * + * @see + * `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the + * aggregation. + * + * @param pivotColumn + * Name of the column to pivot. + * @since 3.4.0 + */ + def pivot(pivotColumn: String): RelationalGroupedDataset = pivot(Column(pivotColumn)) + + /** + * Pivots a column of the current `DataFrame` and performs the specified aggregation. There are + * two versions of pivot function: one that requires the caller to specify the list of distinct + * values to pivot on, and one that does not. The latter is more concise but less efficient, + * because Spark needs to first compute the list of distinct values internally. + * + * {{{ + * // Compute the sum of earnings for each year by course with each course as a separate column + * df.groupBy("year").pivot("course", Seq("dotNET", "Java")).sum("earnings") + * + * // Or without specifying column values (less efficient) + * df.groupBy("year").pivot("course").sum("earnings") + * }}} + * + * From Spark 3.0.0, values can be literal columns, for instance, struct. For pivoting by + * multiple columns, use the `struct` function to combine the columns and values: + * + * {{{ + * df.groupBy("year") + * .pivot("trainingCourse", Seq(struct(lit("java"), lit("Experts")))) + * .agg(sum($"earnings")) + * }}} + * + * @see + * `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the + * aggregation. + * + * @param pivotColumn + * Name of the column to pivot. + * @param values + * List of values that will be translated to columns in the output DataFrame. 
+ * @since 3.4.0 + */ + def pivot(pivotColumn: String, values: Seq[Any]): RelationalGroupedDataset = { + pivot(Column(pivotColumn), values) + } + + /** + * (Java-specific) Pivots a column of the current `DataFrame` and performs the specified + * aggregation. + * + * There are two versions of pivot function: one that requires the caller to specify the list of + * distinct values to pivot on, and one that does not. The latter is more concise but less + * efficient, because Spark needs to first compute the list of distinct values internally. + * + * {{{ + * // Compute the sum of earnings for each year by course with each course as a separate column + * df.groupBy("year").pivot("course", Arrays.asList("dotNET", "Java")).sum("earnings"); + * + * // Or without specifying column values (less efficient) + * df.groupBy("year").pivot("course").sum("earnings"); + * }}} + * + * @see + * `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the + * aggregation. + * + * @param pivotColumn + * Name of the column to pivot. + * @param values + * List of values that will be translated to columns in the output DataFrame. + * @since 3.4.0 + */ + def pivot(pivotColumn: String, values: java.util.List[Any]): RelationalGroupedDataset = { + pivot(Column(pivotColumn), values) + } + + /** + * Pivots a column of the current `DataFrame` and performs the specified aggregation. This is an + * overloaded version of the `pivot` method with `pivotColumn` of the `String` type. + * + * {{{ + * // Compute the sum of earnings for each year by course with each course as a separate column + * df.groupBy($"year").pivot($"course", Seq("dotNET", "Java")).sum($"earnings") + * }}} + * + * @see + * `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the + * aggregation. + * + * @param pivotColumn + * the column to pivot. + * @param values + * List of values that will be translated to columns in the output DataFrame. + * @since 3.4.0 + */ + def pivot(pivotColumn: Column, values: Seq[Any]): RelationalGroupedDataset = { + groupType match { + case proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY => + val valueExprs = values.map(_ match { + case c: Column if c.expr.hasLiteral => c.expr.getLiteral + case c: Column if !c.expr.hasLiteral => + throw new IllegalArgumentException("values only accept literal Column") + case v => functions.lit(v).expr.getLiteral + }) + new RelationalGroupedDataset( + df, + groupingExprs, + proto.Aggregate.GroupType.GROUP_TYPE_PIVOT, + Some( + proto.Aggregate.Pivot + .newBuilder() + .setCol(pivotColumn.expr) + .addAllValues(valueExprs.asJava) + .build())) + case _ => + throw new UnsupportedOperationException() + } + } + + /** + * Pivots a column of the current `DataFrame` and performs the specified aggregation. This is an + * overloaded version of the `pivot` method with `pivotColumn` of the `String` type. + * + * {{{ + * // Or without specifying column values (less efficient) + * df.groupBy($"year").pivot($"course").sum($"earnings"); + * }}} + * + * @see + * `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the + * aggregation. + * + * @param pivotColumn + * he column to pivot. + * @since 3.4.0 + */ + def pivot(pivotColumn: Column): RelationalGroupedDataset = { + pivot(pivotColumn, Seq()) + } + + /** + * (Java-specific) Pivots a column of the current `DataFrame` and performs the specified + * aggregation. This is an overloaded version of the `pivot` method with `pivotColumn` of the + * `String` type. 
+ * + * @see + * `org.apache.spark.sql.Dataset.unpivot` for the reverse operation, except for the + * aggregation. + * + * @param pivotColumn + * the column to pivot. + * @param values + * List of values that will be translated to columns in the output DataFrame. + * @since 3.4.0 + */ + def pivot(pivotColumn: Column, values: java.util.List[Any]): RelationalGroupedDataset = { + pivot(pivotColumn, values.asScala.toSeq) + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala new file mode 100644 index 0000000000000..f77dd512ef257 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/RuntimeConfig.scala @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import org.apache.spark.connect.proto.{ConfigRequest, ConfigResponse, KeyValue} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.connect.client.SparkConnectClient + +/** + * Runtime configuration interface for Spark. To access this, use `SparkSession.conf`. + * + * @since 3.4.0 + */ +class RuntimeConfig private[sql] (client: SparkConnectClient) extends Logging { + + /** + * Sets the given Spark runtime configuration property. + * + * @since 3.4.0 + */ + def set(key: String, value: String): Unit = { + executeConfigRequest { builder => + builder.getSetBuilder.addPairsBuilder().setKey(key).setValue(value) + } + } + + /** + * Sets the given Spark runtime configuration property. + * + * @since 3.4.0 + */ + def set(key: String, value: Boolean): Unit = set(key, String.valueOf(value)) + + /** + * Sets the given Spark runtime configuration property. + * + * @since 3.4.0 + */ + def set(key: String, value: Long): Unit = set(key, String.valueOf(value)) + + /** + * Returns the value of Spark runtime configuration property for the given key. + * + * @throws java.util.NoSuchElementException + * if the key is not set and does not have a default value + * @since 3.4.0 + */ + @throws[NoSuchElementException]("if the key is not set") + def get(key: String): String = getOption(key).getOrElse { + throw new NoSuchElementException(key) + } + + /** + * Returns the value of Spark runtime configuration property for the given key. + * + * @since 3.4.0 + */ + def get(key: String, default: String): String = { + executeConfigRequestSingleValue { builder => + builder.getGetWithDefaultBuilder.addPairsBuilder().setKey(key).setValue(default) + } + } + + /** + * Returns all properties set in this conf. 
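+ *
+ * For example (the key prefix and the `spark` session value are illustrative):
+ * {{{
+ *   spark.conf.getAll
+ *     .filter { case (key, _) => key.startsWith("spark.sql") }
+ *     .foreach { case (key, value) => println(s"$key = $value") }
+ * }}}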
+ * + * @since 3.4.0 + */ + def getAll: Map[String, String] = { + val response = executeConfigRequest { builder => + builder.getGetAllBuilder + } + val builder = Map.newBuilder[String, String] + response.getPairsList.forEach { kv => + require(kv.hasValue) + builder += ((kv.getKey, kv.getValue)) + } + builder.result() + } + + /** + * Returns the value of Spark runtime configuration property for the given key. + * + * @since 3.4.0 + */ + def getOption(key: String): Option[String] = { + val pair = executeConfigRequestSinglePair { builder => + builder.getGetOptionBuilder.addKeys(key) + } + if (pair.hasValue) { + Option(pair.getValue) + } else { + None + } + } + + /** + * Resets the configuration property for the given key. + * + * @since 3.4.0 + */ + def unset(key: String): Unit = { + executeConfigRequest { builder => + builder.getUnsetBuilder.addKeys(key) + } + } + + /** + * Indicates whether the configuration property with the given key is modifiable in the current + * session. + * + * @return + * `true` if the configuration property is modifiable. For static SQL, Spark Core, invalid + * (not existing) and other non-modifiable configuration properties, the returned value is + * `false`. + * @since 3.4.0 + */ + def isModifiable(key: String): Boolean = { + val modifiable = executeConfigRequestSingleValue { builder => + builder.getIsModifiableBuilder.addKeys(key) + } + java.lang.Boolean.valueOf(modifiable) + } + + private def executeConfigRequestSingleValue( + f: ConfigRequest.Operation.Builder => Unit): String = { + val pair = executeConfigRequestSinglePair(f) + require(pair.hasValue, "The returned pair does not have a value set") + pair.getValue + } + + private def executeConfigRequestSinglePair( + f: ConfigRequest.Operation.Builder => Unit): KeyValue = { + val response = executeConfigRequest(f) + require(response.getPairsCount == 1, "") + response.getPairs(0) + } + + private def executeConfigRequest(f: ConfigRequest.Operation.Builder => Unit): ConfigResponse = { + val builder = ConfigRequest.Operation.newBuilder() + f(builder) + val response = client.config(builder.build()) + response.getWarningsList.forEach { warning => + logWarning(warning) + } + response + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala new file mode 100644 index 0000000000000..6c626fd716d5b --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SQLImplicits.scala @@ -0,0 +1,289 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql + +import scala.collection.Map +import scala.language.implicitConversions +import scala.reflect.classTag +import scala.reflect.runtime.universe.TypeTag + +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, AgnosticEncoders} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders._ + +/** + * A collection of implicit methods for converting names and Symbols into [[Column]]s, and for + * converting common Scala objects into [[Dataset]]s. + * + * @since 3.4.0 + */ +abstract class SQLImplicits private[sql] (session: SparkSession) extends LowPrioritySQLImplicits { + + /** + * Converts $"col name" into a [[Column]]. + * + * @since 3.4.0 + */ + implicit class StringToColumn(val sc: StringContext) { + def $(args: Any*): ColumnName = { + new ColumnName(sc.s(args: _*)) + } + } + + /** + * An implicit conversion that turns a Scala `Symbol` into a [[Column]]. + * @since 3.4.0 + */ + implicit def symbolToColumn(s: Symbol): ColumnName = new ColumnName(s.name) + + /** @since 3.4.0 */ + implicit val newIntEncoder: Encoder[Int] = PrimitiveIntEncoder + + /** @since 3.4.0 */ + implicit val newLongEncoder: Encoder[Long] = PrimitiveLongEncoder + + /** @since 3.4.0 */ + implicit val newDoubleEncoder: Encoder[Double] = PrimitiveDoubleEncoder + + /** @since 3.4.0 */ + implicit val newFloatEncoder: Encoder[Float] = PrimitiveFloatEncoder + + /** @since 3.4.0 */ + implicit val newByteEncoder: Encoder[Byte] = PrimitiveByteEncoder + + /** @since 3.4.0 */ + implicit val newShortEncoder: Encoder[Short] = PrimitiveShortEncoder + + /** @since 3.4.0 */ + implicit val newBooleanEncoder: Encoder[Boolean] = PrimitiveBooleanEncoder + + /** @since 3.4.0 */ + implicit val newStringEncoder: Encoder[String] = StringEncoder + + /** @since 3.4.0 */ + implicit val newJavaDecimalEncoder: Encoder[java.math.BigDecimal] = + AgnosticEncoders.DEFAULT_JAVA_DECIMAL_ENCODER + + /** @since 3.4.0 */ + implicit val newScalaDecimalEncoder: Encoder[scala.math.BigDecimal] = + AgnosticEncoders.DEFAULT_SCALA_DECIMAL_ENCODER + + /** @since 3.4.0 */ + implicit val newDateEncoder: Encoder[java.sql.Date] = AgnosticEncoders.STRICT_DATE_ENCODER + + /** @since 3.4.0 */ + implicit val newLocalDateEncoder: Encoder[java.time.LocalDate] = + AgnosticEncoders.STRICT_LOCAL_DATE_ENCODER + + /** @since 3.4.0 */ + implicit val newLocalDateTimeEncoder: Encoder[java.time.LocalDateTime] = + AgnosticEncoders.LocalDateTimeEncoder + + /** @since 3.4.0 */ + implicit val newTimeStampEncoder: Encoder[java.sql.Timestamp] = + AgnosticEncoders.STRICT_TIMESTAMP_ENCODER + + /** @since 3.4.0 */ + implicit val newInstantEncoder: Encoder[java.time.Instant] = + AgnosticEncoders.STRICT_INSTANT_ENCODER + + /** @since 3.4.0 */ + implicit val newDurationEncoder: Encoder[java.time.Duration] = DayTimeIntervalEncoder + + /** @since 3.4.0 */ + implicit val newPeriodEncoder: Encoder[java.time.Period] = YearMonthIntervalEncoder + + /** @since 3.4.0 */ + implicit def newJavaEnumEncoder[A <: java.lang.Enum[_]: TypeTag]: Encoder[A] = { + ScalaReflection.encoderFor[A] + } + + // Boxed primitives + + /** @since 3.4.0 */ + implicit val newBoxedIntEncoder: Encoder[java.lang.Integer] = BoxedIntEncoder + + /** @since 3.4.0 */ + implicit val newBoxedLongEncoder: Encoder[java.lang.Long] = BoxedLongEncoder + + /** @since 3.4.0 */ + implicit val newBoxedDoubleEncoder: Encoder[java.lang.Double] = BoxedDoubleEncoder + + /** @since 3.4.0 */ + implicit val newBoxedFloatEncoder: Encoder[java.lang.Float] 
= BoxedFloatEncoder + + /** @since 3.4.0 */ + implicit val newBoxedByteEncoder: Encoder[java.lang.Byte] = BoxedByteEncoder + + /** @since 3.4.0 */ + implicit val newBoxedShortEncoder: Encoder[java.lang.Short] = BoxedShortEncoder + + /** @since 3.4.0 */ + implicit val newBoxedBooleanEncoder: Encoder[java.lang.Boolean] = BoxedBooleanEncoder + + // Seqs + private def newSeqEncoder[E](elementEncoder: AgnosticEncoder[E]): AgnosticEncoder[Seq[E]] = { + IterableEncoder( + classTag[Seq[E]], + elementEncoder, + elementEncoder.nullable, + elementEncoder.lenientSerialization) + } + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newIntSeqEncoder: Encoder[Seq[Int]] = newSeqEncoder(PrimitiveIntEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newLongSeqEncoder: Encoder[Seq[Long]] = newSeqEncoder(PrimitiveLongEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newDoubleSeqEncoder: Encoder[Seq[Double]] = newSeqEncoder(PrimitiveDoubleEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newFloatSeqEncoder: Encoder[Seq[Float]] = newSeqEncoder(PrimitiveFloatEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newByteSeqEncoder: Encoder[Seq[Byte]] = newSeqEncoder(PrimitiveByteEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newShortSeqEncoder: Encoder[Seq[Short]] = newSeqEncoder(PrimitiveShortEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newBooleanSeqEncoder: Encoder[Seq[Boolean]] = newSeqEncoder(PrimitiveBooleanEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + val newStringSeqEncoder: Encoder[Seq[String]] = newSeqEncoder(StringEncoder) + + /** + * @since 3.4.0 + * @deprecated + * use [[newSequenceEncoder]] + */ + def newProductSeqEncoder[A <: Product: TypeTag]: Encoder[Seq[A]] = + newSeqEncoder(ScalaReflection.encoderFor[A]) + + /** @since 3.4.0 */ + implicit def newSequenceEncoder[T <: Seq[_]: TypeTag]: Encoder[T] = + ScalaReflection.encoderFor[T] + + // Maps + /** @since 3.4.0 */ + implicit def newMapEncoder[T <: Map[_, _]: TypeTag]: Encoder[T] = ScalaReflection.encoderFor[T] + + /** + * Notice that we serialize `Set` to Catalyst array. The set property is only kept when + * manipulating the domain objects. The serialization format doesn't keep the set property. When + * we have a Catalyst array which contains duplicated elements and convert it to + * `Dataset[Set[T]]` by using the encoder, the elements will be de-duplicated. 
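+   *
+   * Sketch (assumes `spark.implicits._` is in scope):
+   * {{{
+   *   // Stored as an array column; any duplicates in the stored array collapse when read back.
+   *   val ds: Dataset[Set[Int]] = Seq(Set(1, 2, 3)).toDS()
+   * }}}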
+ * + * @since 3.4.0 + */ + implicit def newSetEncoder[T <: Set[_]: TypeTag]: Encoder[T] = ScalaReflection.encoderFor[T] + + // Arrays + private def newArrayEncoder[E]( + elementEncoder: AgnosticEncoder[E]): AgnosticEncoder[Array[E]] = { + ArrayEncoder(elementEncoder, elementEncoder.nullable) + } + + /** @since 3.4.0 */ + implicit val newIntArrayEncoder: Encoder[Array[Int]] = newArrayEncoder(PrimitiveIntEncoder) + + /** @since 3.4.0 */ + implicit val newLongArrayEncoder: Encoder[Array[Long]] = newArrayEncoder(PrimitiveLongEncoder) + + /** @since 3.4.0 */ + implicit val newDoubleArrayEncoder: Encoder[Array[Double]] = + newArrayEncoder(PrimitiveDoubleEncoder) + + /** @since 3.4.0 */ + implicit val newFloatArrayEncoder: Encoder[Array[Float]] = newArrayEncoder( + PrimitiveFloatEncoder) + + /** @since 3.4.0 */ + implicit val newByteArrayEncoder: Encoder[Array[Byte]] = BinaryEncoder + + /** @since 3.4.0 */ + implicit val newShortArrayEncoder: Encoder[Array[Short]] = newArrayEncoder( + PrimitiveShortEncoder) + + /** @since 3.4.0 */ + implicit val newBooleanArrayEncoder: Encoder[Array[Boolean]] = + newArrayEncoder(PrimitiveBooleanEncoder) + + /** @since 3.4.0 */ + implicit val newStringArrayEncoder: Encoder[Array[String]] = newArrayEncoder(StringEncoder) + + /** @since 3.4.0 */ + implicit def newProductArrayEncoder[A <: Product: TypeTag]: Encoder[Array[A]] = { + newArrayEncoder(ScalaReflection.encoderFor[A]) + } + + /** + * Creates a [[Dataset]] from a local Seq. + * @since 3.4.0 + */ + implicit def localSeqToDatasetHolder[T: Encoder](s: Seq[T]): DatasetHolder[T] = { + DatasetHolder(session.createDataset(s)) + } +} + +/** + * Lower priority implicit methods for converting Scala objects into [[Dataset]]s. Conflicting + * implicits are placed here to disambiguate resolution. + * + * Reasons for including specific implicits: newProductEncoder - to disambiguate for `List`s which + * are both `Seq` and `Product` + */ +trait LowPrioritySQLImplicits { + + /** @since 3.4.0 */ + implicit def newProductEncoder[T <: Product: TypeTag]: Encoder[T] = + ScalaReflection.encoderFor[T] +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala new file mode 100644 index 0000000000000..2d6781dd69c8b --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/SparkSession.scala @@ -0,0 +1,561 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql
+
+import java.io.Closeable
+import java.net.URI
+import java.util.concurrent.TimeUnit._
+import java.util.concurrent.atomic.AtomicLong
+
+import scala.collection.JavaConverters._
+import scala.reflect.runtime.universe.TypeTag
+
+import org.apache.arrow.memory.RootAllocator
+
+import org.apache.spark.annotation.{DeveloperApi, Experimental}
+import org.apache.spark.connect.proto
+import org.apache.spark.internal.Logging
+import org.apache.spark.sql.catalyst.{JavaTypeInference, ScalaReflection}
+import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, RowEncoder}
+import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.{BoxedLongEncoder, UnboundRowEncoder}
+import org.apache.spark.sql.connect.client.{SparkConnectClient, SparkResult}
+import org.apache.spark.sql.connect.client.util.{Cleaner, ConvertToArrow}
+import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto
+import org.apache.spark.sql.types.StructType
+
+/**
+ * The entry point to programming Spark with the Dataset and DataFrame API.
+ *
+ * In environments where this has been created upfront (e.g. REPL, notebooks), use the builder to
+ * get an existing session:
+ *
+ * {{{
+ *   SparkSession.builder().getOrCreate()
+ * }}}
+ *
+ * The builder can also be used to create a new session:
+ *
+ * {{{
+ *   SparkSession.builder
+ *     .master("local")
+ *     .appName("Word Count")
+ *     .config("spark.some.config.option", "some-value")
+ *     .getOrCreate()
+ * }}}
+ */
+class SparkSession private[sql] (
+    private val client: SparkConnectClient,
+    private val cleaner: Cleaner,
+    private val planIdGenerator: AtomicLong)
+    extends Serializable
+    with Closeable
+    with Logging {
+
+  private[this] val allocator = new RootAllocator()
+
+  lazy val version: String = {
+    client.analyze(proto.AnalyzePlanRequest.AnalyzeCase.SPARK_VERSION).getSparkVersion.getVersion
+  }
+
+  /**
+   * Runtime configuration interface for Spark.
+   *
+   * This is the interface through which the user can get and set all Spark configurations that
+   * are relevant to Spark SQL. When getting the value of a config, this defaults to the value
+   * set in the server, if any.
+   *
+   * @since 3.4.0
+   */
+  val conf: RuntimeConfig = new RuntimeConfig(client)
+
+  /**
+   * Executes some code block and prints to stdout the time taken to execute the block. This is
+   * available in Scala only and is used primarily for interactive testing and debugging.
+   *
+   * @since 3.4.0
+   */
+  def time[T](f: => T): T = {
+    val start = System.nanoTime()
+    val ret = f
+    val end = System.nanoTime()
+    // scalastyle:off println
+    println(s"Time taken: ${NANOSECONDS.toMillis(end - start)} ms")
+    // scalastyle:on println
+    ret
+  }
+
+  /**
+   * Returns a `DataFrame` with no rows or columns.
+   *
+   * @since 3.4.0
+   */
+  @transient
+  val emptyDataFrame: DataFrame = emptyDataset(UnboundRowEncoder)
+
+  /**
+   * Creates a new [[Dataset]] of type T containing zero elements.
+   *
+   * @since 3.4.0
+   */
+  def emptyDataset[T: Encoder]: Dataset[T] = createDataset[T](Nil)
+
+  private def createDataset[T](encoder: AgnosticEncoder[T], data: Iterator[T]): Dataset[T] = {
+    newDataset(encoder) { builder =>
+      val localRelationBuilder = builder.getLocalRelationBuilder
+        .setSchema(encoder.schema.json)
+      if (data.nonEmpty) {
+        val timeZoneId = conf.get("spark.sql.session.timeZone")
+        val arrowData = ConvertToArrow(encoder, data, timeZoneId, allocator)
+        localRelationBuilder.setData(arrowData)
+      }
+    }
+  }
+
+  /**
+   * Creates a `DataFrame` from a local Seq of Product.
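+   *
+   * A small sketch (the case class is assumed for illustration):
+   * {{{
+   *   case class Person(name: String, age: Long)
+   *   val df = spark.createDataFrame(Seq(Person("Ann", 29), Person("Bob", 31)))
+   * }}}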
+ * + * @since 3.4.0 + */ + def createDataFrame[A <: Product: TypeTag](data: Seq[A]): DataFrame = { + createDataset(ScalaReflection.encoderFor[A], data.iterator).toDF() + } + + /** + * :: DeveloperApi :: Creates a `DataFrame` from a `java.util.List` containing [[Row]]s using + * the given schema. It is important to make sure that the structure of every [[Row]] of the + * provided List matches the provided schema. Otherwise, there will be runtime exception. + * + * @since 3.4.0 + */ + def createDataFrame(rows: java.util.List[Row], schema: StructType): DataFrame = { + createDataset(RowEncoder.encoderFor(schema), rows.iterator().asScala).toDF() + } + + /** + * Applies a schema to a List of Java Beans. + * + * WARNING: Since there is no guaranteed ordering for fields in a Java Bean, SELECT * queries + * will return the columns in an undefined order. + * @since 3.4.0 + */ + def createDataFrame(data: java.util.List[_], beanClass: Class[_]): DataFrame = { + val encoder = JavaTypeInference.encoderFor(beanClass.asInstanceOf[Class[Any]]) + createDataset(encoder, data.iterator().asScala).toDF() + } + + /** + * Creates a [[Dataset]] from a local Seq of data of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL + * representation) that is generally created automatically through implicits from a + * `SparkSession`, or can be created explicitly by calling static methods on [[Encoders]]. + * + * ==Example== + * + * {{{ + * + * import spark.implicits._ + * case class Person(name: String, age: Long) + * val data = Seq(Person("Michael", 29), Person("Andy", 30), Person("Justin", 19)) + * val ds = spark.createDataset(data) + * + * ds.show() + * // +-------+---+ + * // | name|age| + * // +-------+---+ + * // |Michael| 29| + * // | Andy| 30| + * // | Justin| 19| + * // +-------+---+ + * }}} + * + * @since 3.4.0 + */ + def createDataset[T: Encoder](data: Seq[T]): Dataset[T] = { + createDataset(encoderFor[T], data.iterator) + } + + /** + * Creates a [[Dataset]] from a `java.util.List` of a given type. This method requires an + * encoder (to convert a JVM object of type `T` to and from the internal Spark SQL + * representation) that is generally created automatically through implicits from a + * `SparkSession`, or can be created explicitly by calling static methods on [[Encoders]]. + * + * ==Java Example== + * + * {{{ + * List data = Arrays.asList("hello", "world"); + * Dataset ds = spark.createDataset(data, Encoders.STRING()); + * }}} + * + * @since 3.4.0 + */ + def createDataset[T: Encoder](data: java.util.List[T]): Dataset[T] = { + createDataset(data.asScala.toSeq) + } + + /** + * Executes a SQL query substituting named parameters by the given arguments, returning the + * result as a `DataFrame`. This API eagerly runs DDL/DML commands, but not for SELECT queries. + * + * @param sqlText + * A SQL statement with named parameters to execute. + * @param args + * A map of parameter names to Java/Scala objects that can be converted to SQL literal + * expressions. See + * Supported Data Types for supported value types in Scala/Java. For example, map keys: + * "rank", "name", "birthdate"; map values: 1, "Steven", LocalDate.of(2023, 4, 2). Map value + * can be also a `Column` of literal expression, in that case it is taken as is. 
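+   *
+   * Hypothetical example (table name and the `:name` parameter-marker syntax are assumed):
+   * {{{
+   *   spark.sql("SELECT * FROM people WHERE age > :minAge", Map("minAge" -> 18))
+   * }}}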
+ * + * @since 3.4.0 + */ + @Experimental + def sql(sqlText: String, args: Map[String, Any]): DataFrame = { + sql(sqlText, args.asJava) + } + + /** + * Executes a SQL query substituting named parameters by the given arguments, returning the + * result as a `DataFrame`. This API eagerly runs DDL/DML commands, but not for SELECT queries. + * + * @param sqlText + * A SQL statement with named parameters to execute. + * @param args + * A map of parameter names to Java/Scala objects that can be converted to SQL literal + * expressions. See + * Supported Data Types for supported value types in Scala/Java. For example, map keys: + * "rank", "name", "birthdate"; map values: 1, "Steven", LocalDate.of(2023, 4, 2). Map value + * can be also a `Column` of literal expression, in that case it is taken as is. + * + * @since 3.4.0 + */ + @Experimental + def sql(sqlText: String, args: java.util.Map[String, Any]): DataFrame = newDataFrame { + builder => + // Send the SQL once to the server and then check the output. + val cmd = newCommand(b => + b.setSqlCommand( + proto.SqlCommand + .newBuilder() + .setSql(sqlText) + .putAllArgs(args.asScala.mapValues(toLiteralProto).toMap.asJava))) + val plan = proto.Plan.newBuilder().setCommand(cmd) + val responseIter = client.execute(plan.build()) + + val response = responseIter.asScala + .find(_.hasSqlCommandResult) + .getOrElse(throw new RuntimeException("SQLCommandResult must be present")) + + // Update the builder with the values from the result. + builder.mergeFrom(response.getSqlCommandResult.getRelation) + } + + /** + * Executes a SQL query using Spark, returning the result as a `DataFrame`. This API eagerly + * runs DDL/DML commands, but not for SELECT queries. + * + * @since 3.4.0 + */ + def sql(query: String): DataFrame = { + sql(query, Map.empty[String, String]) + } + + /** + * Returns a [[DataFrameReader]] that can be used to read non-streaming data in as a + * `DataFrame`. + * {{{ + * sparkSession.read.parquet("/path/to/file.parquet") + * sparkSession.read.schema(schema).json("/path/to/file.json") + * }}} + * + * @since 3.4.0 + */ + def read: DataFrameReader = new DataFrameReader(this) + + /** + * Returns the specified table/view as a `DataFrame`. If it's a table, it must support batch + * reading and the returned DataFrame is the batch scan query plan of this table. If it's a + * view, the returned DataFrame is simply the query plan of the view, which can either be a + * batch or streaming query plan. + * + * @param tableName + * is either a qualified or unqualified name that designates a table or view. If a database is + * specified, it identifies the table/view from the database. Otherwise, it first attempts to + * find a temporary view with the given name and then match the table/view from the current + * database. Note that, the global temporary view database is also valid here. + * @since 3.4.0 + */ + def table(tableName: String): DataFrame = { + read.table(tableName) + } + + /** + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a + * range from 0 to `end` (exclusive) with step value 1. + * + * @since 3.4.0 + */ + def range(end: Long): Dataset[java.lang.Long] = range(0, end) + + /** + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a + * range from `start` to `end` (exclusive) with step value 1. 
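+   *
+   * For example (sketch): {{{ spark.range(1, 5).collect() }}} yields the values 1, 2, 3 and 4.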
+ * + * @since 3.4.0 + */ + def range(start: Long, end: Long): Dataset[java.lang.Long] = { + range(start, end, step = 1) + } + + /** + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a + * range from `start` to `end` (exclusive) with a step value. + * + * @since 3.4.0 + */ + def range(start: Long, end: Long, step: Long): Dataset[java.lang.Long] = { + range(start, end, step, None) + } + + /** + * Creates a [[Dataset]] with a single `LongType` column named `id`, containing elements in a + * range from `start` to `end` (exclusive) with a step value, with partition number specified. + * + * @since 3.4.0 + */ + def range(start: Long, end: Long, step: Long, numPartitions: Int): Dataset[java.lang.Long] = { + range(start, end, step, Option(numPartitions)) + } + + // scalastyle:off + // Disable style checker so "implicits" object can start with lowercase i + /** + * (Scala-specific) Implicit methods available in Scala for converting common names and + * [[Symbol]]s into [[Column]]s, and for converting common Scala objects into `DataFrame`s. + * + * {{{ + * val sparkSession = SparkSession.builder.getOrCreate() + * import sparkSession.implicits._ + * }}} + * + * @since 3.4.0 + */ + object implicits extends SQLImplicits(this) + // scalastyle:on + + def newSession(): SparkSession = { + SparkSession.builder().client(client.copy()).build() + } + + private def range( + start: Long, + end: Long, + step: Long, + numPartitions: Option[Int]): Dataset[java.lang.Long] = { + newDataset(BoxedLongEncoder) { builder => + val rangeBuilder = builder.getRangeBuilder + .setStart(start) + .setEnd(end) + .setStep(step) + numPartitions.foreach(rangeBuilder.setNumPartitions) + } + } + + private[sql] def newDataFrame(f: proto.Relation.Builder => Unit): DataFrame = { + newDataset(UnboundRowEncoder)(f) + } + + private[sql] def newDataset[T](encoder: AgnosticEncoder[T])( + f: proto.Relation.Builder => Unit): Dataset[T] = { + val builder = proto.Relation.newBuilder() + f(builder) + builder.getCommonBuilder.setPlanId(planIdGenerator.getAndIncrement()) + val plan = proto.Plan.newBuilder().setRoot(builder).build() + new Dataset[T](this, plan, encoder) + } + + @DeveloperApi + def newDataFrame(extension: com.google.protobuf.Any): DataFrame = { + newDataset(extension, UnboundRowEncoder) + } + + @DeveloperApi + def newDataset[T]( + extension: com.google.protobuf.Any, + encoder: AgnosticEncoder[T]): Dataset[T] = { + newDataset(encoder)(_.setExtension(extension)) + } + + private[sql] def newCommand[T](f: proto.Command.Builder => Unit): proto.Command = { + val builder = proto.Command.newBuilder() + f(builder) + builder.build() + } + + private[sql] def analyze( + plan: proto.Plan, + method: proto.AnalyzePlanRequest.AnalyzeCase, + explainMode: Option[proto.AnalyzePlanRequest.Explain.ExplainMode] = None) + : proto.AnalyzePlanResponse = { + client.analyze(method, Some(plan), explainMode) + } + + private[sql] def analyze( + f: proto.AnalyzePlanRequest.Builder => Unit): proto.AnalyzePlanResponse = { + val builder = proto.AnalyzePlanRequest.newBuilder() + f(builder) + client.analyze(builder) + } + + private[sql] def sameSemantics(plan: proto.Plan, otherPlan: proto.Plan): Boolean = { + client.sameSemantics(plan, otherPlan).getSameSemantics.getResult + } + + private[sql] def semanticHash(plan: proto.Plan): Int = { + client.semanticHash(plan).getSemanticHash.getResult + } + + private[sql] def execute[T](plan: proto.Plan, encoder: AgnosticEncoder[T]): SparkResult[T] = { + val value = client.execute(plan) + 
val result = new SparkResult(value, allocator, encoder) + cleaner.register(result) + result + } + + private[sql] def execute(command: proto.Command): Unit = { + val plan = proto.Plan.newBuilder().setCommand(command).build() + client.execute(plan).asScala.foreach(_ => ()) + } + + @DeveloperApi + def execute(extension: com.google.protobuf.Any): Unit = { + val command = proto.Command.newBuilder().setExtension(extension).build() + execute(command) + } + + /** + * Add a single artifact to the client session. + * + * Currently only local files with extensions .jar and .class are supported. + * + * @since 3.4.0 + */ + @Experimental + def addArtifact(path: String): Unit = client.addArtifact(path) + + /** + * Add a single artifact to the client session. + * + * Currently only local files with extensions .jar and .class are supported. + * + * @since 3.4.0 + */ + @Experimental + def addArtifact(uri: URI): Unit = client.addArtifact(uri) + + /** + * Add one or more artifacts to the session. + * + * Currently only local files with extensions .jar and .class are supported. + * + * @since 3.4.0 + */ + @Experimental + @scala.annotation.varargs + def addArtifacts(uri: URI*): Unit = client.addArtifacts(uri) + + /** + * This resets the plan id generator so we can produce plans that are comparable. + * + * For testing only! + */ + private[sql] def resetPlanIdGenerator(): Unit = { + planIdGenerator.set(0) + } + + /** + * Synonym for `close()`. + * + * @since 3.4.0 + */ + def stop(): Unit = close() + + /** + * Close the [[SparkSession]]. This closes the connection, and the allocator. The latter will + * throw an exception if there are still open [[SparkResult]]s. + * + * @since 3.4.0 + */ + override def close(): Unit = { + client.shutdown() + allocator.close() + } +} + +// The minimal builder needed to create a spark session. 
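+// For illustration, a hedged construction sketch (the connection string is assumed):
+//   val spark = SparkSession.builder().remote("sc://localhost:15002").build()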
+// TODO: implements all methods mentioned in the scaladoc of [[SparkSession]] +object SparkSession extends Logging { + private val planIdGenerator = new AtomicLong + + def builder(): Builder = new Builder() + + private[sql] lazy val cleaner = { + val cleaner = new Cleaner + cleaner.start() + cleaner + } + + class Builder() extends Logging { + private var _client: SparkConnectClient = _ + + def remote(connectionString: String): Builder = { + client(SparkConnectClient.builder().connectionString(connectionString).build()) + this + } + + private[sql] def client(client: SparkConnectClient): Builder = { + _client = client + this + } + + def build(): SparkSession = { + if (_client == null) { + _client = SparkConnectClient.builder().build() + } + new SparkSession(_client, cleaner, planIdGenerator) + } + } + + def getActiveSession: Option[SparkSession] = { + throw new UnsupportedOperationException("getActiveSession is not supported") + } + + def getDefaultSession: Option[SparkSession] = { + throw new UnsupportedOperationException("getDefaultSession is not supported") + } + + def setActiveSession(session: SparkSession): Unit = { + throw new UnsupportedOperationException("setActiveSession is not supported") + } + + def clearActiveSession(): Unit = { + throw new UnsupportedOperationException("clearActiveSession is not supported") + } + + def active: SparkSession = { + throw new UnsupportedOperationException("active is not supported") + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala new file mode 100644 index 0000000000000..ec31697ee59e2 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/application/ConnectRepl.scala @@ -0,0 +1,88 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.application + +import scala.util.control.NonFatal + +import ammonite.compiler.CodeClassWrapper +import ammonite.util.Bind + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connect.client.{SparkConnectClient, SparkConnectClientParser} + +/** + * REPL for spark connect. + */ +@DeveloperApi +object ConnectRepl { + private val name = "Spark Connect REPL" + + private val splash = + """ + |Spark session available as 'spark'. 
+ | _____ __ ______ __ + | / ___/____ ____ ______/ /__ / ____/___ ____ ____ ___ _____/ /_ + | \__ \/ __ \/ __ `/ ___/ //_/ / / / __ \/ __ \/ __ \/ _ \/ ___/ __/ + | ___/ / /_/ / /_/ / / / ,< / /___/ /_/ / / / / / / / __/ /__/ /_ + |/____/ .___/\__,_/_/ /_/|_| \____/\____/_/ /_/_/ /_/\___/\___/\__/ + | /_/ + |""".stripMargin + + def main(args: Array[String]): Unit = { + // Build the client. + val client = + try { + SparkConnectClient + .builder() + .loadFromEnvironment() + .userAgent(name) + .parse(args) + .build() + } catch { + case NonFatal(e) => + // scalastyle:off println + println(s""" + |$name + |${e.getMessage} + |${SparkConnectClientParser.usage()} + |""".stripMargin) + // scalastyle:on println + sys.exit(1) + } + + // Build the session. + val spark = SparkSession.builder().client(client).build() + + // Add the proper imports. + val imports = + """ + |import org.apache.spark.sql.functions._ + |import spark.implicits._ + |import spark.sql + |""".stripMargin + + // Please note that we make ammonite generate classes instead of objects. + // Classes tend to have superior serialization behavior when using UDFs. + val main = ammonite.Main( + welcomeBanner = Option(splash), + predefCode = imports, + replCodeWrapper = CodeClassWrapper, + scriptCodeWrapper = CodeClassWrapper) + main.run(new Bind("spark", spark)) + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala new file mode 100644 index 0000000000000..ead500a53e639 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/ArtifactManager.scala @@ -0,0 +1,305 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import java.io.InputStream +import java.net.URI +import java.nio.file.{Files, Path, Paths} +import java.util.zip.{CheckedInputStream, CRC32} + +import scala.collection.mutable +import scala.concurrent.Promise +import scala.concurrent.duration.Duration +import scala.util.control.NonFatal + +import Artifact._ +import com.google.protobuf.ByteString +import io.grpc.ManagedChannel +import io.grpc.stub.StreamObserver + +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.AddArtifactsResponse +import org.apache.spark.connect.proto.AddArtifactsResponse.ArtifactSummary +import org.apache.spark.util.{ThreadUtils, Utils} + +/** + * The Artifact Manager is responsible for handling and transferring artifacts from the local + * client to the server (local/remote). 
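+ *
+ * Illustrative sketch only (the jar path is assumed):
+ * {{{
+ *   new ArtifactManager(userContext, channel).addArtifact("/path/to/some-udfs.jar")
+ * }}}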
+ * @param userContext the user context that is attached to every artifact transfer request
+ * @param channel the gRPC channel used to communicate with the Spark Connect server
+ */
+class ArtifactManager(userContext: proto.UserContext, channel: ManagedChannel) {
+  // Using the midpoint recommendation of 32KiB for chunk size as specified in
+  // https://github.com/grpc/grpc.github.io/issues/371.
+  private val CHUNK_SIZE: Int = 32 * 1024
+
+  private[this] val stub = proto.SparkConnectServiceGrpc.newStub(channel)
+
+  /**
+   * Add a single artifact to the session.
+   *
+   * Currently only local files with extensions .jar and .class are supported.
+   */
+  def addArtifact(path: String): Unit = {
+    addArtifact(Utils.resolveURI(path))
+  }
+
+  private def parseArtifacts(uri: URI): Seq[Artifact] = {
+    // Currently only local files with extensions .jar and .class are supported.
+    uri.getScheme match {
+      case "file" =>
+        val path = Paths.get(uri)
+        val artifact = path.getFileName.toString match {
+          case jar if jar.endsWith(".jar") =>
+            newJarArtifact(path.getFileName, new LocalFile(path))
+          case cf if cf.endsWith(".class") =>
+            newClassArtifact(path.getFileName, new LocalFile(path))
+          case other =>
+            throw new UnsupportedOperationException(s"Unsupported file format: $other")
+        }
+        Seq[Artifact](artifact)
+
+      case other =>
+        throw new UnsupportedOperationException(s"Unsupported scheme: $other")
+    }
+  }
+
+  /**
+   * Add a single artifact to the session.
+   *
+   * Currently only local files with extensions .jar and .class are supported.
+   */
+  def addArtifact(uri: URI): Unit = addArtifacts(parseArtifacts(uri))
+
+  /**
+   * Add multiple artifacts to the session.
+   *
+   * Currently only local files with extensions .jar and .class are supported.
+   */
+  def addArtifacts(uris: Seq[URI]): Unit = addArtifacts(uris.flatMap(parseArtifacts))
+
+  /**
+   * Add a number of artifacts to the session.
+   */
+  private def addArtifacts(artifacts: Iterable[Artifact]): Unit = {
+    val promise = Promise[Seq[ArtifactSummary]]
+    val responseHandler = new StreamObserver[proto.AddArtifactsResponse] {
+      private val summaries = mutable.Buffer.empty[ArtifactSummary]
+      override def onNext(v: AddArtifactsResponse): Unit = {
+        v.getArtifactsList.forEach { summary =>
+          summaries += summary
+        }
+      }
+      override def onError(throwable: Throwable): Unit = {
+        promise.failure(throwable)
+      }
+      override def onCompleted(): Unit = {
+        promise.success(summaries.toSeq)
+      }
+    }
+    val stream = stub.addArtifacts(responseHandler)
+    val currentBatch = mutable.Buffer.empty[Artifact]
+    var currentBatchSize = 0L
+
+    def addToBatch(dep: Artifact, size: Long): Unit = {
+      currentBatch += dep
+      currentBatchSize += size
+    }
+
+    def writeBatch(): Unit = {
+      addBatchedArtifacts(currentBatch.toSeq, stream)
+      currentBatch.clear()
+      currentBatchSize = 0
+    }
+
+    artifacts.iterator.foreach { artifact =>
+      val data = artifact.storage
+      val size = data.size
+      if (size > CHUNK_SIZE) {
+        // Payload can either be a batch OR a single chunked artifact. Write the batch if
+        // non-empty before chunking the current artifact.
+        if (currentBatch.nonEmpty) {
+          writeBatch()
+        }
+        addChunkedArtifact(artifact, stream)
+      } else {
+        if (currentBatchSize + size > CHUNK_SIZE) {
+          writeBatch()
+        }
+        addToBatch(artifact, size)
+      }
+    }
+    if (currentBatch.nonEmpty) {
+      writeBatch()
+    }
+    stream.onCompleted()
+    ThreadUtils.awaitResult(promise.future, Duration.Inf)
+    // TODO(SPARK-42658): Handle responses containing CRC failures.
+  }
+
+  /**
+   * Add a batch of artifacts to the stream. All the artifacts in this call are packaged into a
+   * single [[proto.AddArtifactsRequest]].
+ */ + private def addBatchedArtifacts( + artifacts: Seq[Artifact], + stream: StreamObserver[proto.AddArtifactsRequest]): Unit = { + val builder = proto.AddArtifactsRequest + .newBuilder() + .setUserContext(userContext) + artifacts.foreach { artifact => + val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32) + try { + val data = proto.AddArtifactsRequest.ArtifactChunk + .newBuilder() + .setData(ByteString.readFrom(in)) + .setCrc(in.getChecksum.getValue) + + builder.getBatchBuilder + .addArtifactsBuilder() + .setName(artifact.path.toString) + .setData(data) + .build() + } catch { + case NonFatal(e) => + stream.onError(e) + throw e + } finally { + in.close() + } + } + stream.onNext(builder.build()) + } + + /** + * Read data from an [[InputStream]] in pieces of `chunkSize` bytes and convert to + * protobuf-compatible [[ByteString]]. + * @param in + * @return + */ + private def readNextChunk(in: InputStream): ByteString = { + val buf = new Array[Byte](CHUNK_SIZE) + var bytesRead = 0 + var count = 0 + while (count != -1 && bytesRead < CHUNK_SIZE) { + count = in.read(buf, bytesRead, CHUNK_SIZE - bytesRead) + if (count != -1) { + bytesRead += count + } + } + if (bytesRead == 0) ByteString.empty() + else ByteString.copyFrom(buf, 0, bytesRead) + } + + /** + * Add a artifact in chunks to the stream. The artifact's data is spread out over multiple + * [[proto.AddArtifactsRequest requests]]. + */ + private def addChunkedArtifact( + artifact: Artifact, + stream: StreamObserver[proto.AddArtifactsRequest]): Unit = { + val builder = proto.AddArtifactsRequest + .newBuilder() + .setUserContext(userContext) + + val in = new CheckedInputStream(artifact.storage.asInstanceOf[LocalData].stream, new CRC32) + try { + // First RPC contains the `BeginChunkedArtifact` payload (`begin_chunk`). + // Subsequent RPCs contains the `ArtifactChunk` payload (`chunk`). + val artifactChunkBuilder = proto.AddArtifactsRequest.ArtifactChunk.newBuilder() + var dataChunk = readNextChunk(in) + // Integer division that rounds up to the nearest whole number. + def getNumChunks(size: Long): Long = (size + (CHUNK_SIZE - 1)) / CHUNK_SIZE + + builder.getBeginChunkBuilder + .setName(artifact.path.toString) + .setTotalBytes(artifact.size) + .setNumChunks(getNumChunks(artifact.size)) + .setInitialChunk( + artifactChunkBuilder + .setData(dataChunk) + .setCrc(in.getChecksum.getValue)) + stream.onNext(builder.build()) + in.getChecksum.reset() + builder.clearBeginChunk() + + dataChunk = readNextChunk(in) + // Consume stream in chunks until there is no data left to read. 
+ while (!dataChunk.isEmpty) { + artifactChunkBuilder.setData(dataChunk).setCrc(in.getChecksum.getValue) + builder.setChunk(artifactChunkBuilder.build()) + stream.onNext(builder.build()) + in.getChecksum.reset() + builder.clearChunk() + dataChunk = readNextChunk(in) + } + } catch { + case NonFatal(e) => + stream.onError(e) + throw e + } finally { + in.close() + } + } +} + +class Artifact private (val path: Path, val storage: LocalData) { + require(!path.isAbsolute, s"Bad path: $path") + + lazy val size: Long = storage match { + case localData: LocalData => localData.size + } +} + +object Artifact { + val CLASS_PREFIX: Path = Paths.get("classes") + val JAR_PREFIX: Path = Paths.get("jars") + + def newJarArtifact(fileName: Path, storage: LocalData): Artifact = { + newArtifact(JAR_PREFIX, ".jar", fileName, storage) + } + + def newClassArtifact(fileName: Path, storage: LocalData): Artifact = { + newArtifact(CLASS_PREFIX, ".class", fileName, storage) + } + + private def newArtifact( + prefix: Path, + requiredSuffix: String, + fileName: Path, + storage: LocalData): Artifact = { + require(!fileName.isAbsolute) + require(fileName.toString.endsWith(requiredSuffix)) + new Artifact(prefix.resolve(fileName), storage) + } + + /** + * Payload stored on this machine. + */ + sealed trait LocalData { + def stream: InputStream + def size: Long + } + + /** + * Payload stored in a local file. + */ + class LocalFile(val path: Path) extends LocalData { + override def size: Long = Files.size(path) + override def stream: InputStream = Files.newInputStream(path) + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala new file mode 100644 index 0000000000000..fd9ced6eb62fc --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClient.scala @@ -0,0 +1,513 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.client + +import io.grpc.{CallCredentials, CallOptions, Channel, ClientCall, ClientInterceptor, CompositeChannelCredentials, ForwardingClientCall, Grpc, InsecureChannelCredentials, ManagedChannel, ManagedChannelBuilder, Metadata, MethodDescriptor, Status, TlsChannelCredentials} +import java.net.URI +import java.util.UUID +import java.util.concurrent.Executor +import scala.language.existentials + +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.UserContext +import org.apache.spark.sql.connect.common.config.ConnectCommon + +/** + * Conceptually the remote spark session that communicates with the server. 
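+ *
+ * Construction sketch (the endpoint is assumed for illustration):
+ * {{{
+ *   val client = SparkConnectClient.builder().connectionString("sc://localhost:15002").build()
+ * }}}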
+ */ +private[sql] class SparkConnectClient( + private val userContext: proto.UserContext, + private val channelBuilder: ManagedChannelBuilder[_], + private[client] val userAgent: String) { + + private[this] lazy val channel: ManagedChannel = channelBuilder.build() + + private[this] val stub = proto.SparkConnectServiceGrpc.newBlockingStub(channel) + + private[client] val artifactManager: ArtifactManager = new ArtifactManager(userContext, channel) + + /** + * Placeholder method. + * @return + * User ID. + */ + private[client] def userId: String = userContext.getUserId() + + // Generate a unique session ID for this client. This UUID must be unique to allow + // concurrent Spark sessions of the same user. If the channel is closed, creating + // a new client will create a new session ID. + private[client] val sessionId: String = UUID.randomUUID.toString + + /** + * Dispatch the [[proto.AnalyzePlanRequest]] to the Spark Connect server. + * @return + * A [[proto.AnalyzePlanResponse]] from the Spark Connect server. + */ + def analyze(request: proto.AnalyzePlanRequest): proto.AnalyzePlanResponse = + stub.analyzePlan(request) + + def execute(plan: proto.Plan): java.util.Iterator[proto.ExecutePlanResponse] = { + val request = proto.ExecutePlanRequest + .newBuilder() + .setPlan(plan) + .setUserContext(userContext) + .setSessionId(sessionId) + .setClientType(userAgent) + .build() + stub.executePlan(request) + } + + /** + * Dispatch the [[proto.ConfigRequest]] to the Spark Connect server. + * @return + * A [[proto.ConfigResponse]] from the Spark Connect server. + */ + def config(operation: proto.ConfigRequest.Operation): proto.ConfigResponse = { + val request = proto.ConfigRequest + .newBuilder() + .setOperation(operation) + .setSessionId(sessionId) + .setClientType(userAgent) + .setUserContext(userContext) + .build() + stub.config(request) + } + + /** + * Builds a [[proto.AnalyzePlanRequest]] from `plan` and dispatched it to the Spark Connect + * server. + * @return + * A [[proto.AnalyzePlanResponse]] from the Spark Connect server. 
+ */ + def analyze( + method: proto.AnalyzePlanRequest.AnalyzeCase, + plan: Option[proto.Plan] = None, + explainMode: Option[proto.AnalyzePlanRequest.Explain.ExplainMode] = None) + : proto.AnalyzePlanResponse = { + val builder = proto.AnalyzePlanRequest.newBuilder() + method match { + case proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA => + assert(plan.isDefined) + builder.setSchema( + proto.AnalyzePlanRequest.Schema + .newBuilder() + .setPlan(plan.get) + .build()) + case proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN => + if (explainMode.isEmpty) { + throw new IllegalArgumentException(s"ExplainMode is required in Explain request") + } + assert(plan.isDefined) + builder.setExplain( + proto.AnalyzePlanRequest.Explain + .newBuilder() + .setPlan(plan.get) + .setExplainMode(explainMode.get) + .build()) + case proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL => + assert(plan.isDefined) + builder.setIsLocal( + proto.AnalyzePlanRequest.IsLocal + .newBuilder() + .setPlan(plan.get) + .build()) + case proto.AnalyzePlanRequest.AnalyzeCase.IS_STREAMING => + assert(plan.isDefined) + builder.setIsStreaming( + proto.AnalyzePlanRequest.IsStreaming + .newBuilder() + .setPlan(plan.get) + .build()) + case proto.AnalyzePlanRequest.AnalyzeCase.INPUT_FILES => + assert(plan.isDefined) + builder.setInputFiles( + proto.AnalyzePlanRequest.InputFiles + .newBuilder() + .setPlan(plan.get) + .build()) + case proto.AnalyzePlanRequest.AnalyzeCase.SPARK_VERSION => + builder.setSparkVersion(proto.AnalyzePlanRequest.SparkVersion.newBuilder().build()) + case other => throw new IllegalArgumentException(s"Unknown Analyze request $other") + } + analyze(builder) + } + + def sameSemantics(plan: proto.Plan, otherPlan: proto.Plan): proto.AnalyzePlanResponse = { + val builder = proto.AnalyzePlanRequest.newBuilder() + builder.setSameSemantics( + proto.AnalyzePlanRequest.SameSemantics + .newBuilder() + .setTargetPlan(plan) + .setOtherPlan(otherPlan)) + analyze(builder) + } + + def semanticHash(plan: proto.Plan): proto.AnalyzePlanResponse = { + val builder = proto.AnalyzePlanRequest.newBuilder() + builder.setSemanticHash( + proto.AnalyzePlanRequest.SemanticHash + .newBuilder() + .setPlan(plan)) + analyze(builder) + } + + private[sql] def analyze( + builder: proto.AnalyzePlanRequest.Builder): proto.AnalyzePlanResponse = { + val request = builder + .setUserContext(userContext) + .setSessionId(sessionId) + .setClientType(userAgent) + .build() + analyze(request) + } + + def copy(): SparkConnectClient = { + new SparkConnectClient(userContext, channelBuilder, userAgent) + } + + /** + * Add a single artifact to the client session. + * + * Currently only local files with extensions .jar and .class are supported. + */ + def addArtifact(path: String): Unit = artifactManager.addArtifact(path) + + /** + * Add a single artifact to the client session. + * + * Currently only local files with extensions .jar and .class are supported. + */ + def addArtifact(uri: URI): Unit = artifactManager.addArtifact(uri) + + /** + * Add multiple artifacts to the session. + * + * Currently only local files with extensions .jar and .class are supported. + */ + def addArtifacts(uri: Seq[URI]): Unit = artifactManager.addArtifacts(uri) + + /** + * Shutdown the client's connection to the server. 
+ */ + def shutdown(): Unit = { + channel.shutdownNow() + } +} + +object SparkConnectClient { + + private val SPARK_REMOTE: String = "SPARK_REMOTE" + + private val DEFAULT_USER_AGENT: String = "_SPARK_CONNECT_SCALA" + + private val AUTH_TOKEN_META_DATA_KEY: Metadata.Key[String] = + Metadata.Key.of("Authentication", Metadata.ASCII_STRING_MARSHALLER) + + private val AUTH_TOKEN_ON_INSECURE_CONN_ERROR_MSG: String = + "Authentication token cannot be passed over insecure connections. " + + "Either remove 'token' or set 'use_ssl=true'" + + // for internal tests + private[sql] def apply( + userContext: UserContext, + builder: ManagedChannelBuilder[_]): SparkConnectClient = + new SparkConnectClient(userContext, builder, DEFAULT_USER_AGENT) + + def builder(): Builder = new Builder() + + /** + * This is a helper class that is used to create a GRPC channel based on either a set host and + * port or a NameResolver-compliant URI connection string. + */ + class Builder() { + private val userContextBuilder = proto.UserContext.newBuilder() + private var _userAgent: String = DEFAULT_USER_AGENT + + private var _host: String = "localhost" + private var _port: Int = ConnectCommon.CONNECT_GRPC_BINDING_PORT + + private var _token: Option[String] = None + // If no value specified for isSslEnabled, default to false + private var isSslEnabled: Option[Boolean] = None + + private var metadata: Map[String, String] = Map.empty + + def userId(id: String): Builder = { + // TODO this is not an optional field! + require(id != null && id.nonEmpty) + userContextBuilder.setUserId(id) + this + } + + def userId: Option[String] = Option(userContextBuilder.getUserId).filter(_.nonEmpty) + + def userName(name: String): Builder = { + require(name != null && name.nonEmpty) + userContextBuilder.setUserName(name) + this + } + + def userName: Option[String] = Option(userContextBuilder.getUserName).filter(_.nonEmpty) + + def host(inputHost: String): Builder = { + require(inputHost != null) + _host = inputHost + this + } + + def host: String = _host + + def port(inputPort: Int): Builder = { + _port = inputPort + this + } + + def port: Int = _port + + /** + * Setting the token implicitly sets the use_ssl=true. All the following examples yield the + * same results: + * + * {{{ + * sc://localhost/;token=aaa + * sc://localhost/;use_ssl=true;token=aaa + * sc://localhost/;token=aaa;use_ssl=true + * }}} + * + * Throws exception if the token is set but use_ssl=false. + * + * @param inputToken + * the user token. + * @return + * this builder. + */ + def token(inputToken: String): Builder = { + require(inputToken != null && inputToken.nonEmpty) + _token = Some(inputToken) + // Only set the isSSlEnabled if it is not yet set + isSslEnabled match { + case None => isSslEnabled = Some(true) + case Some(false) => + throw new IllegalArgumentException(AUTH_TOKEN_ON_INSECURE_CONN_ERROR_MSG) + case Some(true) => // Good, the ssl is enabled + } + this + } + + def token: Option[String] = _token + + def enableSsl(): Builder = { + isSslEnabled = Some(true) + this + } + + /** + * Disables the SSL. Throws exception if the token has been set. + * + * @return + * this builder. 
+ */ + def disableSsl(): Builder = { + require(_token.isEmpty, AUTH_TOKEN_ON_INSECURE_CONN_ERROR_MSG) + isSslEnabled = Some(false) + this + } + + def sslEnabled: Boolean = isSslEnabled.contains(true) + + private object URIParams { + val PARAM_USER_ID = "user_id" + val PARAM_USE_SSL = "use_ssl" + val PARAM_TOKEN = "token" + val PARAM_USER_AGENT = "user_agent" + } + + private def verifyURI(uri: URI): Unit = { + if (uri.getScheme != "sc") { + throw new IllegalArgumentException("Scheme for connection URI must be 'sc'.") + } + if (uri.getHost == null) { + throw new IllegalArgumentException(s"Host for connection URI must be defined.") + } + // Java URI considers everything after the authority segment as "path" until the + // ? (query)/# (fragment) components as shown in the regex + // [scheme:][//authority][path][?query][#fragment]. + // However, with the Spark Connect definition, configuration parameter are passed in the + // style of the HTTP URL Path Parameter Syntax (e.g + // sc://hostname:port/;param1=value;param2=value). + // Thus, we manually parse the "java path" to get the "correct path" and configuration + // parameters. + val pathAndParams = uri.getPath.split(';') + if (pathAndParams.nonEmpty && (pathAndParams(0) != "/" && pathAndParams(0) != "")) { + throw new IllegalArgumentException( + s"Path component for connection URI must be empty: " + + s"${pathAndParams(0)}") + } + } + + def userAgent(value: String): Builder = { + require(value != null) + _userAgent = value + this + } + + def userAgent: String = _userAgent + + def option(key: String, value: String): Builder = { + metadata += ((key, value)) + this + } + + def options: Map[String, String] = metadata + + private def parseURIParams(uri: URI): Unit = { + val params = uri.getPath.split(';').drop(1).filter(_ != "") + params.foreach { kv => + val (key, value) = { + val arr = kv.split('=') + if (arr.length != 2) { + throw new IllegalArgumentException( + s"Parameter $kv is not a valid parameter" + + s" key-value pair") + } + (arr(0), arr(1)) + } + key match { + case URIParams.PARAM_USER_ID => userId(value) + case URIParams.PARAM_USER_AGENT => userAgent(value) + case URIParams.PARAM_TOKEN => token(value) + case URIParams.PARAM_USE_SSL => + if (java.lang.Boolean.valueOf(value)) enableSsl() else disableSsl() + case _ => this.metadata = this.metadata + (key -> value) + } + } + } + + /** + * Configure the builder using the env SPARK_REMOTE environment variable. + */ + def loadFromEnvironment(): Builder = { + sys.env.get("SPARK_REMOTE").foreach(connectionString) + this + } + + /** + * Creates the channel with a target connection string, per the documentation of Spark + * Connect. + * + * Note: The connection string, if used, will override any previous host/port settings. + */ + def connectionString(connectionString: String): Builder = { + val uri = new URI(connectionString) + verifyURI(uri) + parseURIParams(uri) + _host = uri.getHost + val inputPort = uri.getPort + if (inputPort != -1) { + _port = inputPort + } + this + } + + /** + * Configure the builder with the given CLI arguments. + */ + def parse(args: Array[String]): Builder = { + SparkConnectClientParser.parse(args.toList, this) + this + } + + def build(): SparkConnectClient = { + val creds = isSslEnabled match { + case Some(false) | None => InsecureChannelCredentials.create() + case Some(true) => + _token match { + case Some(t) => + // With access token added in the http header. 
+ CompositeChannelCredentials.create( + TlsChannelCredentials.create, + new AccessTokenCallCredentials(t)) + case None => + TlsChannelCredentials.create + } + } + + val channelBuilder = Grpc.newChannelBuilderForAddress(_host, _port, creds) + if (metadata.nonEmpty) { + channelBuilder.intercept(new MetadataHeaderClientInterceptor(metadata)) + } + channelBuilder.maxInboundMessageSize(ConnectCommon.CONNECT_GRPC_MAX_MESSAGE_SIZE) + new SparkConnectClient(userContextBuilder.build(), channelBuilder, _userAgent) + } + } + + /** + * A [[CallCredentials]] created from an access token. + * + * @param token + * A string to place directly in the http request authorization header, for example + * "authorization: Bearer ". + */ + private[client] class AccessTokenCallCredentials(token: String) extends CallCredentials { + override def applyRequestMetadata( + requestInfo: CallCredentials.RequestInfo, + appExecutor: Executor, + applier: CallCredentials.MetadataApplier): Unit = { + appExecutor.execute(() => { + try { + val headers = new Metadata() + headers.put(AUTH_TOKEN_META_DATA_KEY, s"Bearer $token"); + applier.apply(headers) + } catch { + case e: Throwable => + applier.fail(Status.UNAUTHENTICATED.withCause(e)); + } + }) + } + + override def thisUsesUnstableApi(): Unit = { + // Marks this API is not stable. Left empty on purpose. + } + } + + /** + * A client interceptor to pass extra parameters in http request header. + * + * @param metadata + * extra metadata placed in the http request header, for example "key: value". + */ + private[client] class MetadataHeaderClientInterceptor(metadata: Map[String, String]) + extends ClientInterceptor { + override def interceptCall[ReqT, RespT]( + method: MethodDescriptor[ReqT, RespT], + callOptions: CallOptions, + next: Channel): ClientCall[ReqT, RespT] = { + new ForwardingClientCall.SimpleForwardingClientCall[ReqT, RespT]( + next.newCall(method, callOptions)) { + override def start( + responseListener: ClientCall.Listener[RespT], + headers: Metadata): Unit = { + metadata.foreach { case (key, value) => + headers.put(Metadata.Key.of(key, Metadata.ASCII_STRING_MARSHALLER), value) + } + super.start(responseListener, headers) + } + } + } + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClientParser.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClientParser.scala new file mode 100644 index 0000000000000..dda769dc2adb1 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkConnectClientParser.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.connect.client
+
+import scala.annotation.tailrec
+
+/**
+ * Parser that takes an array of (CLI) arguments and configures a [[SparkConnectClient]] with
+ * them.
+ */
+private[sql] object SparkConnectClientParser {
+
+  /**
+   * @return
+   *   usage string.
+   */
+  def usage(): String =
+    s"""
+       |Options:
+       |   --remote REMOTE          URI of the Spark Connect Server to connect to.
+       |   --host HOST              Host where the Spark Connect Server is running.
+       |   --port PORT              Port where the Spark Connect Server is running.
+       |   --use_ssl                Connect to the server using SSL.
+       |   --token TOKEN            Token to use for authentication.
+       |   --user_id USER_ID        Id of the user connecting.
+       |   --user_name USER_NAME    Name of the user connecting.
+       |   --option KEY=VALUE       Key-value pair that is used to further configure the session.
+     """.stripMargin
+
+  /**
+   * Parse the command line and configure the builder.
+   */
+  @tailrec
+  def parse(args: List[String], builder: SparkConnectClient.Builder): Unit = {
+    args match {
+      case Nil => ()
+      case "--remote" :: tail =>
+        val (value, remainder) = extract("--remote", tail)
+        parse(remainder, builder.connectionString(value))
+      case "--host" :: tail =>
+        val (value, remainder) = extract("--host", tail)
+        parse(remainder, builder.host(value))
+      case "--port" :: tail =>
+        val (value, remainder) = extract("--port", tail)
+        parse(remainder, builder.port(value.toInt))
+      case "--token" :: tail =>
+        val (value, remainder) = extract("--token", tail)
+        parse(remainder, builder.token(value))
+      case "--use_ssl" :: tail =>
+        parse(tail, builder.enableSsl())
+      case "--user_id" :: tail =>
+        val (value, remainder) = extract("--user_id", tail)
+        parse(remainder, builder.userId(value))
+      case "--user_name" :: tail =>
+        val (value, remainder) = extract("--user_name", tail)
+        parse(remainder, builder.userName(value))
+      case "--user_agent" :: tail =>
+        val (value, remainder) = extract("--user_agent", tail)
+        parse(remainder, builder.userAgent(value))
+      case "--option" :: tail =>
+        if (tail.isEmpty) {
+          throw new IllegalArgumentException("--option requires a key-value pair")
+        }
+        val Array(key, value, rest @ _*) = tail.head.split('=')
+        if (rest.nonEmpty) {
+          throw new IllegalArgumentException(
+            s"--option should contain key=value, found ${tail.head} instead")
+        }
+        parse(tail.tail, builder.option(key, value))
+      case unsupported :: _ =>
+        throw new IllegalArgumentException(s"$unsupported is an unsupported argument.")
+    }
+  }
+
+  private def extract(name: String, args: List[String]): (String, List[String]) = {
+    require(args.nonEmpty, s"$name option requires a value")
+    (args.head, args.tail)
+  }
+}
diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala
new file mode 100644
index 0000000000000..80db558918bba
--- /dev/null
+++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/SparkResult.scala
@@ -0,0 +1,181 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import java.util.Collections + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.FieldVector +import org.apache.arrow.vector.ipc.ArrowStreamReader + +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, ExpressionEncoder, RowEncoder} +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.UnboundRowEncoder +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.Deserializer +import org.apache.spark.sql.connect.client.util.{AutoCloseables, Cleanable} +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.ArrowUtils +import org.apache.spark.sql.vectorized.{ArrowColumnVector, ColumnarBatch, ColumnVector} + +private[sql] class SparkResult[T]( + responses: java.util.Iterator[proto.ExecutePlanResponse], + allocator: BufferAllocator, + encoder: AgnosticEncoder[T]) + extends AutoCloseable + with Cleanable { + + private[this] var numRecords: Int = 0 + private[this] var structType: StructType = _ + private[this] var boundEncoder: ExpressionEncoder[T] = _ + private[this] val batches = mutable.Buffer.empty[ColumnarBatch] + + private def createEncoder(schema: StructType): ExpressionEncoder[T] = { + val agnosticEncoder = if (encoder == UnboundRowEncoder) { + // Create a row encoder based on the schema. + RowEncoder.encoderFor(schema).asInstanceOf[AgnosticEncoder[T]] + } else { + encoder + } + ExpressionEncoder(agnosticEncoder) + } + + private def processResponses(stopOnFirstNonEmptyResponse: Boolean): Boolean = { + while (responses.hasNext) { + val response = responses.next() + if (response.hasArrowBatch) { + val ipcStreamBytes = response.getArrowBatch.getData + val reader = new ArrowStreamReader(ipcStreamBytes.newInput(), allocator) + try { + val root = reader.getVectorSchemaRoot + if (batches.isEmpty) { + structType = ArrowUtils.fromArrowSchema(root.getSchema) + // TODO: create encoders that directly operate on arrow vectors. + boundEncoder = createEncoder(structType).resolveAndBind(structType.toAttributes) + } + while (reader.loadNextBatch()) { + val rowCount = root.getRowCount + assert(root.getRowCount == response.getArrowBatch.getRowCount) // HUH! + if (rowCount > 0) { + val vectors = root.getFieldVectors.asScala + .map(v => new ArrowColumnVector(transferToNewVector(v))) + .toArray[ColumnVector] + batches += new ColumnarBatch(vectors, rowCount) + numRecords += rowCount + if (stopOnFirstNonEmptyResponse) { + return true + } + } + } + } finally { + reader.close() + } + } + } + false + } + + private def transferToNewVector(in: FieldVector): FieldVector = { + val pair = in.getTransferPair(allocator) + pair.transfer() + pair.getTo.asInstanceOf[FieldVector] + } + + /** + * Returns the number of elements in the result. + */ + def length: Int = { + // We need to process all responses to make sure numRecords is correct. 
+ processResponses(stopOnFirstNonEmptyResponse = false) + numRecords + } + + /** + * @return + * the schema of the result. + */ + def schema: StructType = { + processResponses(stopOnFirstNonEmptyResponse = true) + structType + } + + /** + * Create an Array with the contents of the result. + */ + def toArray: Array[T] = { + val result = encoder.clsTag.newArray(length) + val rows = iterator + var i = 0 + while (rows.hasNext) { + result(i) = rows.next() + assert(i < numRecords) + i += 1 + } + result + } + + /** + * Returns an iterator over the contents of the result. + */ + def iterator: java.util.Iterator[T] with AutoCloseable = { + new java.util.Iterator[T] with AutoCloseable { + private[this] var batchIndex: Int = -1 + private[this] var iterator: java.util.Iterator[InternalRow] = Collections.emptyIterator() + private[this] var deserializer: Deserializer[T] = _ + override def hasNext: Boolean = { + if (iterator.hasNext) { + return true + } + val nextBatchIndex = batchIndex + 1 + val hasNextBatch = if (nextBatchIndex == batches.size) { + processResponses(stopOnFirstNonEmptyResponse = true) + } else { + true + } + if (hasNextBatch) { + batchIndex = nextBatchIndex + iterator = batches(nextBatchIndex).rowIterator() + if (deserializer == null) { + deserializer = boundEncoder.createDeserializer() + } + } + hasNextBatch + } + + override def next(): T = { + if (!hasNext) { + throw new NoSuchElementException + } + deserializer(iterator.next()) + } + + override def close(): Unit = SparkResult.this.close() + } + } + + /** + * Close this result, freeing any underlying resources. + */ + override def close(): Unit = { + batches.foreach(_.close()) + } + + override def cleaner: AutoCloseable = AutoCloseables(batches.toSeq) +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/package.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/package.scala new file mode 100644 index 0000000000000..9c173076ab8bf --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/package.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
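`SparkResult` is the client-side materialization of an `ExecutePlanResponse` stream: Arrow batches are transferred into client-owned vectors, wrapped as `ColumnarBatch`es, and decoded lazily through the bound encoder. A small consumption sketch, assuming it runs inside the `sql` package and that `responses` and `allocator` are supplied by the surrounding client code:

{{{
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.UnboundRowEncoder

val result = new SparkResult[Row](responses, allocator, UnboundRowEncoder)
try {
  println(result.schema)          // reads responses until the first non-empty batch arrives
  result.toArray.foreach(println) // drains the remaining responses into memory
} finally {
  result.close()                  // releases the buffered ColumnarBatches
}
}}}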
+ */ +package org.apache.spark.sql.connect + +package object client { + + private[sql] def unsupported(): Nothing = { + throw new UnsupportedOperationException + } + + private[sql] def unsupported(message: String): Nothing = { + throw new UnsupportedOperationException(message) + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/util/Cleaner.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/util/Cleaner.scala new file mode 100644 index 0000000000000..4eecc88135665 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/util/Cleaner.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client.util + +import java.lang.ref.{ReferenceQueue, WeakReference} +import java.util.Collections +import java.util.concurrent.ConcurrentHashMap + +import scala.collection.mutable +import scala.util.control.NonFatal + +/** + * Helper class for cleaning up an object's resources after the object itself has been garbage + * collected. + * + * When we move to Java 9+ we should replace this class by [[java.lang.ref.Cleaner]]. + */ +private[sql] class Cleaner { + class Ref(pin: AnyRef, val resource: AutoCloseable) + extends WeakReference[AnyRef](pin, referenceQueue) + with AutoCloseable { + override def close(): Unit = resource.close() + } + + def register(pin: Cleanable): Unit = { + register(pin, pin.cleaner) + } + + /** + * Register an objects' resources for clean-up. Note that it is absolutely pivotal that resource + * itself does not contain any reference to the object, if it does the object will never be + * garbage collected and the clean-up will never be performed. + * + * @param pin + * who's resources need to be cleaned up after GC. + * @param resource + * to clean-up. + */ + def register(pin: AnyRef, resource: AutoCloseable): Unit = { + referenceBuffer.add(new Ref(pin, resource)) + } + + @volatile private var stopped = false + private val referenceBuffer = Collections.newSetFromMap[Ref](new ConcurrentHashMap) + private val referenceQueue = new ReferenceQueue[AnyRef] + + private val cleanerThread = { + val thread = new Thread(() => cleanUp()) + thread.setName("cleaner") + thread.setDaemon(true) + thread + } + + def start(): Unit = { + require(!stopped) + cleanerThread.start() + } + + def stop(): Unit = { + stopped = true + cleanerThread.interrupt() + } + + private def cleanUp(): Unit = { + while (!stopped) { + try { + val ref = referenceQueue.remove().asInstanceOf[Ref] + referenceBuffer.remove(ref) + ref.close() + } catch { + case NonFatal(e) => + // Perhaps log this? 
+ e.printStackTrace() + } + } + } +} + +trait Cleanable { + def cleaner: AutoCloseable +} + +object AutoCloseables { + def apply(resources: Seq[AutoCloseable]): AutoCloseable = { () => + val throwables = mutable.Buffer.empty[Throwable] + resources.foreach { resource => + try { + resource.close() + } catch { + case NonFatal(e) => throwables += e + } + } + if (throwables.nonEmpty) { + val t = throwables.head + throwables.tail.foreach(t.addSuppressed) + throw t + } + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/util/ConvertToArrow.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/util/ConvertToArrow.scala new file mode 100644 index 0000000000000..d124870e162d5 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/connect/client/util/ConvertToArrow.scala @@ -0,0 +1,72 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client.util + +import java.nio.channels.Channels + +import com.google.protobuf.ByteString +import org.apache.arrow.memory.BufferAllocator +import org.apache.arrow.vector.{VectorSchemaRoot, VectorUnloader} +import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel} +import org.apache.arrow.vector.ipc.message.{IpcOption, MessageSerializer} + +import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, ExpressionEncoder} +import org.apache.spark.sql.execution.arrow.ArrowWriter +import org.apache.spark.sql.util.ArrowUtils + +/** + * Utility for converting common Scala objects into Arrow IPC Stream. + */ +private[sql] object ConvertToArrow { + + /** + * Convert an iterator of common Scala objects into a sinlge Arrow IPC Stream. + */ + def apply[T]( + encoder: AgnosticEncoder[T], + data: Iterator[T], + timeZoneId: String, + bufferAllocator: BufferAllocator): ByteString = { + val arrowSchema = ArrowUtils.toArrowSchema(encoder.schema, timeZoneId) + val root = VectorSchemaRoot.create(arrowSchema, bufferAllocator) + val writer: ArrowWriter = ArrowWriter.create(root) + val unloader = new VectorUnloader(root) + val bytes = ByteString.newOutput() + val channel = new WriteChannel(Channels.newChannel(bytes)) + + try { + // Convert and write the data to the vector root. 
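The `Cleaner`/`Cleanable` pair provides post-GC cleanup without finalizers, and `AutoCloseables` folds several closeables into one while keeping the first failure as the primary exception. A usage sketch with a hypothetical holder class; as the comment on `register` warns, the returned resource must not reference the holder itself or it will never become unreachable:

{{{
import java.io.FileInputStream

// Hypothetical holder: the cleaner captures only the stream, never `this`.
class LogReader(path: String) extends Cleanable {
  private val in = new FileInputStream(path)
  override def cleaner: AutoCloseable = AutoCloseables(Seq(in))
}

val resourceCleaner = new Cleaner
resourceCleaner.start()
resourceCleaner.register(new LogReader("/tmp/example.log")) // closed some time after the reader is GC'd
}}}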
+ val serializer = ExpressionEncoder(encoder).createSerializer() + data.foreach(o => writer.write(serializer(o))) + writer.finish() + + // Write the IPC Stream + MessageSerializer.serialize(channel, root.getSchema) + val batch = unloader.getRecordBatch + try MessageSerializer.serialize(channel, batch) + finally { + batch.close() + } + ArrowStreamWriter.writeEndOfStream(channel, IpcOption.DEFAULT) + + // Done + bytes.toByteString + } finally { + root.close() + } + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala new file mode 100644 index 0000000000000..0fe47092e4ec6 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/UserDefinedFunction.scala @@ -0,0 +1,146 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.expressions + +import scala.collection.JavaConverters._ +import scala.reflect.runtime.universe.TypeTag + +import com.google.protobuf.ByteString + +import org.apache.spark.connect.proto +import org.apache.spark.sql.Column +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder +import org.apache.spark.sql.connect.common.UdfPacket +import org.apache.spark.util.Utils + +/** + * A user-defined function. To create one, use the `udf` functions in `functions`. + * + * As an example: + * {{{ + * // Define a UDF that returns true or false based on some numeric score. + * val predict = udf((score: Double) => score > 0.5) + * + * // Projects a column that adds a prediction column based on the score column. + * df.select( predict(df("score")) ) + * }}} + * + * @since 3.4.0 + */ +sealed abstract class UserDefinedFunction { + + /** + * Returns true when the UDF can return a nullable value. + * + * @since 3.4.0 + */ + def nullable: Boolean + + /** + * Returns true iff the UDF is deterministic, i.e. the UDF produces the same output given the + * same input. + * + * @since 3.4.0 + */ + def deterministic: Boolean + + /** + * Returns an expression that invokes the UDF, using the given arguments. + * + * @since 3.4.0 + */ + @scala.annotation.varargs + def apply(exprs: Column*): Column + + /** + * Updates UserDefinedFunction with a given name. + * + * @since 3.4.0 + */ + def withName(name: String): UserDefinedFunction + + /** + * Updates UserDefinedFunction to non-nullable. + * + * @since 3.4.0 + */ + def asNonNullable(): UserDefinedFunction + + /** + * Updates UserDefinedFunction to nondeterministic. 
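`ConvertToArrow` goes the other way: it serializes local client-side data into a single Arrow IPC stream (schema message, one record batch, end-of-stream marker), presumably so the client can ship local data to the server. A small sketch using one of the predefined primitive encoders:

{{{
import org.apache.arrow.memory.RootAllocator
import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.PrimitiveLongEncoder

val allocator = new RootAllocator()
try {
  // Three longs serialized into one IPC stream payload.
  val ipcBytes = ConvertToArrow(PrimitiveLongEncoder, Iterator(1L, 2L, 3L), "UTC", allocator)
  println(s"payload size: ${ipcBytes.size()} bytes")
} finally {
  allocator.close()
}
}}}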
+ * + * @since 3.4.0 + */ + def asNondeterministic(): UserDefinedFunction +} + +/** + * Holder class for a scalar user-defined function and it's input/output encoder(s). + */ +case class ScalarUserDefinedFunction( + function: AnyRef, + inputEncoders: Seq[AgnosticEncoder[_]], + outputEncoder: AgnosticEncoder[_], + name: Option[String], + override val nullable: Boolean, + override val deterministic: Boolean) + extends UserDefinedFunction { + + private[this] lazy val udf = { + val udfPacketBytes = Utils.serialize(UdfPacket(function, inputEncoders, outputEncoder)) + val scalaUdfBuilder = proto.ScalarScalaUDF + .newBuilder() + .setPayload(ByteString.copyFrom(udfPacketBytes)) + .setNullable(nullable) + + scalaUdfBuilder.build() + } + + @scala.annotation.varargs + override def apply(exprs: Column*): Column = Column { builder => + val udfBuilder = builder.getCommonInlineUserDefinedFunctionBuilder + udfBuilder + .setDeterministic(deterministic) + .setScalarScalaUdf(udf) + .addAllArguments(exprs.map(_.expr).asJava) + + name.foreach(udfBuilder.setFunctionName) + } + + override def withName(name: String): ScalarUserDefinedFunction = copy(name = Option(name)) + + override def asNonNullable(): ScalarUserDefinedFunction = copy(nullable = false) + + override def asNondeterministic(): ScalarUserDefinedFunction = copy(deterministic = false) +} + +object ScalarUserDefinedFunction { + private[sql] def apply( + function: AnyRef, + returnType: TypeTag[_], + parameterTypes: TypeTag[_]*): ScalarUserDefinedFunction = { + + ScalarUserDefinedFunction( + function = function, + inputEncoders = parameterTypes.map(tag => ScalaReflection.encoderFor(tag)), + outputEncoder = ScalaReflection.encoderFor(returnType), + name = None, + nullable = true, + deterministic = true) + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/Window.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/Window.scala new file mode 100644 index 0000000000000..c85e7bc9c5c0a --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/Window.scala @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.expressions + +import org.apache.spark.annotation.Stable +import org.apache.spark.sql.Column + +/** + * Utility functions for defining window in DataFrames. 
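`ScalarUserDefinedFunction` captures the function together with its input/output encoders, serializes them into a `ScalarScalaUDF` payload, and turns every application into a `CommonInlineUserDefinedFunction` expression. A sketch of direct construction through the companion object (user code would normally go through the `udf` helpers in `functions`); `df` is a hypothetical DataFrame:

{{{
import scala.reflect.runtime.universe.typeTag

// Return type first, then the parameter types, mirroring the companion apply above.
val toUpper = ScalarUserDefinedFunction(
  (s: String) => s.toUpperCase,
  typeTag[String],
  typeTag[String]).withName("to_upper")

df.select(toUpper(col("name"))) // emits a CommonInlineUserDefinedFunction expression
}}}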
+ * + * {{{ + * // PARTITION BY country ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW + * Window.partitionBy("country").orderBy("date") + * .rowsBetween(Window.unboundedPreceding, Window.currentRow) + * + * // PARTITION BY country ORDER BY date ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING + * Window.partitionBy("country").orderBy("date").rowsBetween(-3, 3) + * }}} + * + * @note + * When ordering is not defined, an unbounded window frame (rowFrame, unboundedPreceding, + * unboundedFollowing) is used by default. When ordering is defined, a growing window frame + * (rangeFrame, unboundedPreceding, currentRow) is used by default. + * + * @since 3.4.0 + */ +@Stable +object Window { + + /** + * Creates a [[WindowSpec]] with the partitioning defined. + * @since 3.4.0 + */ + @scala.annotation.varargs + def partitionBy(colName: String, colNames: String*): WindowSpec = { + spec.partitionBy(colName, colNames: _*) + } + + /** + * Creates a [[WindowSpec]] with the partitioning defined. + * @since 3.4.0 + */ + @scala.annotation.varargs + def partitionBy(cols: Column*): WindowSpec = { + spec.partitionBy(cols: _*) + } + + /** + * Creates a [[WindowSpec]] with the ordering defined. + * @since 3.4.0 + */ + @scala.annotation.varargs + def orderBy(colName: String, colNames: String*): WindowSpec = { + spec.orderBy(colName, colNames: _*) + } + + /** + * Creates a [[WindowSpec]] with the ordering defined. + * @since 1.4.0 + */ + @scala.annotation.varargs + def orderBy(cols: Column*): WindowSpec = { + spec.orderBy(cols: _*) + } + + /** + * Value representing the first row in the partition, equivalent to "UNBOUNDED PRECEDING" in + * SQL. This can be used to specify the frame boundaries: + * + * {{{ + * Window.rowsBetween(Window.unboundedPreceding, Window.currentRow) + * }}} + * + * @since 3.4.0 + */ + def unboundedPreceding: Long = Long.MinValue + + /** + * Value representing the last row in the partition, equivalent to "UNBOUNDED FOLLOWING" in SQL. + * This can be used to specify the frame boundaries: + * + * {{{ + * Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing) + * }}} + * + * @since 3.4.0 + */ + def unboundedFollowing: Long = Long.MaxValue + + /** + * Value representing the current row. This can be used to specify the frame boundaries: + * + * {{{ + * Window.rowsBetween(Window.unboundedPreceding, Window.currentRow) + * }}} + * + * @since 3.4.0 + */ + def currentRow: Long = 0 + + /** + * Creates a [[WindowSpec]] with the frame boundaries defined, from `start` (inclusive) to `end` + * (inclusive). + * + * Both `start` and `end` are positions relative to the current row. For example, "0" means + * "current row", while "-1" means the row before the current row, and "5" means the fifth row + * after the current row. + * + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, and + * `Window.currentRow` to specify special boundary values, rather than using integral values + * directly. + * + * A row based boundary is based on the position of the row within the partition. An offset + * indicates the number of rows above or below the current row, the frame for the current row + * starts or ends. For instance, given a row based sliding frame with a lower bound offset of -1 + * and a upper bound offset of +2. The frame for row with index 5 would range from index 4 to + * index 7. 
+ * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * val byCategoryOrderedById = + * Window.partitionBy($"category").orderBy($"id").rowsBetween(Window.currentRow, 1) + * df.withColumn("sum", sum($"id") over byCategoryOrderedById).show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * + * @param start + * boundary start, inclusive. The frame is unbounded if this is the minimum long value + * (`Window.unboundedPreceding`). + * @param end + * boundary end, inclusive. The frame is unbounded if this is the maximum long value + * (`Window.unboundedFollowing`). + * @since 3.4.0 + */ + // Note: when updating the doc for this method, also update WindowSpec.rowsBetween. + def rowsBetween(start: Long, end: Long): WindowSpec = { + spec.rowsBetween(start, end) + } + + /** + * Creates a [[WindowSpec]] with the frame boundaries defined, from `start` (inclusive) to `end` + * (inclusive). + * + * Both `start` and `end` are relative to the current row. For example, "0" means "current row", + * while "-1" means one off before the current row, and "5" means the five off after the current + * row. + * + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, and + * `Window.currentRow` to specify special boundary values, rather than using long values + * directly. + * + * A range-based boundary is based on the actual value of the ORDER BY expression(s). An offset + * is used to alter the value of the ORDER BY expression, for instance if the current ORDER BY + * expression has a value of 10 and the lower bound offset is -3, the resulting lower bound for + * the current row will be 10 - 3 = 7. This however puts a number of constraints on the ORDER BY + * expressions: there can be only one expression and this expression must have a numerical data + * type. An exception can be made when the offset is unbounded, because no value modification is + * needed, in this case multiple and non-numeric ORDER BY expression are allowed. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * val byCategoryOrderedById = + * Window.partitionBy($"category").orderBy($"id").rangeBetween(Window.currentRow, 1) + * df.withColumn("sum", sum($"id") over byCategoryOrderedById).show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * + * @param start + * boundary start, inclusive. The frame is unbounded if this is the minimum long value + * (`Window.unboundedPreceding`). + * @param end + * boundary end, inclusive. The frame is unbounded if this is the maximum long value + * (`Window.unboundedFollowing`). + * @since 3.4.0 + */ + // Note: when updating the doc for this method, also update WindowSpec.rangeBetween. + def rangeBetween(start: Long, end: Long): WindowSpec = { + spec.rangeBetween(start, end) + } + + private[sql] def spec: WindowSpec = { + new WindowSpec(Seq.empty, Seq.empty, None) + } + +} + +/** + * Utility functions for defining window in DataFrames. 
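As a complement to the documentation examples, a frame built from the boundary constants is applied to an aggregate with `over`; `df` and its columns are hypothetical, and `sum`/`col` come from `functions`:

{{{
// Running total per country, from the start of the partition up to the current row.
val w = Window
  .partitionBy("country")
  .orderBy("date")
  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

df.withColumn("running_total", sum(col("amount")).over(w))
}}}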
+ * + * {{{ + * // PARTITION BY country ORDER BY date ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW + * Window.partitionBy("country").orderBy("date") + * .rowsBetween(Window.unboundedPreceding, Window.currentRow) + * + * // PARTITION BY country ORDER BY date ROWS BETWEEN 3 PRECEDING AND 3 FOLLOWING + * Window.partitionBy("country").orderBy("date").rowsBetween(-3, 3) + * }}} + * + * @since 3.4.0 + */ +@Stable +class Window private () // So we can see Window in JavaDoc. diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala new file mode 100644 index 0000000000000..cecfb6a0d919f --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/expressions/WindowSpec.scala @@ -0,0 +1,240 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.expressions + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Stable +import org.apache.spark.connect.proto +import org.apache.spark.sql.Column + +/** + * A window specification that defines the partitioning, ordering, and frame boundaries. + * + * Use the static methods in [[Window]] to create a [[WindowSpec]]. + * + * @since 3.4.0 + */ +@Stable +class WindowSpec private[sql] ( + partitionSpec: Seq[proto.Expression], + orderSpec: Seq[proto.Expression.SortOrder], + frame: Option[proto.Expression.Window.WindowFrame]) { + + /** + * Defines the partitioning columns in a [[WindowSpec]]. + * @since 3.4.0 + */ + @scala.annotation.varargs + def partitionBy(colName: String, colNames: String*): WindowSpec = { + partitionBy((colName +: colNames).map(Column(_)): _*) + } + + /** + * Defines the partitioning columns in a [[WindowSpec]]. + * @since 3.4.0 + */ + @scala.annotation.varargs + def partitionBy(cols: Column*): WindowSpec = { + new WindowSpec(cols.map(_.expr), orderSpec, frame) + } + + /** + * Defines the ordering columns in a [[WindowSpec]]. + * @since 3.4.0 + */ + @scala.annotation.varargs + def orderBy(colName: String, colNames: String*): WindowSpec = { + orderBy((colName +: colNames).map(Column(_)): _*) + } + + /** + * Defines the ordering columns in a [[WindowSpec]]. + * @since 3.4.0 + */ + @scala.annotation.varargs + def orderBy(cols: Column*): WindowSpec = { + val sortOrder: Seq[proto.Expression.SortOrder] = cols.map(_.sortOrder) + new WindowSpec(partitionSpec, sortOrder, frame) + } + + /** + * Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive). + * + * Both `start` and `end` are relative positions from the current row. 
For example, "0" means + * "current row", while "-1" means the row before the current row, and "5" means the fifth row + * after the current row. + * + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, and + * `Window.currentRow` to specify special boundary values, rather than using integral values + * directly. + * + * A row based boundary is based on the position of the row within the partition. An offset + * indicates the number of rows above or below the current row, the frame for the current row + * starts or ends. For instance, given a row based sliding frame with a lower bound offset of -1 + * and a upper bound offset of +2. The frame for row with index 5 would range from index 4 to + * index 7. + * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * val byCategoryOrderedById = + * Window.partitionBy($"category").orderBy($"id").rowsBetween(Window.currentRow, 1) + * df.withColumn("sum", sum($"id") over byCategoryOrderedById).show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 2| + * | 1| a| 3| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * + * @param start + * boundary start, inclusive. The frame is unbounded if this is the minimum long value + * (`Window.unboundedPreceding`). + * @param end + * boundary end, inclusive. The frame is unbounded if this is the maximum long value + * (`Window.unboundedFollowing`). + * @since 3.4.0 + */ + // Note: when updating the doc for this method, also update Window.rowsBetween. + def rowsBetween(start: Long, end: Long): WindowSpec = { + new WindowSpec( + partitionSpec, + orderSpec, + Some( + toWindowFrame( + proto.Expression.Window.WindowFrame.FrameType.FRAME_TYPE_ROW, + start, + end, + true))) + } + + /** + * Defines the frame boundaries, from `start` (inclusive) to `end` (inclusive). + * + * Both `start` and `end` are relative from the current row. For example, "0" means "current + * row", while "-1" means one off before the current row, and "5" means the five off after the + * current row. + * + * We recommend users use `Window.unboundedPreceding`, `Window.unboundedFollowing`, and + * `Window.currentRow` to specify special boundary values, rather than using long values + * directly. + * + * A range-based boundary is based on the actual value of the ORDER BY expression(s). An offset + * is used to alter the value of the ORDER BY expression, for instance if the current order by + * expression has a value of 10 and the lower bound offset is -3, the resulting lower bound for + * the current row will be 10 - 3 = 7. This however puts a number of constraints on the ORDER BY + * expressions: there can be only one expression and this expression must have a numerical data + * type. An exception can be made when the offset is unbounded, because no value modification is + * needed, in this case multiple and non-numeric ORDER BY expression are allowed. 
+ * + * {{{ + * import org.apache.spark.sql.expressions.Window + * val df = Seq((1, "a"), (1, "a"), (2, "a"), (1, "b"), (2, "b"), (3, "b")) + * .toDF("id", "category") + * val byCategoryOrderedById = + * Window.partitionBy($"category").orderBy($"id").rangeBetween(Window.currentRow, 1) + * df.withColumn("sum", sum($"id") over byCategoryOrderedById).show() + * + * +---+--------+---+ + * | id|category|sum| + * +---+--------+---+ + * | 1| b| 3| + * | 2| b| 5| + * | 3| b| 3| + * | 1| a| 4| + * | 1| a| 4| + * | 2| a| 2| + * +---+--------+---+ + * }}} + * + * @param start + * boundary start, inclusive. The frame is unbounded if this is the minimum long value + * (`Window.unboundedPreceding`). + * @param end + * boundary end, inclusive. The frame is unbounded if this is the maximum long value + * (`Window.unboundedFollowing`). + * @since 3.4.0 + */ + // Note: when updating the doc for this method, also update Window.rangeBetween. + def rangeBetween(start: Long, end: Long): WindowSpec = { + new WindowSpec( + partitionSpec, + orderSpec, + Some( + toWindowFrame( + proto.Expression.Window.WindowFrame.FrameType.FRAME_TYPE_RANGE, + start, + end, + false))) + } + + /** + * Converts this [[WindowSpec]] into a [[Column]] with an aggregate expression. + */ + private[sql] def withAggregate(aggregate: Column): Column = { + Column { builder => + val windowBuilder = builder.getWindowBuilder + windowBuilder.setWindowFunction(aggregate.expr) + if (frame.isDefined) { + windowBuilder.setFrameSpec(frame.get) + } + windowBuilder.addAllPartitionSpec(partitionSpec.asJava) + windowBuilder.addAllOrderSpec(orderSpec.asJava) + } + } + + private[sql] def toWindowFrame( + frameType: proto.Expression.Window.WindowFrame.FrameType, + start: Long, + end: Long, + isRowBetween: Boolean): proto.Expression.Window.WindowFrame = { + val windowFrameBuilder = proto.Expression.Window.WindowFrame.newBuilder() + windowFrameBuilder.setFrameType(frameType) + start match { + case 0 => windowFrameBuilder.getLowerBuilder.setCurrentRow(true) + case Long.MinValue => windowFrameBuilder.getLowerBuilder.setUnbounded(true) + case x if isRowBetween && Int.MinValue <= x && x <= Int.MaxValue => + windowFrameBuilder.getLowerBuilder.getValueBuilder.getLiteralBuilder + .setInteger(start.toInt) + case _ if !isRowBetween => + windowFrameBuilder.getLowerBuilder.getValueBuilder.getLiteralBuilder.setLong(start) + case _ => throw new UnsupportedOperationException() + } + + end match { + case 0 => windowFrameBuilder.getUpperBuilder.setCurrentRow(true) + case Long.MaxValue => windowFrameBuilder.getUpperBuilder.setUnbounded(true) + case x if isRowBetween && Int.MinValue <= x && x <= Int.MaxValue => + windowFrameBuilder.getUpperBuilder.getValueBuilder.getLiteralBuilder + .setInteger(end.toInt) + case _ if !isRowBetween => + windowFrameBuilder.getUpperBuilder.getValueBuilder.getLiteralBuilder.setLong(end) + case _ => throw new UnsupportedOperationException() + } + + windowFrameBuilder.build() + } +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala new file mode 100644 index 0000000000000..29c2e89c53779 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/functions.scala @@ -0,0 +1,5363 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
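A note on how `toWindowFrame` encodes the bounds: rows frames store offsets as integer literals (after a range check), range frames store them as long literals, and the sentinel values map to `currentRow`/`unbounded`. A short sketch with illustrative column names:

{{{
// FRAME_TYPE_ROW: the lower bound becomes the integer literal -1, the upper bound sets currentRow = true.
val rowsFrame = Window.orderBy("id").rowsBetween(-1, Window.currentRow)

// FRAME_TYPE_RANGE: the lower bound becomes the long literal -3600, the upper bound sets currentRow = true.
val rangeFrame = Window.orderBy("event_time").rangeBetween(-3600L, Window.currentRow)
}}}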
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.util.Collections + +import scala.collection.JavaConverters._ +import scala.reflect.runtime.universe.{typeTag, TypeTag} + +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.PrimitiveLongEncoder +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter._ +import org.apache.spark.sql.expressions.{ScalarUserDefinedFunction, UserDefinedFunction} +import org.apache.spark.sql.types.{DataType, StructType} +import org.apache.spark.sql.types.DataType.parseTypeWithFallback + +/** + * Commonly used functions available for DataFrame operations. Using functions defined here + * provides a little bit more compile-time safety to make sure the function exists. + * + * Spark also includes more built-in functions that are less common and are not defined here. You + * can still access them (and all the functions defined here) using the `functions.expr()` API and + * calling them through a SQL expression string. You can find the entire list of functions at SQL + * API documentation of your Spark version, see also the latest list + * + * As an example, `isnan` is a function that is defined here. You can use `isnan(col("myCol"))` to + * invoke the `isnan` function. This way the programming language's compiler ensures `isnan` + * exists and is of the proper form. You can also use `expr("isnan(myCol)")` function to invoke + * the same function. In this case, Spark itself will ensure `isnan` exists when it analyzes the + * query. + * + * `regr_count` is an example of a function that is built-in but not defined here, because it is + * less commonly used. To invoke it, use `expr("regr_count(yCol, xCol)")`. + * + * This function APIs usually have methods with `Column` signature only because it can support not + * only `Column` but also other types such as a native string. The other variants currently exist + * for historical reasons. + * + * @groupname udf_funcs UDF functions + * @groupname agg_funcs Aggregate functions + * @groupname datetime_funcs Date time functions + * @groupname sort_funcs Sorting functions + * @groupname normal_funcs Non-aggregate functions + * @groupname math_funcs Math functions + * @groupname misc_funcs Misc functions + * @groupname window_funcs Window functions + * @groupname string_funcs String functions + * @groupname collection_funcs Collection functions + * @groupname partition_transforms Partition transform functions + * @groupname Ungrouped Support functions for DataFrames + * + * @since 3.4.0 + */ +// scalastyle:off +object functions { +// scalastyle:on + + /** + * Returns a [[Column]] based on the given column name. + * + * @group normal_funcs + * @since 3.4.0 + */ + def col(colName: String): Column = Column(colName) + + /** + * Returns a [[Column]] based on the given column name. Alias of [[col]]. 
+ * + * @group normal_funcs + * @since 3.4.0 + */ + def column(colName: String): Column = col(colName) + + private def createLiteral(literalBuilder: proto.Expression.Literal.Builder): Column = Column { + builder => builder.setLiteral(literalBuilder) + } + + /** + * Creates a [[Column]] of literal value. + * + * The passed in object is returned directly if it is already a [[Column]]. If the object is a + * Scala Symbol, it is converted into a [[Column]] also. Otherwise, a new [[Column]] is created + * to represent the literal value. + * + * @since 3.4.0 + */ + def lit(literal: Any): Column = { + literal match { + case c: Column => c + case s: Symbol => Column(s.name) + case _ => createLiteral(toLiteralProtoBuilder(literal)) + } + } + ////////////////////////////////////////////////////////////////////////////////////////////// + // Sort functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Returns a sort expression based on ascending order of the column. + * {{{ + * df.sort(asc("dept"), desc("age")) + * }}} + * + * @group sort_funcs + * @since 3.4.0 + */ + def asc(columnName: String): Column = Column(columnName).asc + + /** + * Returns a sort expression based on ascending order of the column, and null values return + * before non-null values. + * {{{ + * df.sort(asc_nulls_first("dept"), desc("age")) + * }}} + * + * @group sort_funcs + * @since 3.4.0 + */ + def asc_nulls_first(columnName: String): Column = Column(columnName).asc_nulls_first + + /** + * Returns a sort expression based on ascending order of the column, and null values appear + * after non-null values. + * {{{ + * df.sort(asc_nulls_last("dept"), desc("age")) + * }}} + * + * @group sort_funcs + * @since 3.4.0 + */ + def asc_nulls_last(columnName: String): Column = Column(columnName).asc_nulls_last + + /** + * Returns a sort expression based on the descending order of the column. + * {{{ + * df.sort(asc("dept"), desc("age")) + * }}} + * + * @group sort_funcs + * @since 3.4.0 + */ + def desc(columnName: String): Column = Column(columnName).desc + + /** + * Returns a sort expression based on the descending order of the column, and null values appear + * before non-null values. + * {{{ + * df.sort(asc("dept"), desc_nulls_first("age")) + * }}} + * + * @group sort_funcs + * @since 3.4.0 + */ + def desc_nulls_first(columnName: String): Column = Column(columnName).desc_nulls_first + + /** + * Returns a sort expression based on the descending order of the column, and null values appear + * after non-null values. 
+ * {{{ + * df.sort(asc("dept"), desc_nulls_last("age")) + * }}} + * + * @group sort_funcs + * @since 3.4.0 + */ + def desc_nulls_last(columnName: String): Column = Column(columnName).desc_nulls_last + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Aggregate functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * @group agg_funcs + * @since 3.4.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(e: Column): Column = approx_count_distinct(e) + + /** + * @group agg_funcs + * @since 3.4.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(columnName: String): Column = approx_count_distinct(columnName) + + /** + * @group agg_funcs + * @since 3.4.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(e: Column, rsd: Double): Column = approx_count_distinct(e, rsd) + + /** + * @group agg_funcs + * @since 3.4.0 + */ + @deprecated("Use approx_count_distinct", "2.1.0") + def approxCountDistinct(columnName: String, rsd: Double): Column = { + approx_count_distinct(Column(columnName), rsd) + } + + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def approx_count_distinct(e: Column): Column = Column.fn("approx_count_distinct", e) + + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def approx_count_distinct(columnName: String): Column = approx_count_distinct( + column(columnName)) + + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @param rsd + * maximum relative standard deviation allowed (default = 0.05) + * + * @group agg_funcs + * @since 3.4.0 + */ + def approx_count_distinct(e: Column, rsd: Double): Column = { + Column.fn("approx_count_distinct", e, lit(rsd)) + } + + /** + * Aggregate function: returns the approximate number of distinct items in a group. + * + * @param rsd + * maximum relative standard deviation allowed (default = 0.05) + * + * @group agg_funcs + * @since 3.4.0 + */ + def approx_count_distinct(columnName: String, rsd: Double): Column = { + approx_count_distinct(Column(columnName), rsd) + } + + /** + * Aggregate function: returns the average of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def avg(e: Column): Column = Column.fn("avg", e) + + /** + * Aggregate function: returns the average of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def avg(columnName: String): Column = avg(Column(columnName)) + + /** + * Aggregate function: returns a list of objects with duplicates. + * + * @note + * The function is non-deterministic because the order of collected results depends on the + * order of the rows which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def collect_list(e: Column): Column = Column.fn("collect_list", e) + + /** + * Aggregate function: returns a list of objects with duplicates. + * + * @note + * The function is non-deterministic because the order of collected results depends on the + * order of the rows which may be non-deterministic after a shuffle. 
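A quick aggregation sketch combining the approximate distinct count with a collected list; `df` is a hypothetical DataFrame and the 0.01 relative error is illustrative:

{{{
df.groupBy(col("country")).agg(
  approx_count_distinct(col("user_id"), 0.01), // tighter rsd costs more memory
  collect_list(col("page")))                   // order is non-deterministic after a shuffle
}}}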
+ * + * @group agg_funcs + * @since 3.4.0 + */ + def collect_list(columnName: String): Column = collect_list(Column(columnName)) + + /** + * Aggregate function: returns a set of objects with duplicate elements eliminated. + * + * @note + * The function is non-deterministic because the order of collected results depends on the + * order of the rows which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def collect_set(e: Column): Column = Column.fn("collect_set", e) + + /** + * Aggregate function: returns a set of objects with duplicate elements eliminated. + * + * @note + * The function is non-deterministic because the order of collected results depends on the + * order of the rows which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def collect_set(columnName: String): Column = collect_set(Column(columnName)) + + /** + * Aggregate function: returns the Pearson Correlation Coefficient for two columns. + * + * @group agg_funcs + * @since 3.4.0 + */ + def corr(column1: Column, column2: Column): Column = Column.fn("corr", column1, column2) + + /** + * Aggregate function: returns the Pearson Correlation Coefficient for two columns. + * + * @group agg_funcs + * @since 3.4.0 + */ + def corr(columnName1: String, columnName2: String): Column = { + corr(Column(columnName1), Column(columnName2)) + } + + /** + * Aggregate function: returns the number of items in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def count(e: Column): Column = Column.fn("count", e) + + /** + * Aggregate function: returns the number of items in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def count(columnName: String): TypedColumn[Any, Long] = + count(Column(columnName)).as(PrimitiveLongEncoder) + + /** + * Aggregate function: returns the number of distinct items in a group. + * + * An alias of `count_distinct`, and it is encouraged to use `count_distinct` directly. + * + * @group agg_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def countDistinct(expr: Column, exprs: Column*): Column = count_distinct(expr, exprs: _*) + + /** + * Aggregate function: returns the number of distinct items in a group. + * + * An alias of `count_distinct`, and it is encouraged to use `count_distinct` directly. + * + * @group agg_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def countDistinct(columnName: String, columnNames: String*): Column = + count_distinct(Column(columnName), columnNames.map(Column.apply): _*) + + /** + * Aggregate function: returns the number of distinct items in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def count_distinct(expr: Column, exprs: Column*): Column = + Column.fn("count", isDistinct = true, expr +: exprs: _*) + + /** + * Aggregate function: returns the population covariance for two columns. + * + * @group agg_funcs + * @since 3.4.0 + */ + def covar_pop(column1: Column, column2: Column): Column = + Column.fn("covar_pop", column1, column2) + + /** + * Aggregate function: returns the population covariance for two columns. + * + * @group agg_funcs + * @since 3.4.0 + */ + def covar_pop(columnName1: String, columnName2: String): Column = { + covar_pop(Column(columnName1), Column(columnName2)) + } + + /** + * Aggregate function: returns the sample covariance for two columns. 
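Similarly for the two-column aggregates introduced here; `df` and its columns are hypothetical:

{{{
df.agg(
  count_distinct(col("user_id"), col("session_id")), // distinct (user, session) pairs
  corr(col("clicks"), col("views")),                 // Pearson correlation coefficient
  covar_pop(col("clicks"), col("views")))            // population covariance
}}}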
+ * + * @group agg_funcs + * @since 3.4.0 + */ + def covar_samp(column1: Column, column2: Column): Column = + Column.fn("covar_samp", column1, column2) + + /** + * Aggregate function: returns the sample covariance for two columns. + * + * @group agg_funcs + * @since 3.4.0 + */ + def covar_samp(columnName1: String, columnName2: String): Column = + covar_samp(Column(columnName1), Column(columnName2)) + + /** + * Aggregate function: returns the first value in a group. + * + * The function by default returns the first values it sees. It will return the first non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def first(e: Column, ignoreNulls: Boolean): Column = + Column.fn("first", e, lit(ignoreNulls)) + + /** + * Aggregate function: returns the first value of a column in a group. + * + * The function by default returns the first values it sees. It will return the first non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def first(columnName: String, ignoreNulls: Boolean): Column = { + first(Column(columnName), ignoreNulls) + } + + /** + * Aggregate function: returns the first value in a group. + * + * The function by default returns the first values it sees. It will return the first non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def first(e: Column): Column = first(e, ignoreNulls = false) + + /** + * Aggregate function: returns the first value of a column in a group. + * + * The function by default returns the first values it sees. It will return the first non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def first(columnName: String): Column = first(Column(columnName)) + + /** + * Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated or + * not, returns 1 for aggregated or 0 for not aggregated in the result set. + * + * @group agg_funcs + * @since 3.4.0 + */ + def grouping(e: Column): Column = Column.fn("grouping", e) + + /** + * Aggregate function: indicates whether a specified column in a GROUP BY list is aggregated or + * not, returns 1 for aggregated or 0 for not aggregated in the result set. + * + * @group agg_funcs + * @since 3.4.0 + */ + def grouping(columnName: String): Column = grouping(Column(columnName)) + + /** + * Aggregate function: returns the level of grouping, equals to + * + * {{{ + * (grouping(c1) <<; (n-1)) + (grouping(c2) <<; (n-2)) + ... + grouping(cn) + * }}} + * + * @note + * The list of columns should match with grouping columns exactly, or empty (means all the + * grouping columns). 
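The effect of `ignoreNulls` on `first` is easiest to see side by side; `df` and its columns are hypothetical:

{{{
df.groupBy(col("user_id")).agg(
  first(col("email"), ignoreNulls = true), // first non-null email per user
  first(col("referrer")))                  // first value seen, nulls included
}}}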
+ * + * @group agg_funcs + * @since 3.4.0 + */ + def grouping_id(cols: Column*): Column = Column.fn("grouping_id", cols: _*) + + /** + * Aggregate function: returns the level of grouping, equals to + * + * {{{ + * (grouping(c1) <<; (n-1)) + (grouping(c2) <<; (n-2)) + ... + grouping(cn) + * }}} + * + * @note + * The list of columns should match with grouping columns exactly. + * + * @group agg_funcs + * @since 3.4.0 + */ + def grouping_id(colName: String, colNames: String*): Column = + grouping_id((Seq(colName) ++ colNames).map(n => Column(n)): _*) + + /** + * Aggregate function: returns the kurtosis of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def kurtosis(e: Column): Column = Column.fn("kurtosis", e) + + /** + * Aggregate function: returns the kurtosis of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def kurtosis(columnName: String): Column = kurtosis(Column(columnName)) + + /** + * Aggregate function: returns the last value in a group. + * + * The function by default returns the last values it sees. It will return the last non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def last(e: Column, ignoreNulls: Boolean): Column = + Column.fn("last", e, lit(ignoreNulls)) + + /** + * Aggregate function: returns the last value of the column in a group. + * + * The function by default returns the last values it sees. It will return the last non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def last(columnName: String, ignoreNulls: Boolean): Column = + last(Column(columnName), ignoreNulls) + + /** + * Aggregate function: returns the last value in a group. + * + * The function by default returns the last values it sees. It will return the last non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def last(e: Column): Column = last(e, ignoreNulls = false) + + /** + * Aggregate function: returns the last value of the column in a group. + * + * The function by default returns the last values it sees. It will return the last non-null + * value it sees when ignoreNulls is set to true. If all values are null, then null is returned. + * + * @note + * The function is non-deterministic because its results depends on the order of the rows + * which may be non-deterministic after a shuffle. + * + * @group agg_funcs + * @since 3.4.0 + */ + def last(columnName: String): Column = last(Column(columnName), ignoreNulls = false) + + /** + * Aggregate function: returns the most frequent value in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def mode(e: Column): Column = Column.fn("mode", e) + + /** + * Aggregate function: returns the maximum value of the expression in a group. 
+ * + * @group agg_funcs + * @since 3.4.0 + */ + def max(e: Column): Column = Column.fn("max", e) + + /** + * Aggregate function: returns the maximum value of the column in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def max(columnName: String): Column = max(Column(columnName)) + + /** + * Aggregate function: returns the value associated with the maximum value of ord. + * + * @group agg_funcs + * @since 3.4.0 + */ + def max_by(e: Column, ord: Column): Column = Column.fn("max_by", e, ord) + + /** + * Aggregate function: returns the average of the values in a group. Alias for avg. + * + * @group agg_funcs + * @since 3.4.0 + */ + def mean(e: Column): Column = avg(e) + + /** + * Aggregate function: returns the average of the values in a group. Alias for avg. + * + * @group agg_funcs + * @since 3.4.0 + */ + def mean(columnName: String): Column = avg(columnName) + + /** + * Aggregate function: returns the median of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def median(e: Column): Column = Column.fn("median", e) + + /** + * Aggregate function: returns the minimum value of the expression in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def min(e: Column): Column = Column.fn("min", e) + + /** + * Aggregate function: returns the minimum value of the column in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def min(columnName: String): Column = min(Column(columnName)) + + /** + * Aggregate function: returns the value associated with the minimum value of ord. + * + * @group agg_funcs + * @since 3.4.0 + */ + def min_by(e: Column, ord: Column): Column = Column.fn("min_by", e, ord) + + /** + * Aggregate function: returns the approximate `percentile` of the numeric column `col` which is + * the smallest value in the ordered `col` values (sorted from least to greatest) such that no + * more than `percentage` of `col` values is less than the value or equal to that value. + * + * If percentage is an array, each value must be between 0.0 and 1.0. If it is a single floating + * point value, it must be between 0.0 and 1.0. + * + * The accuracy parameter is a positive numeric literal which controls approximation accuracy at + * the cost of memory. Higher value of accuracy yields better accuracy, 1.0/accuracy is the + * relative error of the approximation. + * + * @group agg_funcs + * @since 3.4.0 + */ + def percentile_approx(e: Column, percentage: Column, accuracy: Column): Column = + Column.fn("percentile_approx", e, percentage, accuracy) + + /** + * Aggregate function: returns the product of all numerical elements in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def product(e: Column): Column = Column.fn("product", e) + + /** + * Aggregate function: returns the skewness of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def skewness(e: Column): Column = Column.fn("skewness", e) + + /** + * Aggregate function: returns the skewness of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def skewness(columnName: String): Column = skewness(Column(columnName)) + + /** + * Aggregate function: alias for `stddev_samp`. + * + * @group agg_funcs + * @since 3.4.0 + */ + def stddev(e: Column): Column = Column.fn("stddev", e) + + /** + * Aggregate function: alias for `stddev_samp`. + * + * @group agg_funcs + * @since 3.4.0 + */ + def stddev(columnName: String): Column = stddev(Column(columnName)) + + /** + * Aggregate function: returns the sample standard deviation of the expression in a group. 
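`max_by`/`min_by` and `percentile_approx` round out the ordered aggregates; a sketch with hypothetical columns, where 0.95 is the target percentile and 10000 the accuracy parameter:

{{{
df.groupBy(col("endpoint")).agg(
  max_by(col("status"), col("timestamp")),                     // status of the most recent request
  min_by(col("status"), col("latency_ms")),                    // status of the fastest request
  percentile_approx(col("latency_ms"), lit(0.95), lit(10000))) // approximate p95 latency
}}}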
+ * + * @group agg_funcs + * @since 3.4.0 + */ + def stddev_samp(e: Column): Column = Column.fn("stddev_samp", e) + + /** + * Aggregate function: returns the sample standard deviation of the expression in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def stddev_samp(columnName: String): Column = stddev_samp(Column(columnName)) + + /** + * Aggregate function: returns the population standard deviation of the expression in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def stddev_pop(e: Column): Column = Column.fn("stddev_pop", e) + + /** + * Aggregate function: returns the population standard deviation of the expression in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def stddev_pop(columnName: String): Column = stddev_pop(Column(columnName)) + + /** + * Aggregate function: returns the sum of all values in the expression. + * + * @group agg_funcs + * @since 3.4.0 + */ + def sum(e: Column): Column = Column.fn("sum", e) + + /** + * Aggregate function: returns the sum of all values in the given column. + * + * @group agg_funcs + * @since 3.4.0 + */ + def sum(columnName: String): Column = sum(Column(columnName)) + + /** + * Aggregate function: returns the sum of distinct values in the expression. + * + * @group agg_funcs + * @since 3.4.0 + */ + @deprecated("Use sum_distinct", "3.2.0") + def sumDistinct(e: Column): Column = sum_distinct(e) + + /** + * Aggregate function: returns the sum of distinct values in the expression. + * + * @group agg_funcs + * @since 3.4.0 + */ + @deprecated("Use sum_distinct", "3.2.0") + def sumDistinct(columnName: String): Column = sum_distinct(Column(columnName)) + + /** + * Aggregate function: returns the sum of distinct values in the expression. + * + * @group agg_funcs + * @since 3.4.0 + */ + def sum_distinct(e: Column): Column = Column.fn("sum", isDistinct = true, e) + + /** + * Aggregate function: alias for `var_samp`. + * + * @group agg_funcs + * @since 3.4.0 + */ + def variance(e: Column): Column = Column.fn("variance", e) + + /** + * Aggregate function: alias for `var_samp`. + * + * @group agg_funcs + * @since 3.4.0 + */ + def variance(columnName: String): Column = variance(Column(columnName)) + + /** + * Aggregate function: returns the unbiased variance of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def var_samp(e: Column): Column = Column.fn("var_samp", e) + + /** + * Aggregate function: returns the unbiased variance of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def var_samp(columnName: String): Column = var_samp(Column(columnName)) + + /** + * Aggregate function: returns the population variance of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def var_pop(e: Column): Column = Column.fn("var_pop", e) + + /** + * Aggregate function: returns the population variance of the values in a group. + * + * @group agg_funcs + * @since 3.4.0 + */ + def var_pop(columnName: String): Column = var_pop(Column(columnName)) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Window functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Window function: returns the cumulative distribution of values within a window partition, + * i.e. the fraction of rows that are below the current row. 
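A small sketch contrasting `sum` with `sum_distinct` and the sample/population variance pair, under the same assumptions:

```scala
import org.apache.spark.sql.functions._

// Values 0,1,2,0,1,2 (duplicates on purpose).
val df = spark.range(6).select((col("id") % 3).as("x"))

df.agg(
  sum(col("x")).as("sum_all"),                 // 6
  sum_distinct(col("x")).as("sum_distinct"),   // 3
  var_samp(col("x")).as("sample_variance"),
  var_pop(col("x")).as("population_variance"))
  .show()
```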
+ * + * {{{ + * N = total number of rows in the partition + * cumeDist(x) = number of values before (and including) x / N + * }}} + * + * @group window_funcs + * @since 3.4.0 + */ + def cume_dist(): Column = Column.fn("cume_dist") + + /** + * Window function: returns the rank of rows within a window partition, without any gaps. + * + * The difference between rank and dense_rank is that denseRank leaves no gaps in ranking + * sequence when there are ties. That is, if you were ranking a competition using dense_rank and + * had three people tie for second place, you would say that all three were in second place and + * that the next person came in third. Rank would give me sequential numbers, making the person + * that came in third place (after the ties) would register as coming in fifth. + * + * This is equivalent to the DENSE_RANK function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def dense_rank(): Column = Column.fn("dense_rank") + + /** + * Window function: returns the value that is `offset` rows before the current row, and `null` + * if there is less than `offset` rows before the current row. For example, an `offset` of one + * will return the previous row at any given point in the window partition. + * + * This is equivalent to the LAG function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lag(e: Column, offset: Int): Column = lag(e, offset, null) + + /** + * Window function: returns the value that is `offset` rows before the current row, and `null` + * if there is less than `offset` rows before the current row. For example, an `offset` of one + * will return the previous row at any given point in the window partition. + * + * This is equivalent to the LAG function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lag(columnName: String, offset: Int): Column = lag(columnName, offset, null) + + /** + * Window function: returns the value that is `offset` rows before the current row, and + * `defaultValue` if there is less than `offset` rows before the current row. For example, an + * `offset` of one will return the previous row at any given point in the window partition. + * + * This is equivalent to the LAG function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lag(columnName: String, offset: Int, defaultValue: Any): Column = { + lag(Column(columnName), offset, defaultValue) + } + + /** + * Window function: returns the value that is `offset` rows before the current row, and + * `defaultValue` if there is less than `offset` rows before the current row. For example, an + * `offset` of one will return the previous row at any given point in the window partition. + * + * This is equivalent to the LAG function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lag(e: Column, offset: Int, defaultValue: Any): Column = { + lag(e, offset, defaultValue, ignoreNulls = false) + } + + /** + * Window function: returns the value that is `offset` rows before the current row, and + * `defaultValue` if there is less than `offset` rows before the current row. `ignoreNulls` + * determines whether null values of row are included in or eliminated from the calculation. For + * example, an `offset` of one will return the previous row at any given point in the window + * partition. + * + * This is equivalent to the LAG function in SQL. 
+ * + * @group window_funcs + * @since 3.4.0 + */ + def lag(e: Column, offset: Int, defaultValue: Any, ignoreNulls: Boolean): Column = + Column.fn("lag", e, lit(offset), lit(defaultValue), lit(ignoreNulls)) + + /** + * Window function: returns the value that is `offset` rows after the current row, and `null` if + * there is less than `offset` rows after the current row. For example, an `offset` of one will + * return the next row at any given point in the window partition. + * + * This is equivalent to the LEAD function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lead(columnName: String, offset: Int): Column = { + lead(columnName, offset, null) + } + + /** + * Window function: returns the value that is `offset` rows after the current row, and `null` if + * there is less than `offset` rows after the current row. For example, an `offset` of one will + * return the next row at any given point in the window partition. + * + * This is equivalent to the LEAD function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lead(e: Column, offset: Int): Column = { + lead(e, offset, null) + } + + /** + * Window function: returns the value that is `offset` rows after the current row, and + * `defaultValue` if there is less than `offset` rows after the current row. For example, an + * `offset` of one will return the next row at any given point in the window partition. + * + * This is equivalent to the LEAD function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lead(columnName: String, offset: Int, defaultValue: Any): Column = { + lead(Column(columnName), offset, defaultValue) + } + + /** + * Window function: returns the value that is `offset` rows after the current row, and + * `defaultValue` if there is less than `offset` rows after the current row. For example, an + * `offset` of one will return the next row at any given point in the window partition. + * + * This is equivalent to the LEAD function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lead(e: Column, offset: Int, defaultValue: Any): Column = { + lead(e, offset, defaultValue, ignoreNulls = false) + } + + /** + * Window function: returns the value that is `offset` rows after the current row, and + * `defaultValue` if there is less than `offset` rows after the current row. `ignoreNulls` + * determines whether null values of row are included in or eliminated from the calculation. The + * default value of `ignoreNulls` is false. For example, an `offset` of one will return the next + * row at any given point in the window partition. + * + * This is equivalent to the LEAD function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def lead(e: Column, offset: Int, defaultValue: Any, ignoreNulls: Boolean): Column = + Column.fn("lead", e, lit(offset), lit(defaultValue), lit(ignoreNulls)) + + /** + * Window function: returns the value that is the `offset`th row of the window frame (counting + * from 1), and `null` if the size of window frame is less than `offset` rows. + * + * It will return the `offset`th non-null value it sees when ignoreNulls is set to true. If all + * values are null, then null is returned. + * + * This is equivalent to the nth_value function in SQL. 
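A sketch of `lag`/`lead` over a window partition; it assumes the familiar `org.apache.spark.sql.expressions.Window` builder and `Column.over` are available in this client, which is not shown in this part of the diff:

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val w  = Window.partitionBy(col("grp")).orderBy(col("id"))
val df = spark.range(8).select(col("id"), (col("id") % 2).as("grp"))

df.select(
  col("grp"),
  col("id"),
  lag(col("id"), 1).over(w).as("prev_id"),              // null on the first row of a partition
  lead(col("id"), 1, -1L).over(w).as("next_or_minus1")) // default -1 past the last row
  .show()
```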
+ * + * @group window_funcs + * @since 3.4.0 + */ + def nth_value(e: Column, offset: Int, ignoreNulls: Boolean): Column = + Column.fn("nth_value", e, lit(offset), lit(ignoreNulls)) + + /** + * Window function: returns the value that is the `offset`th row of the window frame (counting + * from 1), and `null` if the size of window frame is less than `offset` rows. + * + * This is equivalent to the nth_value function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def nth_value(e: Column, offset: Int): Column = + Column.fn("nth_value", e, lit(offset)) + + /** + * Window function: returns the ntile group id (from 1 to `n` inclusive) in an ordered window + * partition. For example, if `n` is 4, the first quarter of the rows will get value 1, the + * second quarter will get 2, the third quarter will get 3, and the last quarter will get 4. + * + * This is equivalent to the NTILE function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def ntile(n: Int): Column = Column.fn("ntile", lit(n)) + + /** + * Window function: returns the relative rank (i.e. percentile) of rows within a window + * partition. + * + * This is computed by: + * {{{ + * (rank of row in its partition - 1) / (number of rows in the partition - 1) + * }}} + * + * This is equivalent to the PERCENT_RANK function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def percent_rank(): Column = Column.fn("percent_rank") + + /** + * Window function: returns the rank of rows within a window partition. + * + * The difference between rank and dense_rank is that dense_rank leaves no gaps in ranking + * sequence when there are ties. That is, if you were ranking a competition using dense_rank and + * had three people tie for second place, you would say that all three were in second place and + * that the next person came in third. Rank would give me sequential numbers, making the person + * that came in third place (after the ties) would register as coming in fifth. + * + * This is equivalent to the RANK function in SQL. + * + * @group window_funcs + * @since 3.4.0 + */ + def rank(): Column = Column.fn("rank") + + /** + * Window function: returns a sequential number starting at 1 within a window partition. + * + * @group window_funcs + * @since 3.4.0 + */ + def row_number(): Column = Column.fn("row_number") + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Non-aggregate functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Creates a new array column. The input columns must all have the same data type. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def array(cols: Column*): Column = Column.fn("array", cols: _*) + + /** + * Creates a new array column. The input columns must all have the same data type. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def array(colName: String, colNames: String*): Column = { + array((colName +: colNames).map(col): _*) + } + + /** + * Creates a new map column. The input columns must be grouped as key-value pairs, e.g. (key1, + * value1, key2, value2, ...). The key columns must all have the same data type, and can't be + * null. The value columns must all have the same data type. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def map(cols: Column*): Column = Column.fn("map", cols: _*) + + /** + * Creates a new map column. The array in the first column is used for keys. 
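Under the same `Window` assumption, `row_number` and `ntile` combine with the `array`/`map` constructors from this block as follows:

```scala
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._

val w  = Window.partitionBy(col("grp")).orderBy(col("id"))
val df = spark.range(8).select(col("id"), (col("id") % 2).as("grp"))

df.select(
  col("grp"),
  col("id"),
  row_number().over(w).as("rn"),
  ntile(2).over(w).as("half"),
  array(col("id"), col("grp")).as("as_array"),   // both inputs are BIGINT
  map(lit("grp"), col("grp")).as("as_map"))      // literal, non-null key
  .show()
```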
The array in the + * second column is used for values. All elements in the array for key should not be null. + * + * @group normal_funcs + * @since 3.4.0 + */ + def map_from_arrays(keys: Column, values: Column): Column = + Column.fn("map_from_arrays", keys, values) + + /** + * Marks a DataFrame as small enough for use in broadcast joins. + * + * The following example marks the right DataFrame for broadcast hash join using `joinKey`. + * {{{ + * // left and right are DataFrames + * left.join(broadcast(right), "joinKey") + * }}} + * + * @group normal_funcs + * @since 3.4.0 + */ + def broadcast[T](df: Dataset[T]): Dataset[T] = { + df.hint("broadcast") + } + + /** + * Returns the first column that is not null, or null if all inputs are null. + * + * For example, `coalesce(a, b, c)` will return a if a is not null, or b if a is null and b is + * not null, or c if both a and b are null but c is not null. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def coalesce(e: Column*): Column = Column.fn("coalesce", e: _*) + + /** + * Creates a string column for the file name of the current Spark task. + * + * @group normal_funcs + * @since 3.4.0 + */ + def input_file_name(): Column = Column.fn("input_file_name") + + /** + * Return true iff the column is NaN. + * + * @group normal_funcs + * @since 3.4.0 + */ + def isnan(e: Column): Column = e.isNaN + + /** + * Return true iff the column is null. + * + * @group normal_funcs + * @since 3.4.0 + */ + def isnull(e: Column): Column = e.isNull + + /** + * A column expression that generates monotonically increasing 64-bit integers. + * + * The generated ID is guaranteed to be monotonically increasing and unique, but not + * consecutive. The current implementation puts the partition ID in the upper 31 bits, and the + * record number within each partition in the lower 33 bits. The assumption is that the data + * frame has less than 1 billion partitions, and each partition has less than 8 billion records. + * + * As an example, consider a `DataFrame` with two partitions, each with 3 records. This + * expression would return the following IDs: + * + * {{{ + * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * }}} + * + * @group normal_funcs + * @since 3.4.0 + */ + @deprecated("Use monotonically_increasing_id()", "2.0.0") + def monotonicallyIncreasingId(): Column = monotonically_increasing_id() + + /** + * A column expression that generates monotonically increasing 64-bit integers. + * + * The generated ID is guaranteed to be monotonically increasing and unique, but not + * consecutive. The current implementation puts the partition ID in the upper 31 bits, and the + * record number within each partition in the lower 33 bits. The assumption is that the data + * frame has less than 1 billion partitions, and each partition has less than 8 billion records. + * + * As an example, consider a `DataFrame` with two partitions, each with 3 records. This + * expression would return the following IDs: + * + * {{{ + * 0, 1, 2, 8589934592 (1L << 33), 8589934593, 8589934594. + * }}} + * + * @group normal_funcs + * @since 3.4.0 + */ + def monotonically_increasing_id(): Column = Column.fn("monotonically_increasing_id") + + /** + * Returns col1 if it is not NaN, or col2 if col1 is NaN. + * + * Both inputs should be floating point columns (DoubleType or FloatType). + * + * @group normal_funcs + * @since 3.4.0 + */ + def nanvl(col1: Column, col2: Column): Column = Column.fn("nanvl", col1, col2) + + /** + * Unary minus, i.e. negate the expression. 
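A sketch of the broadcast-join hint together with `coalesce`, assuming a `spark` session and the usual `Dataset.join(right, usingColumns, joinType)` overload:

```scala
import org.apache.spark.sql.functions._

val facts = spark.range(1000).select(col("id"), (col("id") % 10).as("key"))
val dims  = spark.range(10).select(col("id").as("key"), col("id").cast("string").as("label"))

// Mark the small side for a broadcast hash join, then fill any missing labels.
facts.join(broadcast(dims), Seq("key"), "left")
  .select(col("id"), coalesce(col("label"), lit("unknown")).as("label"))
  .show(5)
```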
+ * {{{ + * // Select the amount column and negates all values. + * // Scala: + * df.select( -df("amount") ) + * + * // Java: + * df.select( negate(df.col("amount")) ); + * }}} + * + * @group normal_funcs + * @since 3.4.0 + */ + def negate(e: Column): Column = -e + + /** + * Inversion of boolean expression, i.e. NOT. + * {{{ + * // Scala: select rows that are not active (isActive === false) + * df.filter( !df("isActive") ) + * + * // Java: + * df.filter( not(df.col("isActive")) ); + * }}} + * + * @group normal_funcs + * @since 3.4.0 + */ + def not(e: Column): Column = !e + + /** + * Generate a random column with independent and identically distributed (i.i.d.) samples + * uniformly distributed in [0.0, 1.0). + * + * @note + * The function is non-deterministic in general case. + * + * @group normal_funcs + * @since 3.4.0 + */ + def rand(seed: Long): Column = Column.fn("rand", lit(seed)) + + /** + * Generate a random column with independent and identically distributed (i.i.d.) samples + * uniformly distributed in [0.0, 1.0). + * + * @note + * The function is non-deterministic in general case. + * + * @group normal_funcs + * @since 3.4.0 + */ + def rand(): Column = Column.fn("rand") + + /** + * Generate a column with independent and identically distributed (i.i.d.) samples from the + * standard normal distribution. + * + * @note + * The function is non-deterministic in general case. + * + * @group normal_funcs + * @since 3.4.0 + */ + def randn(seed: Long): Column = Column.fn("randn", lit(seed)) + + /** + * Generate a column with independent and identically distributed (i.i.d.) samples from the + * standard normal distribution. + * + * @note + * The function is non-deterministic in general case. + * + * @group normal_funcs + * @since 3.4.0 + */ + def randn(): Column = Column.fn("randn") + + /** + * Partition ID. + * + * @note + * This is non-deterministic because it depends on data partitioning and task scheduling. + * + * @group normal_funcs + * @since 3.4.0 + */ + def spark_partition_id(): Column = Column.fn("spark_partition_id") + + /** + * Computes the square root of the specified float value. + * + * @group math_funcs + * @since 3.4.0 + */ + def sqrt(e: Column): Column = Column.fn("sqrt", e) + + /** + * Computes the square root of the specified float value. + * + * @group math_funcs + * @since 3.4.0 + */ + def sqrt(colName: String): Column = sqrt(Column(colName)) + + /** + * Creates a new struct column. If the input column is a column in a `DataFrame`, or a derived + * column expression that is named (i.e. aliased), its name would be retained as the + * StructField's name, otherwise, the newly generated StructField's name would be auto generated + * as `col` with a suffix `index + 1`, i.e. col1, col2, col3, ... + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def struct(cols: Column*): Column = Column.fn("struct", cols: _*) + + /** + * Creates a new struct column that composes multiple input columns. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def struct(colName: String, colNames: String*): Column = { + struct((colName +: colNames).map(col): _*) + } + + /** + * Evaluates a list of conditions and returns one of multiple possible result expressions. If + * otherwise is not defined at the end, null is returned for unmatched conditions. + * + * {{{ + * // Example: encoding gender string column into integer. 
+ * + * // Scala: + * people.select(when(people("gender") === "male", 0) + * .when(people("gender") === "female", 1) + * .otherwise(2)) + * + * // Java: + * people.select(when(col("gender").equalTo("male"), 0) + * .when(col("gender").equalTo("female"), 1) + * .otherwise(2)) + * }}} + * + * @group normal_funcs + * @since 3.4.0 + */ + def when(condition: Column, value: Any): Column = Column { builder => + builder.getUnresolvedFunctionBuilder + .setFunctionName("when") + .addArguments(condition.expr) + .addArguments(lit(value).expr) + } + + /** + * Computes bitwise NOT (~) of a number. + * + * @group normal_funcs + * @since 3.4.0 + */ + @deprecated("Use bitwise_not", "3.2.0") + def bitwiseNOT(e: Column): Column = bitwise_not(e) + + /** + * Computes bitwise NOT (~) of a number. + * + * @group normal_funcs + * @since 3.4.0 + */ + def bitwise_not(e: Column): Column = Column.fn("~", e) + + /** + * Parses the expression string into the column that it represents, similar to + * [[Dataset#selectExpr]]. + * {{{ + * // get the number of words of each length + * df.groupBy(expr("length(word)")).count() + * }}} + * + * @group normal_funcs + */ + def expr(expr: String): Column = Column { builder => + builder.getExpressionStringBuilder.setExpression(expr) + } + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Math Functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Computes the absolute value of a numeric value. + * + * @group math_funcs + * @since 3.4.0 + */ + def abs(e: Column): Column = Column.fn("abs", e) + + /** + * @return + * inverse cosine of `e` in radians, as if computed by `java.lang.Math.acos` + * + * @group math_funcs + * @since 3.4.0 + */ + def acos(e: Column): Column = Column.fn("acos", e) + + /** + * @return + * inverse cosine of `columnName`, as if computed by `java.lang.Math.acos` + * + * @group math_funcs + * @since 3.4.0 + */ + def acos(columnName: String): Column = acos(Column(columnName)) + + /** + * @return + * inverse hyperbolic cosine of `e` + * + * @group math_funcs + * @since 3.4.0 + */ + def acosh(e: Column): Column = Column.fn("acosh", e) + + /** + * @return + * inverse hyperbolic cosine of `columnName` + * + * @group math_funcs + * @since 3.4.0 + */ + def acosh(columnName: String): Column = acosh(Column(columnName)) + + /** + * @return + * inverse sine of `e` in radians, as if computed by `java.lang.Math.asin` + * + * @group math_funcs + * @since 3.4.0 + */ + def asin(e: Column): Column = Column.fn("asin", e) + + /** + * @return + * inverse sine of `columnName`, as if computed by `java.lang.Math.asin` + * + * @group math_funcs + * @since 3.4.0 + */ + def asin(columnName: String): Column = asin(Column(columnName)) + + /** + * @return + * inverse hyperbolic sine of `e` + * + * @group math_funcs + * @since 3.4.0 + */ + def asinh(e: Column): Column = Column.fn("asinh", e) + + /** + * @return + * inverse hyperbolic sine of `columnName` + * + * @group math_funcs + * @since 3.4.0 + */ + def asinh(columnName: String): Column = asinh(Column(columnName)) + + /** + * @return + * inverse tangent of `e` as if computed by `java.lang.Math.atan` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan(e: Column): Column = Column.fn("atan", e) + + /** + * @return + * inverse tangent of `columnName`, as if computed by `java.lang.Math.atan` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan(columnName: String): Column = atan(Column(columnName)) + + /** + * @param y + 
* coordinate on y-axis + * @param x + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(y: Column, x: Column): Column = Column.fn("atan2", y, x) + + /** + * @param y + * coordinate on y-axis + * @param xName + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(y: Column, xName: String): Column = atan2(y, Column(xName)) + + /** + * @param yName + * coordinate on y-axis + * @param x + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(yName: String, x: Column): Column = atan2(Column(yName), x) + + /** + * @param yName + * coordinate on y-axis + * @param xName + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(yName: String, xName: String): Column = + atan2(Column(yName), Column(xName)) + + /** + * @param y + * coordinate on y-axis + * @param xValue + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(y: Column, xValue: Double): Column = atan2(y, lit(xValue)) + + /** + * @param yName + * coordinate on y-axis + * @param xValue + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(yName: String, xValue: Double): Column = atan2(Column(yName), xValue) + + /** + * @param yValue + * coordinate on y-axis + * @param x + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(yValue: Double, x: Column): Column = atan2(lit(yValue), x) + + /** + * @param yValue + * coordinate on y-axis + * @param xName + * coordinate on x-axis + * @return + * the theta component of the point (r, theta) in polar coordinates that + * corresponds to the point (x, y) in Cartesian coordinates, as if computed by + * `java.lang.Math.atan2` + * + * @group math_funcs + * @since 3.4.0 + */ + def atan2(yValue: Double, xName: String): Column = atan2(yValue, Column(xName)) + + /** + * @return + * inverse hyperbolic tangent of `e` + * + * @group math_funcs + * @since 3.4.0 + */ + def atanh(e: Column): Column = Column.fn("atanh", e) + + /** + * @return + * inverse hyperbolic tangent of `columnName` + * + * @group math_funcs + * @since 3.4.0 + */ + def atanh(columnName: String): Column = 
atanh(Column(columnName)) + + /** + * An expression that returns the string representation of the binary value of the given long + * column. For example, bin("12") returns "1100". + * + * @group math_funcs + * @since 3.4.0 + */ + def bin(e: Column): Column = Column.fn("bin", e) + + /** + * An expression that returns the string representation of the binary value of the given long + * column. For example, bin("12") returns "1100". + * + * @group math_funcs + * @since 3.4.0 + */ + def bin(columnName: String): Column = bin(Column(columnName)) + + /** + * Computes the cube-root of the given value. + * + * @group math_funcs + * @since 3.4.0 + */ + def cbrt(e: Column): Column = Column.fn("cbrt", e) + + /** + * Computes the cube-root of the given column. + * + * @group math_funcs + * @since 3.4.0 + */ + def cbrt(columnName: String): Column = cbrt(Column(columnName)) + + /** + * Computes the ceiling of the given value of `e` to `scale` decimal places. + * + * @group math_funcs + * @since 3.4.0 + */ + def ceil(e: Column, scale: Column): Column = Column.fn("ceil", e, scale) + + /** + * Computes the ceiling of the given value of `e` to 0 decimal places. + * + * @group math_funcs + * @since 3.4.0 + */ + def ceil(e: Column): Column = Column.fn("ceil", e) + + /** + * Computes the ceiling of the given value of `e` to 0 decimal places. + * + * @group math_funcs + * @since 3.4.0 + */ + def ceil(columnName: String): Column = ceil(Column(columnName)) + + /** + * Convert a number in a string column from one base to another. + * + * @group math_funcs + * @since 3.4.0 + */ + def conv(num: Column, fromBase: Int, toBase: Int): Column = + Column.fn("conv", num, lit(fromBase), lit(toBase)) + + /** + * @param e + * angle in radians + * @return + * cosine of the angle, as if computed by `java.lang.Math.cos` + * + * @group math_funcs + * @since 3.4.0 + */ + def cos(e: Column): Column = Column.fn("cos", e) + + /** + * @param columnName + * angle in radians + * @return + * cosine of the angle, as if computed by `java.lang.Math.cos` + * + * @group math_funcs + * @since 3.4.0 + */ + def cos(columnName: String): Column = cos(Column(columnName)) + + /** + * @param e + * hyperbolic angle + * @return + * hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh` + * + * @group math_funcs + * @since 3.4.0 + */ + def cosh(e: Column): Column = Column.fn("cosh", e) + + /** + * @param columnName + * hyperbolic angle + * @return + * hyperbolic cosine of the angle, as if computed by `java.lang.Math.cosh` + * + * @group math_funcs + * @since 3.4.0 + */ + def cosh(columnName: String): Column = cosh(Column(columnName)) + + /** + * @param e + * angle in radians + * @return + * cotangent of the angle + * + * @group math_funcs + * @since 3.4.0 + */ + def cot(e: Column): Column = Column.fn("cot", e) + + /** + * @param e + * angle in radians + * @return + * cosecant of the angle + * + * @group math_funcs + * @since 3.4.0 + */ + def csc(e: Column): Column = Column.fn("csc", e) + + /** + * Computes the exponential of the given value. + * + * @group math_funcs + * @since 3.4.0 + */ + def exp(e: Column): Column = Column.fn("exp", e) + + /** + * Computes the exponential of the given column. + * + * @group math_funcs + * @since 3.4.0 + */ + def exp(columnName: String): Column = exp(Column(columnName)) + + /** + * Computes the exponential of the given value minus one. + * + * @group math_funcs + * @since 3.4.0 + */ + def expm1(e: Column): Column = Column.fn("expm1", e) + + /** + * Computes the exponential of the given column minus one. 
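A sketch tying together `expr`, `ceil` with a scale, `conv` and `bin` from the blocks above, under the same `spark`-session assumption:

```scala
import org.apache.spark.sql.functions._

val df = spark.range(1).select(lit(3.14159).as("pi"), lit("12").as("twelve"))

df.select(
  ceil(col("pi"), lit(2)).as("ceil_2dp"),              // 3.15
  conv(col("twelve"), 10, 2).as("decimal_to_binary"),  // "1100"
  bin(lit(12)).as("bin_of_12"),                        // "1100"
  expr("acos(-1)").as("acos_minus_one"))               // pi, via a SQL expression string
  .show()
```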
+ * + * @group math_funcs + * @since 3.4.0 + */ + def expm1(columnName: String): Column = expm1(Column(columnName)) + + /** + * Computes the factorial of the given value. + * + * @group math_funcs + * @since 3.4.0 + */ + def factorial(e: Column): Column = Column.fn("factorial", e) + + /** + * Computes the floor of the given value of `e` to `scale` decimal places. + * + * @group math_funcs + * @since 3.4.0 + */ + def floor(e: Column, scale: Column): Column = Column.fn("floor", e, scale) + + /** + * Computes the floor of the given value of `e` to 0 decimal places. + * + * @group math_funcs + * @since 3.4.0 + */ + def floor(e: Column): Column = Column.fn("floor", e) + + /** + * Computes the floor of the given column value to 0 decimal places. + * + * @group math_funcs + * @since 3.4.0 + */ + def floor(columnName: String): Column = floor(Column(columnName)) + + /** + * Returns the greatest value of the list of values, skipping null values. This function takes + * at least 2 parameters. It will return null iff all parameters are null. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def greatest(exprs: Column*): Column = Column.fn("greatest", exprs: _*) + + /** + * Returns the greatest value of the list of column names, skipping null values. This function + * takes at least 2 parameters. It will return null iff all parameters are null. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def greatest(columnName: String, columnNames: String*): Column = + greatest((columnName +: columnNames).map(Column.apply): _*) + + /** + * Computes hex value of the given column. + * + * @group math_funcs + * @since 3.4.0 + */ + def hex(column: Column): Column = Column.fn("hex", column) + + /** + * Inverse of hex. Interprets each pair of characters as a hexadecimal number and converts to + * the byte representation of number. + * + * @group math_funcs + * @since 3.4.0 + */ + def unhex(column: Column): Column = Column.fn("unhex", column) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(l: Column, r: Column): Column = Column.fn("hypot", l, r) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(l: Column, rightName: String): Column = hypot(l, Column(rightName)) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(leftName: String, r: Column): Column = hypot(Column(leftName), r) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(leftName: String, rightName: String): Column = + hypot(Column(leftName), Column(rightName)) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(l: Column, r: Double): Column = hypot(l, lit(r)) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(leftName: String, r: Double): Column = hypot(Column(leftName), r) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. + * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(l: Double, r: Column): Column = hypot(lit(l), r) + + /** + * Computes `sqrt(a^2^ + b^2^)` without intermediate overflow or underflow. 
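`greatest` skips nulls rather than propagating them, which the following sketch makes visible; `hypot`, `hex` and `factorial` from the same block are included for comparison:

```scala
import org.apache.spark.sql.functions._

val df = spark.range(1).select(
  lit(3).as("a"),
  lit(null).cast("int").as("b"),
  lit(7).as("c"))

df.select(
  greatest(col("a"), col("b"), col("c")).as("max_non_null"),  // 7, the null is skipped
  hypot(lit(3.0), lit(4.0)).as("hyp"),                        // 5.0
  hex(lit(255)).as("ff"),                                     // "FF"
  factorial(lit(5)).as("five_factorial"))                     // 120
  .show()
```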
+ * + * @group math_funcs + * @since 3.4.0 + */ + def hypot(l: Double, rightName: String): Column = hypot(l, Column(rightName)) + + /** + * Returns the least value of the list of values, skipping null values. This function takes at + * least 2 parameters. It will return null iff all parameters are null. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def least(exprs: Column*): Column = Column.fn("least", exprs: _*) + + /** + * Returns the least value of the list of column names, skipping null values. This function + * takes at least 2 parameters. It will return null iff all parameters are null. + * + * @group normal_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def least(columnName: String, columnNames: String*): Column = + least((columnName +: columnNames).map(Column.apply): _*) + + /** + * Computes the natural logarithm of the given value. + * + * @group math_funcs + * @since 3.4.0 + */ + def log(e: Column): Column = Column.fn("log", e) + + /** + * Computes the natural logarithm of the given column. + * + * @group math_funcs + * @since 3.4.0 + */ + def log(columnName: String): Column = log(Column(columnName)) + + /** + * Returns the first argument-base logarithm of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def log(base: Double, a: Column): Column = Column.fn("log", lit(base), a) + + /** + * Returns the first argument-base logarithm of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def log(base: Double, columnName: String): Column = log(base, Column(columnName)) + + /** + * Computes the logarithm of the given value in base 10. + * + * @group math_funcs + * @since 3.4.0 + */ + def log10(e: Column): Column = Column.fn("log10", e) + + /** + * Computes the logarithm of the given value in base 10. + * + * @group math_funcs + * @since 3.4.0 + */ + def log10(columnName: String): Column = log10(Column(columnName)) + + /** + * Computes the natural logarithm of the given value plus one. + * + * @group math_funcs + * @since 3.4.0 + */ + def log1p(e: Column): Column = Column.fn("log1p", e) + + /** + * Computes the natural logarithm of the given column plus one. + * + * @group math_funcs + * @since 3.4.0 + */ + def log1p(columnName: String): Column = log1p(Column(columnName)) + + /** + * Computes the logarithm of the given column in base 2. + * + * @group math_funcs + * @since 3.4.0 + */ + def log2(expr: Column): Column = Column.fn("log2", expr) + + /** + * Computes the logarithm of the given value in base 2. + * + * @group math_funcs + * @since 3.4.0 + */ + def log2(columnName: String): Column = log2(Column(columnName)) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(l: Column, r: Column): Column = Column.fn("power", l, r) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(l: Column, rightName: String): Column = pow(l, Column(rightName)) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(leftName: String, r: Column): Column = pow(Column(leftName), r) + + /** + * Returns the value of the first argument raised to the power of the second argument. 
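A sketch of the logarithm family and `pow`; `log(2.0, x)` and `log2(x)` should agree, which is a cheap sanity check:

```scala
import org.apache.spark.sql.functions._

val df = spark.range(1).select(lit(8.0).as("x"))

df.select(
  log(col("x")).as("ln_x"),              // natural logarithm
  log(2.0, col("x")).as("log_base2_x"),  // 3.0
  log2(col("x")).as("log2_x"),           // 3.0, same result
  pow(col("x"), 2.0).as("x_squared"),    // 64.0
  least(lit(1), lit(null).cast("int"), lit(5)).as("min_non_null"))  // 1, null skipped
  .show()
```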
+ * + * @group math_funcs + * @since 3.4.0 + */ + def pow(leftName: String, rightName: String): Column = pow(Column(leftName), Column(rightName)) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(l: Column, r: Double): Column = pow(l, lit(r)) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(leftName: String, r: Double): Column = pow(Column(leftName), r) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(l: Double, r: Column): Column = pow(lit(l), r) + + /** + * Returns the value of the first argument raised to the power of the second argument. + * + * @group math_funcs + * @since 3.4.0 + */ + def pow(l: Double, rightName: String): Column = pow(l, Column(rightName)) + + /** + * Returns the positive value of dividend mod divisor. + * + * @group math_funcs + * @since 3.4.0 + */ + def pmod(dividend: Column, divisor: Column): Column = Column.fn("pmod", dividend, divisor) + + /** + * Returns the double value that is closest in value to the argument and is equal to a + * mathematical integer. + * + * @group math_funcs + * @since 3.4.0 + */ + def rint(e: Column): Column = Column.fn("rint", e) + + /** + * Returns the double value that is closest in value to the argument and is equal to a + * mathematical integer. + * + * @group math_funcs + * @since 3.4.0 + */ + def rint(columnName: String): Column = rint(Column(columnName)) + + /** + * Returns the value of the column `e` rounded to 0 decimal places with HALF_UP round mode. + * + * @group math_funcs + * @since 3.4.0 + */ + def round(e: Column): Column = round(e, 0) + + /** + * Round the value of `e` to `scale` decimal places with HALF_UP round mode if `scale` is + * greater than or equal to 0 or at integral part when `scale` is less than 0. + * + * @group math_funcs + * @since 3.4.0 + */ + def round(e: Column, scale: Int): Column = Column.fn("round", e, lit(scale)) + + /** + * Returns the value of the column `e` rounded to 0 decimal places with HALF_EVEN round mode. + * + * @group math_funcs + * @since 3.4.0 + */ + def bround(e: Column): Column = bround(e, 0) + + /** + * Round the value of `e` to `scale` decimal places with HALF_EVEN round mode if `scale` is + * greater than or equal to 0 or at integral part when `scale` is less than 0. + * + * @group math_funcs + * @since 3.4.0 + */ + def bround(e: Column, scale: Int): Column = Column.fn("bround", e, lit(scale)) + + /** + * @param e + * angle in radians + * @return + * secant of the angle + * + * @group math_funcs + * @since 3.4.0 + */ + def sec(e: Column): Column = Column.fn("sec", e) + + /** + * Shift the given value numBits left. If the given value is a long value, this function will + * return a long value else it will return an integer value. + * + * @group math_funcs + * @since 3.4.0 + */ + @deprecated("Use shiftleft", "3.2.0") + def shiftLeft(e: Column, numBits: Int): Column = shiftleft(e, numBits) + + /** + * Shift the given value numBits left. If the given value is a long value, this function will + * return a long value else it will return an integer value. + * + * @group math_funcs + * @since 3.4.0 + */ + def shiftleft(e: Column, numBits: Int): Column = Column.fn("shiftleft", e, lit(numBits)) + + /** + * (Signed) shift the given value numBits right. 
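The HALF_UP/HALF_EVEN distinction between `round` and `bround` only shows up at the .5 boundary, so a sketch with 2.5 is a useful probe; `pmod` and `shiftleft` come from the same block:

```scala
import org.apache.spark.sql.functions._

val df = spark.range(1).select(lit(2.5).as("x"), lit(-7).as("n"))

df.select(
  round(col("x")).as("half_up"),         // 3.0
  bround(col("x")).as("half_even"),      // 2.0, rounds to the nearest even digit
  pmod(col("n"), lit(3)).as("pos_mod"),  // 2, non-negative for a positive divisor
  shiftleft(lit(1), 10).as("kib"))       // 1024
  .show()
```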
If the given value is a long value, it will + * return a long value else it will return an integer value. + * + * @group math_funcs + * @since 3.4.0 + */ + @deprecated("Use shiftright", "3.2.0") + def shiftRight(e: Column, numBits: Int): Column = shiftright(e, numBits) + + /** + * (Signed) shift the given value numBits right. If the given value is a long value, it will + * return a long value else it will return an integer value. + * + * @group math_funcs + * @since 3.4.0 + */ + def shiftright(e: Column, numBits: Int): Column = Column.fn("shiftright", e, lit(numBits)) + + /** + * Unsigned shift the given value numBits right. If the given value is a long value, it will + * return a long value else it will return an integer value. + * + * @group math_funcs + * @since 3.4.0 + */ + @deprecated("Use shiftrightunsigned", "3.2.0") + def shiftRightUnsigned(e: Column, numBits: Int): Column = shiftrightunsigned(e, numBits) + + /** + * Unsigned shift the given value numBits right. If the given value is a long value, it will + * return a long value else it will return an integer value. + * + * @group math_funcs + * @since 3.4.0 + */ + def shiftrightunsigned(e: Column, numBits: Int): Column = + Column.fn("shiftrightunsigned", e, lit(numBits)) + + /** + * Computes the signum of the given value. + * + * @group math_funcs + * @since 3.4.0 + */ + def signum(e: Column): Column = Column.fn("signum", e) + + /** + * Computes the signum of the given column. + * + * @group math_funcs + * @since 3.4.0 + */ + def signum(columnName: String): Column = signum(Column(columnName)) + + /** + * @param e + * angle in radians + * @return + * sine of the angle, as if computed by `java.lang.Math.sin` + * + * @group math_funcs + * @since 3.4.0 + */ + def sin(e: Column): Column = Column.fn("sin", e) + + /** + * @param columnName + * angle in radians + * @return + * sine of the angle, as if computed by `java.lang.Math.sin` + * + * @group math_funcs + * @since 3.4.0 + */ + def sin(columnName: String): Column = sin(Column(columnName)) + + /** + * @param e + * hyperbolic angle + * @return + * hyperbolic sine of the given value, as if computed by `java.lang.Math.sinh` + * + * @group math_funcs + * @since 3.4.0 + */ + def sinh(e: Column): Column = Column.fn("sinh", e) + + /** + * @param columnName + * hyperbolic angle + * @return + * hyperbolic sine of the given value, as if computed by `java.lang.Math.sinh` + * + * @group math_funcs + * @since 3.4.0 + */ + def sinh(columnName: String): Column = sinh(Column(columnName)) + + /** + * @param e + * angle in radians + * @return + * tangent of the given value, as if computed by `java.lang.Math.tan` + * + * @group math_funcs + * @since 3.4.0 + */ + def tan(e: Column): Column = Column.fn("tan", e) + + /** + * @param columnName + * angle in radians + * @return + * tangent of the given value, as if computed by `java.lang.Math.tan` + * + * @group math_funcs + * @since 3.4.0 + */ + def tan(columnName: String): Column = tan(Column(columnName)) + + /** + * @param e + * hyperbolic angle + * @return + * hyperbolic tangent of the given value, as if computed by `java.lang.Math.tanh` + * + * @group math_funcs + * @since 3.4.0 + */ + def tanh(e: Column): Column = Column.fn("tanh", e) + + /** + * @param columnName + * hyperbolic angle + * @return + * hyperbolic tangent of the given value, as if computed by `java.lang.Math.tanh` + * + * @group math_funcs + * @since 3.4.0 + */ + def tanh(columnName: String): Column = tanh(Column(columnName)) + + /** + * @group math_funcs + * @since 3.4.0 + */ + 
@deprecated("Use degrees", "2.1.0") + def toDegrees(e: Column): Column = degrees(e) + + /** + * @group math_funcs + * @since 3.4.0 + */ + @deprecated("Use degrees", "2.1.0") + def toDegrees(columnName: String): Column = degrees(Column(columnName)) + + /** + * Converts an angle measured in radians to an approximately equivalent angle measured in + * degrees. + * + * @param e + * angle in radians + * @return + * angle in degrees, as if computed by `java.lang.Math.toDegrees` + * + * @group math_funcs + * @since 3.4.0 + */ + def degrees(e: Column): Column = Column.fn("degrees", e) + + /** + * Converts an angle measured in radians to an approximately equivalent angle measured in + * degrees. + * + * @param columnName + * angle in radians + * @return + * angle in degrees, as if computed by `java.lang.Math.toDegrees` + * + * @group math_funcs + * @since 3.4.0 + */ + def degrees(columnName: String): Column = degrees(Column(columnName)) + + /** + * @group math_funcs + * @since 3.4.0 + */ + @deprecated("Use radians", "2.1.0") + def toRadians(e: Column): Column = radians(e) + + /** + * @group math_funcs + * @since 3.4.0 + */ + @deprecated("Use radians", "2.1.0") + def toRadians(columnName: String): Column = radians(Column(columnName)) + + /** + * Converts an angle measured in degrees to an approximately equivalent angle measured in + * radians. + * + * @param e + * angle in degrees + * @return + * angle in radians, as if computed by `java.lang.Math.toRadians` + * + * @group math_funcs + * @since 3.4.0 + */ + def radians(e: Column): Column = Column.fn("radians", e) + + /** + * Converts an angle measured in degrees to an approximately equivalent angle measured in + * radians. + * + * @param columnName + * angle in degrees + * @return + * angle in radians, as if computed by `java.lang.Math.toRadians` + * + * @group math_funcs + * @since 3.4.0 + */ + def radians(columnName: String): Column = radians(Column(columnName)) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Misc functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Calculates the MD5 digest of a binary column and returns the value as a 32 character hex + * string. + * + * @group misc_funcs + * @since 3.4.0 + */ + def md5(e: Column): Column = Column.fn("md5", e) + + /** + * Calculates the SHA-1 digest of a binary column and returns the value as a 40 character hex + * string. + * + * @group misc_funcs + * @since 3.4.0 + */ + def sha1(e: Column): Column = Column.fn("sha1", e) + + /** + * Calculates the SHA-2 family of hash functions of a binary column and returns the value as a + * hex string. + * + * @param e + * column to compute SHA-2 on. + * @param numBits + * one of 224, 256, 384, or 512. + * + * @group misc_funcs + * @since 3.4.0 + */ + def sha2(e: Column, numBits: Int): Column = { + require( + Seq(0, 224, 256, 384, 512).contains(numBits), + s"numBits $numBits is not in the permitted values (0, 224, 256, 384, 512)") + Column.fn("sha2", e, lit(numBits)) + } + + /** + * Calculates the cyclic redundancy check value (CRC32) of a binary column and returns the value + * as a bigint. + * + * @group misc_funcs + * @since 3.4.0 + */ + def crc32(e: Column): Column = Column.fn("crc32", e) + + /** + * Calculates the hash code of given columns, and returns the result as an int column. 
+ * + * @group misc_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def hash(cols: Column*): Column = Column.fn("hash", cols: _*) + + /** + * Calculates the hash code of given columns using the 64-bit variant of the xxHash algorithm, + * and returns the result as a long column. The hash computation uses an initial seed of 42. + * + * @group misc_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def xxhash64(cols: Column*): Column = Column.fn("xxhash64", cols: _*) + + /** + * Returns null if the condition is true, and throws an exception otherwise. + * + * @group misc_funcs + * @since 3.4.0 + */ + def assert_true(c: Column): Column = Column.fn("assert_true", c) + + /** + * Returns null if the condition is true; throws an exception with the error message otherwise. + * + * @group misc_funcs + * @since 3.4.0 + */ + def assert_true(c: Column, e: Column): Column = Column.fn("assert_true", c, e) + + /** + * Throws an exception with the provided error message. + * + * @group misc_funcs + * @since 3.4.0 + */ + def raise_error(c: Column): Column = Column.fn("raise_error", c) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // String functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Computes the numeric value of the first character of the string column, and returns the + * result as an int column. + * + * @group string_funcs + * @since 3.4.0 + */ + def ascii(e: Column): Column = Column.fn("ascii", e) + + /** + * Computes the BASE64 encoding of a binary column and returns it as a string column. This is + * the reverse of unbase64. + * + * @group string_funcs + * @since 3.4.0 + */ + def base64(e: Column): Column = Column.fn("base64", e) + + /** + * Calculates the bit length for the specified string column. + * + * @group string_funcs + * @since 3.4.0 + */ + def bit_length(e: Column): Column = Column.fn("bit_length", e) + + /** + * Concatenates multiple input string columns together into a single string column, using the + * given separator. + * + * @group string_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def concat_ws(sep: String, exprs: Column*): Column = + Column.fn("concat_ws", lit(sep) +: exprs: _*) + + /** + * Computes the first argument into a string from a binary using the provided character set (one + * of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). If either argument + * is null, the result will also be null. + * + * @group string_funcs + * @since 3.4.0 + */ + def decode(value: Column, charset: String): Column = + Column.fn("decode", value, lit(charset)) + + /** + * Computes the first argument into a binary from a string using the provided character set (one + * of 'US-ASCII', 'ISO-8859-1', 'UTF-8', 'UTF-16BE', 'UTF-16LE', 'UTF-16'). If either argument + * is null, the result will also be null. + * + * @group string_funcs + * @since 3.4.0 + */ + def encode(value: Column, charset: String): Column = + Column.fn("encode", value, lit(charset)) + + /** + * Formats numeric column x to a format like '#,###,###.##', rounded to d decimal places with + * HALF_EVEN round mode, and returns the result as a string column. + * + * If d is 0, the result has no decimal point or fractional part. If d is less than 0, the + * result will be null. 
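A sketch combining `concat_ws`, `xxhash64` and the `encode`/`base64` pair, under the same assumptions as the earlier sketches:

```scala
import org.apache.spark.sql.functions._

val df = spark.range(3).select(col("id"), (col("id") * 2).as("twice"))

df.select(
  concat_ws("-", col("id"), col("twice")).as("key"),     // e.g. "1-2"
  xxhash64(col("id"), col("twice")).as("bucket_hash"),   // 64-bit hash, seed 42
  base64(encode(lit("héllo"), "UTF-8")).as("b64"))       // string -> binary -> BASE64
  .show(truncate = false)
```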
+ * + * @group string_funcs + * @since 3.4.0 + */ + def format_number(x: Column, d: Int): Column = Column.fn("format_number", x, lit(d)) + + /** + * Formats the arguments in printf-style and returns the result as a string column. + * + * @group string_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def format_string(format: String, arguments: Column*): Column = + Column.fn("format_string", lit(format) +: arguments: _*) + + /** + * Returns a new string column by converting the first letter of each word to uppercase. Words + * are delimited by whitespace. + * + * For example, "hello world" will become "Hello World". + * + * @group string_funcs + * @since 3.4.0 + */ + def initcap(e: Column): Column = Column.fn("initcap", e) + + /** + * Locate the position of the first occurrence of substr column in the given string. Returns + * null if either of the arguments are null. + * + * @note + * The position is not zero based, but 1 based index. Returns 0 if substr could not be found + * in str. + * + * @group string_funcs + * @since 3.4.0 + */ + def instr(str: Column, substring: String): Column = Column.fn("instr", str, lit(substring)) + + /** + * Computes the character length of a given string or number of bytes of a binary string. The + * length of character strings include the trailing spaces. The length of binary strings + * includes binary zeros. + * + * @group string_funcs + * @since 3.4.0 + */ + def length(e: Column): Column = Column.fn("length", e) + + /** + * Converts a string column to lower case. + * + * @group string_funcs + * @since 3.4.0 + */ + def lower(e: Column): Column = Column.fn("lower", e) + + /** + * Computes the Levenshtein distance of the two given string columns. + * @group string_funcs + * @since 3.4.0 + */ + def levenshtein(l: Column, r: Column): Column = Column.fn("levenshtein", l, r) + + /** + * Locate the position of the first occurrence of substr. + * + * @note + * The position is not zero based, but 1 based index. Returns 0 if substr could not be found + * in str. + * + * @group string_funcs + * @since 3.4.0 + */ + def locate(substr: String, str: Column): Column = Column.fn("locate", lit(substr), str) + + /** + * Locate the position of the first occurrence of substr in a string column, after position pos. + * + * @note + * The position is not zero based, but 1 based index. returns 0 if substr could not be found + * in str. + * + * @group string_funcs + * @since 3.4.0 + */ + def locate(substr: String, str: Column, pos: Int): Column = + Column.fn("locate", lit(substr), str, lit(pos)) + + /** + * Left-pad the string column with pad to a length of len. If the string column is longer than + * len, the return value is shortened to len characters. + * + * @group string_funcs + * @since 3.4.0 + */ + def lpad(str: Column, len: Int, pad: String): Column = + Column.fn("lpad", str, lit(len), lit(pad)) + + /** + * Left-pad the binary column with pad to a byte length of len. If the binary column is longer + * than len, the return value is shortened to len bytes. + * + * @group string_funcs + * @since 3.4.0 + */ + def lpad(str: Column, len: Int, pad: Array[Byte]): Column = + Column.fn("lpad", str, lit(len), lit(pad)) + + /** + * Trim the spaces from left end for the specified string value. + * + * @group string_funcs + * @since 3.4.0 + */ + def ltrim(e: Column): Column = Column.fn("ltrim", e) + + /** + * Trim the specified character string from left end for the specified string column. 
+ * @group string_funcs + * @since 3.4.0 + */ + def ltrim(e: Column, trimString: String): Column = Column.fn("ltrim", e, lit(trimString)) + + /** + * Calculates the byte length for the specified string column. + * + * @group string_funcs + * @since 3.4.0 + */ + def octet_length(e: Column): Column = Column.fn("octet_length", e) + + /** + * Extract a specific group matched by a Java regex, from the specified string column. If the + * regex did not match, or the specified group did not match, an empty string is returned. if + * the specified group index exceeds the group count of regex, an IllegalArgumentException will + * be thrown. + * + * @group string_funcs + * @since 3.4.0 + */ + def regexp_extract(e: Column, exp: String, groupIdx: Int): Column = + Column.fn("regexp_extract", e, lit(exp), lit(groupIdx)) + + /** + * Replace all substrings of the specified string value that match regexp with rep. + * + * @group string_funcs + * @since 3.4.0 + */ + def regexp_replace(e: Column, pattern: String, replacement: String): Column = + regexp_replace(e, lit(pattern), lit(replacement)) + + /** + * Replace all substrings of the specified string value that match regexp with rep. + * + * @group string_funcs + * @since 3.4.0 + */ + def regexp_replace(e: Column, pattern: Column, replacement: Column): Column = + Column.fn("regexp_replace", e, pattern, replacement) + + /** + * Decodes a BASE64 encoded string column and returns it as a binary column. This is the reverse + * of base64. + * + * @group string_funcs + * @since 3.4.0 + */ + def unbase64(e: Column): Column = Column.fn("unbase64", e) + + /** + * Right-pad the string column with pad to a length of len. If the string column is longer than + * len, the return value is shortened to len characters. + * + * @group string_funcs + * @since 3.4.0 + */ + def rpad(str: Column, len: Int, pad: String): Column = + Column.fn("rpad", str, lit(len), lit(pad)) + + /** + * Right-pad the binary column with pad to a byte length of len. If the binary column is longer + * than len, the return value is shortened to len bytes. + * + * @group string_funcs + * @since 3.4.0 + */ + def rpad(str: Column, len: Int, pad: Array[Byte]): Column = + Column.fn("rpad", str, lit(len), lit(pad)) + + /** + * Repeats a string column n times, and returns it as a new string column. + * + * @group string_funcs + * @since 3.4.0 + */ + def repeat(str: Column, n: Int): Column = Column.fn("repeat", str, lit(n)) + + /** + * Trim the spaces from right end for the specified string value. + * + * @group string_funcs + * @since 3.4.0 + */ + def rtrim(e: Column): Column = Column.fn("rtrim", e) + + /** + * Trim the specified character string from right end for the specified string column. + * @group string_funcs + * @since 3.4.0 + */ + def rtrim(e: Column, trimString: String): Column = Column.fn("rtrim", e, lit(trimString)) + + /** + * Returns the soundex code for the specified expression. + * + * @group string_funcs + * @since 3.4.0 + */ + def soundex(e: Column): Column = Column.fn("soundex", e) + + /** + * Splits str around matches of the given pattern. + * + * @param str + * a string expression to split + * @param pattern + * a string representing a regular expression. The regex string should be a Java regular + * expression. + * + * @group string_funcs + * @since 3.4.0 + */ + def split(str: Column, pattern: String): Column = Column.fn("split", str, lit(pattern)) + + /** + * Splits str around matches of the given pattern. 
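A sketch of the regexp helpers next to `lpad` and `format_string`; the group index in `regexp_extract` refers to capture groups, with 0 meaning the whole match:

```scala
import org.apache.spark.sql.functions._

val df = spark.range(1).select(lit("order-00042").as("raw"))

df.select(
  regexp_extract(col("raw"), "order-(\\d+)", 1).as("digits"),      // "00042"
  regexp_replace(col("raw"), "-\\d+", "-REDACTED").as("masked"),   // "order-REDACTED"
  lpad(lit("42"), 5, "0").as("padded"),                            // "00042"
  format_string("id=%s", col("raw")).as("formatted"))              // "id=order-00042"
  .show(truncate = false)
```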
+ * + * @param str + * a string expression to split + * @param pattern + * a string representing a regular expression. The regex string should be a Java regular + * expression. + * @param limit + * an integer expression which controls the number of times the regex is applied.
+ *   <ul>
+ *   <li>limit greater than 0: The resulting array's length will not be more than limit, and the
+ *   resulting array's last entry will contain all input beyond the last matched regex.</li>
+ *   <li>limit less than or equal to 0: `regex` will be applied as many times as possible, and
+ *   the resulting array can be of any size.</li>
+ *   </ul>
+ * + * @group string_funcs + * @since 3.4.0 + */ + def split(str: Column, pattern: String, limit: Int): Column = + Column.fn("split", str, lit(pattern), lit(limit)) + + /** + * Substring starts at `pos` and is of length `len` when str is String type or returns the slice + * of byte array that starts at `pos` in byte and is of length `len` when str is Binary type. + * + * @note + * The position is not zero based, but 1 based index. + * + * @group string_funcs + * @since 3.4.0 + */ + def substring(str: Column, pos: Int, len: Int): Column = + Column.fn("substring", str, lit(pos), lit(len)) + + /** + * Returns the substring from string str before count occurrences of the delimiter delim. If + * count is positive, everything to the left of the final delimiter (counting from left) is + * returned. If count is negative, everything to the right of the final delimiter (counting from the + * right) is returned. substring_index performs a case-sensitive match when searching for delim. + * + * @group string_funcs + * @since 3.4.0 + */ + def substring_index(str: Column, delim: String, count: Int): Column = + Column.fn("substring_index", str, lit(delim), lit(count)) + + /** + * Overlay the specified portion of `src` with `replace`, starting from byte position `pos` of + * `src` and proceeding for `len` bytes. + * + * @group string_funcs + * @since 3.4.0 + */ + def overlay(src: Column, replace: Column, pos: Column, len: Column): Column = + Column.fn("overlay", src, replace, pos, len) + + /** + * Overlay the specified portion of `src` with `replace`, starting from byte position `pos` of + * `src`. + * + * @group string_funcs + * @since 3.4.0 + */ + def overlay(src: Column, replace: Column, pos: Column): Column = + Column.fn("overlay", src, replace, pos) + + /** + * Splits a string into arrays of sentences, where each sentence is an array of words. + * @group string_funcs + * @since 3.4.0 + */ + def sentences(string: Column, language: Column, country: Column): Column = + Column.fn("sentences", string, language, country) + + /** + * Splits a string into arrays of sentences, where each sentence is an array of words. The + * default locale is used. + * @group string_funcs + * @since 3.4.0 + */ + def sentences(string: Column): Column = Column.fn("sentences", string) + + /** + * Translate any character in the src by a character in replaceString. The characters in + * replaceString correspond to the characters in matchingString. The translate will happen when + * any character in the string matches the character in the `matchingString`. + * + * @group string_funcs + * @since 3.4.0 + */ + def translate(src: Column, matchingString: String, replaceString: String): Column = + Column.fn("translate", src, lit(matchingString), lit(replaceString)) + + /** + * Trim the spaces from both ends for the specified string column. + * + * @group string_funcs + * @since 3.4.0 + */ + def trim(e: Column): Column = Column.fn("trim", e) + + /** + * Trim the specified character from both ends for the specified string column. + * @group string_funcs + * @since 3.4.0 + */ + def trim(e: Column, trimString: String): Column = Column.fn("trim", e, lit(trimString)) + + /** + * Converts a string column to upper case. 
+ * + * @group string_funcs + * @since 3.4.0 + */ + def upper(e: Column): Column = Column.fn("upper", e) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // DateTime functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Returns the date that is `numMonths` after `startDate`. + * + * @param startDate + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param numMonths + * The number of months to add to `startDate`, can be negative to subtract months + * @return + * A date, or null if `startDate` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def add_months(startDate: Column, numMonths: Int): Column = + add_months(startDate, lit(numMonths)) + + /** + * Returns the date that is `numMonths` after `startDate`. + * + * @param startDate + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param numMonths + * A column of the number of months to add to `startDate`, can be negative to subtract months + * @return + * A date, or null if `startDate` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def add_months(startDate: Column, numMonths: Column): Column = + Column.fn("add_months", startDate, numMonths) + + /** + * Returns the current date at the start of query evaluation as a date column. All calls of + * current_date within the same query return the same value. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def current_date(): Column = Column.fn("current_date") + + /** + * Returns the current timestamp at the start of query evaluation as a timestamp column. All + * calls of current_timestamp within the same query return the same value. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def current_timestamp(): Column = Column.fn("current_timestamp") + + /** + * Returns the current timestamp without time zone at the start of query evaluation as a + * timestamp without time zone column. All calls of localtimestamp within the same query return + * the same value. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def localtimestamp(): Column = Column.fn("localtimestamp") + + /** + * Converts a date/timestamp/string to a value of string in the format specified by the date + * format given by the second argument. + * + * See Datetime + * Patterns for valid date and time format patterns + * + * @param dateExpr + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param format + * A pattern `dd.MM.yyyy` would return a string like `18.03.1993` + * @return + * A string, or null if `dateExpr` was a string that could not be cast to a timestamp + * @note + * Use specialized functions like [[year]] whenever possible as they benefit from a + * specialized implementation. + * @throws IllegalArgumentException + * if the `format` pattern is invalid + * @group datetime_funcs + * @since 3.4.0 + */ + def date_format(dateExpr: Column, format: String): Column = + Column.fn("date_format", dateExpr, lit(format)) + + /** + * Returns the date that is `days` days after `start` + * + * @param start + * A date, timestamp or string. 
If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param days + * The number of days to add to `start`, can be negative to subtract days + * @return + * A date, or null if `start` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def date_add(start: Column, days: Int): Column = date_add(start, lit(days)) + + /** + * Returns the date that is `days` days after `start` + * + * @param start + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param days + * A column of the number of days to add to `start`, can be negative to subtract days + * @return + * A date, or null if `start` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def date_add(start: Column, days: Column): Column = Column.fn("date_add", start, days) + + /** + * Returns the date that is `days` days before `start` + * + * @param start + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param days + * The number of days to subtract from `start`, can be negative to add days + * @return + * A date, or null if `start` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def date_sub(start: Column, days: Int): Column = date_sub(start, lit(days)) + + /** + * Returns the date that is `days` days before `start` + * + * @param start + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param days + * A column of the number of days to subtract from `start`, can be negative to add days + * @return + * A date, or null if `start` was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def date_sub(start: Column, days: Column): Column = + Column.fn("date_sub", start, days) + + /** + * Returns the number of days from `start` to `end`. + * + * Only considers the date part of the input. For example: + * {{{ + * datediff("2018-01-10 00:00:00", "2018-01-09 23:59:59") + * // returns 1 + * }}} + * + * @param end + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param start + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @return + * An integer, or null if either `end` or `start` were strings that could not be cast to a + * date. Negative if `end` is before `start` + * @group datetime_funcs + * @since 3.4.0 + */ + def datediff(end: Column, start: Column): Column = Column.fn("datediff", end, start) + + /** + * Extracts the year as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def year(e: Column): Column = Column.fn("year", e) + + /** + * Extracts the quarter as an integer from a given date/timestamp/string. 
+ * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def quarter(e: Column): Column = Column.fn("quarter", e) + + /** + * Extracts the month as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def month(e: Column): Column = Column.fn("month", e) + + /** + * Extracts the day of the week as an integer from a given date/timestamp/string. Ranges from 1 + * for a Sunday through to 7 for a Saturday + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def dayofweek(e: Column): Column = Column.fn("dayofweek", e) + + /** + * Extracts the day of the month as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def dayofmonth(e: Column): Column = Column.fn("dayofmonth", e) + + /** + * Extracts the day of the year as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def dayofyear(e: Column): Column = Column.fn("dayofyear", e) + + /** + * Extracts the hours as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def hour(e: Column): Column = Column.fn("hour", e) + + /** + * Returns the last day of the month which the given date belongs to. For example, input + * "2015-07-27" returns "2015-07-31" since July 31 is the last day of the month in July 2015. + * + * @param e + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @return + * A date, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def last_day(e: Column): Column = Column.fn("last_day", e) + + /** + * Extracts the minutes as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def minute(e: Column): Column = Column.fn("minute", e) + + /** + * @return + * A date created from year, month and day fields. + * @group datetime_funcs + * @since 3.4.0 + */ + def make_date(year: Column, month: Column, day: Column): Column = + Column.fn("make_date", year, month, day) + + /** + * Returns number of months between dates `start` and `end`. + * + * A whole number is returned if both inputs have the same day of month or both are the last day + * of their respective months. Otherwise, the difference is calculated assuming 31 days per + * month. + * + * For example: + * {{{ + * months_between("2017-11-14", "2017-07-14") // returns 4.0 + * months_between("2017-01-01", "2017-01-10") // returns 0.29032258 + * months_between("2017-06-01", "2017-06-16 12:00:00") // returns -0.5 + * }}} + * + * @param end + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param start + * A date, timestamp or string. 
If a string, the data must be in a format that can cast to a + * timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @return + * A double, or null if either `end` or `start` were strings that could not be cast to a + * timestamp. Negative if `end` is before `start` + * @group datetime_funcs + * @since 3.4.0 + */ + def months_between(end: Column, start: Column): Column = + Column.fn("months_between", end, start) + + /** + * Returns number of months between dates `end` and `start`. If `roundOff` is set to true, the + * result is rounded off to 8 digits; it is not rounded otherwise. + * @group datetime_funcs + * @since 3.4.0 + */ + def months_between(end: Column, start: Column, roundOff: Boolean): Column = + Column.fn("months_between", end, start, lit(roundOff)) + + /** + * Returns the first date which is later than the value of the `date` column that is on the + * specified day of the week. + * + * For example, `next_day('2015-07-27', "Sunday")` returns 2015-08-02 because that is the first + * Sunday after 2015-07-27. + * + * @param date + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param dayOfWeek + * Case insensitive, and accepts: "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun" + * @return + * A date, or null if `date` was a string that could not be cast to a date or if `dayOfWeek` + * was an invalid value + * @group datetime_funcs + * @since 3.4.0 + */ + def next_day(date: Column, dayOfWeek: String): Column = next_day(date, lit(dayOfWeek)) + + /** + * Returns the first date which is later than the value of the `date` column that is on the + * specified day of the week. + * + * For example, `next_day('2015-07-27', "Sunday")` returns 2015-08-02 because that is the first + * Sunday after 2015-07-27. + * + * @param date + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param dayOfWeek + * A column of the day of week. Case insensitive, and accepts: "Mon", "Tue", "Wed", "Thu", + * "Fri", "Sat", "Sun" + * @return + * A date, or null if `date` was a string that could not be cast to a date or if `dayOfWeek` + * was an invalid value + * @group datetime_funcs + * @since 3.4.0 + */ + def next_day(date: Column, dayOfWeek: Column): Column = + Column.fn("next_day", date, dayOfWeek) + + /** + * Extracts the seconds as an integer from a given date/timestamp/string. + * @return + * An integer, or null if the input was a string that could not be cast to a timestamp + * @group datetime_funcs + * @since 3.4.0 + */ + def second(e: Column): Column = Column.fn("second", e) + + /** + * Extracts the week number as an integer from a given date/timestamp/string. + * + * A week is considered to start on a Monday and week 1 is the first week with more than 3 days, + * as defined by ISO 8601 + * + * @return + * An integer, or null if the input was a string that could not be cast to a date + * @group datetime_funcs + * @since 3.4.0 + */ + def weekofyear(e: Column): Column = Column.fn("weekofyear", e) + + /** + * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string + * representing the timestamp of that moment in the current system time zone in the yyyy-MM-dd + * HH:mm:ss format. + * + * @param ut + * A number of a type that is castable to a long, such as string or integer. 
Can be negative + * for timestamps before the unix epoch + * @return + * A string, or null if the input was a string that could not be cast to a long + * @group datetime_funcs + * @since 3.4.0 + */ + def from_unixtime(ut: Column): Column = Column.fn("from_unixtime", ut) + + /** + * Converts the number of seconds from unix epoch (1970-01-01 00:00:00 UTC) to a string + * representing the timestamp of that moment in the current system time zone in the given + * format. + * + * See Datetime + * Patterns for valid date and time format patterns + * + * @param ut + * A number of a type that is castable to a long, such as string or integer. Can be negative + * for timestamps before the unix epoch + * @param f + * A date time pattern that the input will be formatted to + * @return + * A string, or null if `ut` was a string that could not be cast to a long or `f` was an + * invalid date time pattern + * @group datetime_funcs + * @since 3.4.0 + */ + def from_unixtime(ut: Column, f: String): Column = + Column.fn("from_unixtime", ut, lit(f)) + + /** + * Returns the current Unix timestamp (in seconds) as a long. + * + * @note + * All calls of `unix_timestamp` within the same query return the same value (i.e. the current + * timestamp is calculated at the start of query evaluation). + * + * @group datetime_funcs + * @since 3.4.0 + */ + def unix_timestamp(): Column = unix_timestamp(current_timestamp()) + + /** + * Converts time string in format yyyy-MM-dd HH:mm:ss to Unix timestamp (in seconds), using the + * default timezone and the default locale. + * + * @param s + * A date, timestamp or string. If a string, the data must be in the `yyyy-MM-dd HH:mm:ss` + * format + * @return + * A long, or null if the input was a string not of the correct format + * @group datetime_funcs + * @since 3.4.0 + */ + def unix_timestamp(s: Column): Column = Column.fn("unix_timestamp", s) + + /** + * Converts time string with given pattern to Unix timestamp (in seconds). + * + * See Datetime + * Patterns for valid date and time format patterns + * + * @param s + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param p + * A date time pattern detailing the format of `s` when `s` is a string + * @return + * A long, or null if `s` was a string that could not be cast to a date or `p` was an invalid + * format + * @group datetime_funcs + * @since 3.4.0 + */ + def unix_timestamp(s: Column, p: String): Column = + Column.fn("unix_timestamp", s, lit(p)) + + /** + * Converts to a timestamp by casting rules to `TimestampType`. + * + * @param s + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @return + * A timestamp, or null if the input was a string that could not be cast to a timestamp + * @group datetime_funcs + * @since 3.4.0 + */ + def to_timestamp(s: Column): Column = Column.fn("to_timestamp", s) + + /** + * Converts time string with the given pattern to timestamp. + * + * See Datetime + * Patterns for valid date and time format patterns + * + * @param s + * A date, timestamp or string. 
If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param fmt + * A date time pattern detailing the format of `s` when `s` is a string + * @return + * A timestamp, or null if `s` was a string that could not be cast to a timestamp or `fmt` was + * an invalid format + * @group datetime_funcs + * @since 3.4.0 + */ + def to_timestamp(s: Column, fmt: String): Column = Column.fn("to_timestamp", s, lit(fmt)) + + /** + * Converts the column into `DateType` by casting rules to `DateType`. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def to_date(e: Column): Column = Column.fn("to_date", e) + + /** + * Converts the column into a `DateType` with a specified format + * + * See Datetime + * Patterns for valid date and time format patterns + * + * @param e + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param fmt + * A date time pattern detailing the format of `e` when `e`is a string + * @return + * A date, or null if `e` was a string that could not be cast to a date or `fmt` was an + * invalid format + * @group datetime_funcs + * @since 3.4.0 + */ + def to_date(e: Column, fmt: String): Column = Column.fn("to_date", e, lit(fmt)) + + /** + * Returns date truncated to the unit specified by the format. + * + * For example, `trunc("2018-11-19 12:01:19", "year")` returns 2018-01-01 + * + * @param date + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a date, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param format: + * 'year', 'yyyy', 'yy' to truncate by year, or 'month', 'mon', 'mm' to truncate by month + * Other options are: 'week', 'quarter' + * + * @return + * A date, or null if `date` was a string that could not be cast to a date or `format` was an + * invalid value + * @group datetime_funcs + * @since 3.4.0 + */ + def trunc(date: Column, format: String): Column = Column.fn("trunc", date, lit(format)) + + /** + * Returns timestamp truncated to the unit specified by the format. + * + * For example, `date_trunc("year", "2018-11-19 12:01:19")` returns 2018-01-01 00:00:00 + * + * @param format: + * 'year', 'yyyy', 'yy' to truncate by year, 'month', 'mon', 'mm' to truncate by month, 'day', + * 'dd' to truncate by day, Other options are: 'microsecond', 'millisecond', 'second', + * 'minute', 'hour', 'week', 'quarter' + * @param timestamp + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @return + * A timestamp, or null if `timestamp` was a string that could not be cast to a timestamp or + * `format` was an invalid value + * @group datetime_funcs + * @since 3.4.0 + */ + def date_trunc(format: String, timestamp: Column): Column = + Column.fn("date_trunc", lit(format), timestamp) + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders + * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 + * 03:40:00.0'. + * + * @param ts + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param tz + * A string detailing the time zone ID that the input should be adjusted to. 
It should be in + * the format of either region-based zone IDs or zone offsets. Region IDs must have the form + * 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in the format + * '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported as aliases + * of '+00:00'. Other short names are not recommended to use because they can be ambiguous. + * @return + * A timestamp, or null if `ts` was a string that could not be cast to a timestamp or `tz` was + * an invalid value + * @group datetime_funcs + * @since 3.4.0 + */ + def from_utc_timestamp(ts: Column, tz: String): Column = from_utc_timestamp(ts, lit(tz)) + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in UTC, and renders + * that time as a timestamp in the given time zone. For example, 'GMT+1' would yield '2017-07-14 + * 03:40:00.0'. + * @group datetime_funcs + * @since 3.4.0 + */ + def from_utc_timestamp(ts: Column, tz: Column): Column = + Column.fn("from_utc_timestamp", ts, tz) + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time + * zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield + * '2017-07-14 01:40:00.0'. + * + * @param ts + * A date, timestamp or string. If a string, the data must be in a format that can be cast to + * a timestamp, such as `yyyy-MM-dd` or `yyyy-MM-dd HH:mm:ss.SSSS` + * @param tz + * A string detailing the time zone ID that the input should be adjusted to. It should be in + * the format of either region-based zone IDs or zone offsets. Region IDs must have the form + * 'area/city', such as 'America/Los_Angeles'. Zone offsets must be in the format + * '(+|-)HH:mm', for example '-08:00' or '+01:00'. Also 'UTC' and 'Z' are supported as aliases + * of '+00:00'. Other short names are not recommended to use because they can be ambiguous. + * @return + * A timestamp, or null if `ts` was a string that could not be cast to a timestamp or `tz` was + * an invalid value + * @group datetime_funcs + * @since 3.4.0 + */ + def to_utc_timestamp(ts: Column, tz: String): Column = to_utc_timestamp(ts, lit(tz)) + + /** + * Given a timestamp like '2017-07-14 02:40:00.0', interprets it as a time in the given time + * zone, and renders that time as a timestamp in UTC. For example, 'GMT+1' would yield + * '2017-07-14 01:40:00.0'. + * @group datetime_funcs + * @since 3.4.0 + */ + def to_utc_timestamp(ts: Column, tz: Column): Column = Column.fn("to_utc_timestamp", ts, tz) + + /** + * Bucketize rows into one or more time windows given a timestamp specifying column. Window + * starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window + * [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in + * the order of months are not supported. The following example takes the average stock price + * for a one minute window every 10 seconds starting 5 seconds after the hour: + * + * {{{ + * val df = ... // schema => timestamp: TimestampType, stockId: StringType, price: DoubleType + * df.groupBy(window($"timestamp", "1 minute", "10 seconds", "5 seconds"), $"stockId") + * .agg(mean("price")) + * }}} + * + * The windows will look like: + * + * {{{ + * 09:00:05-09:01:05 + * 09:00:15-09:01:15 + * 09:00:25-09:01:25 ... + * }}} + * + * For a streaming query, you may use the function `current_timestamp` to generate windows on + * processing time. 
+ * + * @param timeColumn + * The column or the expression to use as the timestamp for windowing by time. The time column + * must be of TimestampType or TimestampNTZType. + * @param windowDuration + * A string specifying the width of the window, e.g. `10 minutes`, `1 second`. Check + * `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. Note that + * the duration is a fixed length of time, and does not vary over time according to a + * calendar. For example, `1 day` always means 86,400,000 milliseconds, not a calendar day. + * @param slideDuration + * A string specifying the sliding interval of the window, e.g. `1 minute`. A new window will + * be generated every `slideDuration`. Must be less than or equal to the `windowDuration`. + * Check `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. This + * duration is likewise absolute, and does not vary according to a calendar. + * @param startTime + * The offset with respect to 1970-01-01 00:00:00 UTC with which to start window intervals. + * For example, in order to have hourly tumbling windows that start 15 minutes past the hour, + * e.g. 12:15-13:15, 13:15-14:15... provide `startTime` as `15 minutes`. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def window( + timeColumn: Column, + windowDuration: String, + slideDuration: String, + startTime: String): Column = + Column.fn("window", timeColumn, lit(windowDuration), lit(slideDuration), lit(startTime)) + + /** + * Bucketize rows into one or more time windows given a timestamp specifying column. Window + * starts are inclusive but the window ends are exclusive, e.g. 12:05 will be in the window + * [12:05,12:10) but not in [12:00,12:05). Windows can support microsecond precision. Windows in + * the order of months are not supported. The windows start beginning at 1970-01-01 00:00:00 + * UTC. The following example takes the average stock price for a one minute window every 10 + * seconds: + * + * {{{ + * val df = ... // schema => timestamp: TimestampType, stockId: StringType, price: DoubleType + * df.groupBy(window($"timestamp", "1 minute", "10 seconds"), $"stockId") + * .agg(mean("price")) + * }}} + * + * The windows will look like: + * + * {{{ + * 09:00:00-09:01:00 + * 09:00:10-09:01:10 + * 09:00:20-09:01:20 ... + * }}} + * + * For a streaming query, you may use the function `current_timestamp` to generate windows on + * processing time. + * + * @param timeColumn + * The column or the expression to use as the timestamp for windowing by time. The time column + * must be of TimestampType or TimestampNTZType. + * @param windowDuration + * A string specifying the width of the window, e.g. `10 minutes`, `1 second`. Check + * `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. Note that + * the duration is a fixed length of time, and does not vary over time according to a + * calendar. For example, `1 day` always means 86,400,000 milliseconds, not a calendar day. + * @param slideDuration + * A string specifying the sliding interval of the window, e.g. `1 minute`. A new window will + * be generated every `slideDuration`. Must be less than or equal to the `windowDuration`. + * Check `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. This + * duration is likewise absolute, and does not vary according to a calendar. 
+ * + * @group datetime_funcs + * @since 3.4.0 + */ + def window(timeColumn: Column, windowDuration: String, slideDuration: String): Column = { + window(timeColumn, windowDuration, slideDuration, "0 second") + } + + /** + * Generates tumbling time windows given a timestamp specifying column. Window starts are + * inclusive but the window ends are exclusive, e.g. 12:05 will be in the window [12:05,12:10) + * but not in [12:00,12:05). Windows can support microsecond precision. Windows in the order of + * months are not supported. The windows start beginning at 1970-01-01 00:00:00 UTC. The + * following example takes the average stock price for a one minute tumbling window: + * + * {{{ + * val df = ... // schema => timestamp: TimestampType, stockId: StringType, price: DoubleType + * df.groupBy(window($"timestamp", "1 minute"), $"stockId") + * .agg(mean("price")) + * }}} + * + * The windows will look like: + * + * {{{ + * 09:00:00-09:01:00 + * 09:01:00-09:02:00 + * 09:02:00-09:03:00 ... + * }}} + * + * For a streaming query, you may use the function `current_timestamp` to generate windows on + * processing time. + * + * @param timeColumn + * The column or the expression to use as the timestamp for windowing by time. The time column + * must be of TimestampType or TimestampNTZType. + * @param windowDuration + * A string specifying the width of the window, e.g. `10 minutes`, `1 second`. Check + * `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def window(timeColumn: Column, windowDuration: String): Column = { + window(timeColumn, windowDuration, windowDuration, "0 second") + } + + /** + * Extracts the event time from the window column. + * + * The window column is of StructType { start: Timestamp, end: Timestamp } where start is + * inclusive and end is exclusive. Since event time can support microsecond precision, + * window_time(window) = window.end - 1 microsecond. + * + * @param windowColumn + * The window column (typically produced by window aggregation) of type StructType { start: + * Timestamp, end: Timestamp } + * + * @group datetime_funcs + * @since 3.4.0 + */ + def window_time(windowColumn: Column): Column = Column.fn("window_time", windowColumn) + + /** + * Generates session window given a timestamp specifying column. + * + * Session window is one of dynamic windows, which means the length of window is varying + * according to the given inputs. The length of session window is defined as "the timestamp of + * latest input of the session + gap duration", so when the new inputs are bound to the current + * session window, the end time of session window can be expanded according to the new inputs. + * + * Windows can support microsecond precision. gapDuration in the order of months are not + * supported. + * + * For a streaming query, you may use the function `current_timestamp` to generate windows on + * processing time. + * + * @param timeColumn + * The column or the expression to use as the timestamp for windowing by time. The time column + * must be of TimestampType or TimestampNTZType. + * @param gapDuration + * A string specifying the timeout of the session, e.g. `10 minutes`, `1 second`. Check + * `org.apache.spark.unsafe.types.CalendarInterval` for valid duration identifiers. 
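+ * For example (illustrative; the DataFrame `df` with `timestamp` and `userId` columns is
+ * assumed):
+ * {{{
+ *   // `df`, `timestamp` and `userId` are hypothetical
+ *   df.groupBy(session_window($"timestamp", "10 minutes"), $"userId")
+ *     .count()
+ * }}}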
+ * + * @group datetime_funcs + * @since 3.4.0 + */ + def session_window(timeColumn: Column, gapDuration: String): Column = + session_window(timeColumn, lit(gapDuration)) + + /** + * Generates session window given a timestamp specifying column. + * + * Session window is one of dynamic windows, which means the length of window is varying + * according to the given inputs. For static gap duration, the length of session window is + * defined as "the timestamp of latest input of the session + gap duration", so when the new + * inputs are bound to the current session window, the end time of session window can be + * expanded according to the new inputs. + * + * Besides a static gap duration value, users can also provide an expression to specify gap + * duration dynamically based on the input row. With dynamic gap duration, the closing of a + * session window does not depend on the latest input anymore. A session window's range is the + * union of all events' ranges which are determined by event start time and evaluated gap + * duration during the query execution. Note that the rows with negative or zero gap duration + * will be filtered out from the aggregation. + * + * Windows can support microsecond precision. gapDuration in the order of months are not + * supported. + * + * For a streaming query, you may use the function `current_timestamp` to generate windows on + * processing time. + * + * @param timeColumn + * The column or the expression to use as the timestamp for windowing by time. The time column + * must be of TimestampType or TimestampNTZType. + * @param gapDuration + * A column specifying the timeout of the session. It could be static value, e.g. `10 + * minutes`, `1 second`, or an expression/UDF that specifies gap duration dynamically based on + * the input row. + * + * @group datetime_funcs + * @since 3.4.0 + */ + def session_window(timeColumn: Column, gapDuration: Column): Column = + Column.fn("session_window", timeColumn, gapDuration).as("session_window") + + /** + * Converts the number of seconds from the Unix epoch (1970-01-01T00:00:00Z) to a timestamp. + * @group datetime_funcs + * @since 3.4.0 + */ + def timestamp_seconds(e: Column): Column = Column.fn("timestamp_seconds", e) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Collection functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * Returns null if the array is null, true if the array contains `value`, and false otherwise. + * @group collection_funcs + * @since 3.4.0 + */ + def array_contains(column: Column, value: Any): Column = + Column.fn("array_contains", column, lit(value)) + + /** + * Returns an ARRAY containing all elements from the source ARRAY as well as the new element. + * The new element/column is located at end of the ARRAY. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_append(column: Column, element: Any): Column = + Column.fn("array_append", column, lit(element)) + + /** + * Returns `true` if `a1` and `a2` have at least one non-null element in common. If not and both + * the arrays are non-empty and any of them contains a `null`, it returns `null`. It returns + * `false` otherwise. 
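+ * For example (illustrative; `df` and its array columns `a1` and `a2` are assumed), following
+ * the semantics above:
+ * {{{
+ *   // `df`, `a1` and `a2` are hypothetical
+ *   df.select(arrays_overlap(col("a1"), col("a2")))
+ *   // [1, 2] vs [2, 3]    -> true
+ *   // [1, null] vs [3, 4] -> null
+ *   // [1, 2] vs [3, 4]    -> false
+ * }}}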
+ * @group collection_funcs + * @since 3.4.0 + */ + def arrays_overlap(a1: Column, a2: Column): Column = Column.fn("arrays_overlap", a1, a2) + + /** + * Returns an array containing all the elements in `x` from index `start` (or starting from the + * end if `start` is negative) with the specified `length`. + * + * @param x + * the array column to be sliced + * @param start + * the starting index + * @param length + * the length of the slice + * + * @group collection_funcs + * @since 3.4.0 + */ + def slice(x: Column, start: Int, length: Int): Column = + slice(x, lit(start), lit(length)) + + /** + * Returns an array containing all the elements in `x` from index `start` (or starting from the + * end if `start` is negative) with the specified `length`. + * + * @param x + * the array column to be sliced + * @param start + * the starting index + * @param length + * the length of the slice + * + * @group collection_funcs + * @since 3.4.0 + */ + def slice(x: Column, start: Column, length: Column): Column = + Column.fn("slice", x, start, length) + + /** + * Concatenates the elements of `column` using the `delimiter`. Null values are replaced with + * `nullReplacement`. + * @group collection_funcs + * @since 3.4.0 + */ + def array_join(column: Column, delimiter: String, nullReplacement: String): Column = + Column.fn("array_join", column, lit(delimiter), lit(nullReplacement)) + + /** + * Concatenates the elements of `column` using the `delimiter`. + * @group collection_funcs + * @since 3.4.0 + */ + def array_join(column: Column, delimiter: String): Column = + Column.fn("array_join", column, lit(delimiter)) + + /** + * Concatenates multiple input columns together into a single column. The function works with + * strings, binary and compatible array columns. + * + * @group collection_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def concat(exprs: Column*): Column = Column.fn("concat", exprs: _*) + + /** + * Locates the position of the first occurrence of the value in the given array as long. Returns + * null if either of the arguments are null. + * + * @note + * The position is not zero based, but 1 based index. Returns 0 if value could not be found in + * array. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_position(column: Column, value: Any): Column = + Column.fn("array_position", column, lit(value)) + + /** + * Returns element of array at given index in value if column is array. Returns value for the + * given key in value if column is map. + * + * @group collection_funcs + * @since 3.4.0 + */ + def element_at(column: Column, value: Any): Column = Column.fn("element_at", column, lit(value)) + + /** + * Returns element of array at given (0-based) index. If the index points outside of the array + * boundaries, then this function returns NULL. + * + * @group collection_funcs + * @since 3.4.0 + */ + def get(column: Column, index: Column): Column = Column.fn("get", column, index) + + /** + * Sorts the input array in ascending order. The elements of the input array must be orderable. + * NaN is greater than any non-NaN elements for double/float type. Null elements will be placed + * at the end of the returned array. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_sort(e: Column): Column = Column.fn("array_sort", e) + + /** + * Sorts the input array based on the given comparator function. The comparator will take two + * arguments representing two elements of the array. 
It returns a negative integer, 0, or a + * positive integer as the first element is less than, equal to, or greater than the second + * element. If the comparator function returns null, the function will fail and raise an error. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_sort(e: Column, comparator: (Column, Column) => Column): Column = + Column.fn("array_sort", e, createLambda(comparator)) + + /** + * Remove all elements that equal to element from the given array. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_remove(column: Column, element: Any): Column = + Column.fn("array_remove", column, lit(element)) + + /** + * Remove all null elements from the given array. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_compact(column: Column): Column = Column.fn("array_compact", column) + + /** + * Removes duplicate values from the array. + * @group collection_funcs + * @since 3.4.0 + */ + def array_distinct(e: Column): Column = Column.fn("array_distinct", e) + + /** + * Returns an array of the elements in the intersection of the given two arrays, without + * duplicates. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_intersect(col1: Column, col2: Column): Column = + Column.fn("array_intersect", col1, col2) + + /** + * Adds an item into a given array at a specified position + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_insert(arr: Column, pos: Column, value: Column): Column = + Column.fn("array_insert", arr, pos, value) + + /** + * Returns an array of the elements in the union of the given two arrays, without duplicates. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_union(col1: Column, col2: Column): Column = + Column.fn("array_union", col1, col2) + + /** + * Returns an array of the elements in the first array but not in the second array, without + * duplicates. 
The order of elements in the result is not determined + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_except(col1: Column, col2: Column): Column = + Column.fn("array_except", col1, col2) + + private def newLambdaVariable(name: String): proto.Expression.UnresolvedNamedLambdaVariable = { + proto.Expression.UnresolvedNamedLambdaVariable + .newBuilder() + .addNameParts(name) + .build() + } + + private def toLambdaVariableColumn( + v: proto.Expression.UnresolvedNamedLambdaVariable): Column = { + Column(_.setUnresolvedNamedLambdaVariable(v)) + } + + private def createLambda(f: Column => Column): Column = Column { builder => + val x = newLambdaVariable("x") + val function = f(toLambdaVariableColumn(x)) + builder.getLambdaFunctionBuilder + .setFunction(function.expr) + .addArguments(x) + } + + private def createLambda(f: (Column, Column) => Column) = Column { builder => + val x = newLambdaVariable("x") + val y = newLambdaVariable("y") + val function = f(toLambdaVariableColumn(x), toLambdaVariableColumn(y)) + builder.getLambdaFunctionBuilder + .setFunction(function.expr) + .addArguments(x) + .addArguments(y) + } + + private def createLambda(f: (Column, Column, Column) => Column) = Column { builder => + val x = newLambdaVariable("x") + val y = newLambdaVariable("y") + val z = newLambdaVariable("z") + val function = + f(toLambdaVariableColumn(x), toLambdaVariableColumn(y), toLambdaVariableColumn(z)) + builder.getLambdaFunctionBuilder + .setFunction(function.expr) + .addArguments(x) + .addArguments(y) + .addArguments(z) + } + + /** + * Returns an array of elements after applying a transformation to each element in the input + * array. + * {{{ + * df.select(transform(col("i"), x => x + 1)) + * }}} + * + * @param column + * the input array column + * @param f + * col => transformed_col, the lambda function to transform the input column + * + * @group collection_funcs + * @since 3.4.0 + */ + def transform(column: Column, f: Column => Column): Column = + Column.fn("transform", column, createLambda(f)) + + /** + * Returns an array of elements after applying a transformation to each element in the input + * array. + * {{{ + * df.select(transform(col("i"), (x, i) => x + i)) + * }}} + * + * @param column + * the input array column + * @param f + * (col, index) => transformed_col, the lambda function to filter the input column given the + * index. Indices start at 0. + * + * @group collection_funcs + * @since 3.4.0 + */ + def transform(column: Column, f: (Column, Column) => Column): Column = + Column.fn("transform", column, createLambda(f)) + + /** + * Returns whether a predicate holds for one or more elements in the array. + * {{{ + * df.select(exists(col("i"), _ % 2 === 0)) + * }}} + * + * @param column + * the input array column + * @param f + * col => predicate, the Boolean predicate to check the input column + * + * @group collection_funcs + * @since 3.4.0 + */ + def exists(column: Column, f: Column => Column): Column = + Column.fn("exists", column, createLambda(f)) + + /** + * Returns whether a predicate holds for every element in the array. + * {{{ + * df.select(forall(col("i"), x => x % 2 === 0)) + * }}} + * + * @param column + * the input array column + * @param f + * col => predicate, the Boolean predicate to check the input column + * + * @group collection_funcs + * @since 3.4.0 + */ + def forall(column: Column, f: Column => Column): Column = + Column.fn("forall", column, createLambda(f)) + + /** + * Returns an array of elements for which a predicate holds in a given array. 
+ * {{{ + * df.select(filter(col("s"), x => x % 2 === 0)) + * }}} + * + * @param column + * the input array column + * @param f + * col => predicate, the Boolean predicate to filter the input column + * + * @group collection_funcs + * @since 3.4.0 + */ + def filter(column: Column, f: Column => Column): Column = + Column.fn("filter", column, createLambda(f)) + + /** + * Returns an array of elements for which a predicate holds in a given array. + * {{{ + * df.select(filter(col("s"), (x, i) => i % 2 === 0)) + * }}} + * + * @param column + * the input array column + * @param f + * (col, index) => predicate, the Boolean predicate to filter the input column given the + * index. Indices start at 0. + * + * @group collection_funcs + * @since 3.4.0 + */ + def filter(column: Column, f: (Column, Column) => Column): Column = + Column.fn("filter", column, createLambda(f)) + + /** + * Applies a binary operator to an initial state and all elements in the array, and reduces this + * to a single state. The final state is converted into the final result by applying a finish + * function. + * {{{ + * df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x, _ * 10)) + * }}} + * + * @param expr + * the input array column + * @param initialValue + * the initial value + * @param merge + * (combined_value, input_value) => combined_value, the merge function to merge an input value + * to the combined_value + * @param finish + * combined_value => final_value, the lambda function to convert the combined value of all + * inputs to final result + * + * @group collection_funcs + * @since 3.4.0 + */ + def aggregate( + expr: Column, + initialValue: Column, + merge: (Column, Column) => Column, + finish: Column => Column): Column = + Column.fn("aggregate", expr, initialValue, createLambda(merge), createLambda(finish)) + + /** + * Applies a binary operator to an initial state and all elements in the array, and reduces this + * to a single state. + * {{{ + * df.select(aggregate(col("i"), lit(0), (acc, x) => acc + x)) + * }}} + * + * @param expr + * the input array column + * @param initialValue + * the initial value + * @param merge + * (combined_value, input_value) => combined_value, the merge function to merge an input value + * to the combined_value + * @group collection_funcs + * @since 3.4.0 + */ + def aggregate(expr: Column, initialValue: Column, merge: (Column, Column) => Column): Column = + aggregate(expr, initialValue, merge, c => c) + + /** + * Merge two given arrays, element-wise, into a single array using a function. If one array is + * shorter, nulls are appended at the end to match the length of the longer array, before + * applying the function. + * {{{ + * df.select(zip_with(df1("val1"), df1("val2"), (x, y) => x + y)) + * }}} + * + * @param left + * the left input array column + * @param right + * the right input array column + * @param f + * (lCol, rCol) => col, the lambda function to merge two input columns into one column + * + * @group collection_funcs + * @since 3.4.0 + */ + def zip_with(left: Column, right: Column, f: (Column, Column) => Column): Column = + Column.fn("zip_with", left, right, createLambda(f)) + + /** + * Applies a function to every key-value pair in a map and returns a map with the results of + * those applications as the new keys for the pairs. 
+ * {{{ + * df.select(transform_keys(col("i"), (k, v) => k + v)) + * }}} + * + * @param expr + * the input map column + * @param f + * (key, value) => new_key, the lambda function to transform the key of input map column + * + * @group collection_funcs + * @since 3.4.0 + */ + def transform_keys(expr: Column, f: (Column, Column) => Column): Column = + Column.fn("transform_keys", expr, createLambda(f)) + + /** + * Applies a function to every key-value pair in a map and returns a map with the results of + * those applications as the new values for the pairs. + * {{{ + * df.select(transform_values(col("i"), (k, v) => k + v)) + * }}} + * + * @param expr + * the input map column + * @param f + * (key, value) => new_value, the lambda function to transform the value of input map column + * + * @group collection_funcs + * @since 3.4.0 + */ + def transform_values(expr: Column, f: (Column, Column) => Column): Column = + Column.fn("transform_values", expr, createLambda(f)) + + /** + * Returns a map whose key-value pairs satisfy a predicate. + * {{{ + * df.select(map_filter(col("m"), (k, v) => k * 10 === v)) + * }}} + * + * @param expr + * the input map column + * @param f + * (key, value) => predicate, the Boolean predicate to filter the input map column + * + * @group collection_funcs + * @since 3.4.0 + */ + def map_filter(expr: Column, f: (Column, Column) => Column): Column = + Column.fn("map_filter", expr, createLambda(f)) + + /** + * Merge two given maps, key-wise into a single map using a function. + * {{{ + * df.select(map_zip_with(df("m1"), df("m2"), (k, v1, v2) => k === v1 + v2)) + * }}} + * + * @param left + * the left input map column + * @param right + * the right input map column + * @param f + * (key, value1, value2) => new_value, the lambda function to merge the map values + * + * @group collection_funcs + * @since 3.4.0 + */ + def map_zip_with(left: Column, right: Column, f: (Column, Column, Column) => Column): Column = + Column.fn("map_zip_with", left, right, createLambda(f)) + + /** + * Creates a new row for each element in the given array or map column. Uses the default column + * name `col` for elements in the array and `key` and `value` for elements in the map unless + * specified otherwise. + * + * @group collection_funcs + * @since 3.4.0 + */ + def explode(e: Column): Column = Column.fn("explode", e) + + /** + * Creates a new row for each element in the given array or map column. Uses the default column + * name `col` for elements in the array and `key` and `value` for elements in the map unless + * specified otherwise. Unlike explode, if the array/map is null or empty then null is produced. + * + * @group collection_funcs + * @since 3.4.0 + */ + def explode_outer(e: Column): Column = Column.fn("explode_outer", e) + + /** + * Creates a new row for each element with position in the given array or map column. Uses the + * default column name `pos` for position, and `col` for elements in the array and `key` and + * `value` for elements in the map unless specified otherwise. + * + * @group collection_funcs + * @since 3.4.0 + */ + def posexplode(e: Column): Column = Column.fn("posexplode", e) + + /** + * Creates a new row for each element with position in the given array or map column. Uses the + * default column name `pos` for position, and `col` for elements in the array and `key` and + * `value` for elements in the map unless specified otherwise. Unlike posexplode, if the + * array/map is null or empty then the row (null, null) is produced. 
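+ * For example (illustrative; the DataFrame `df` with an `id` column and a map column `m` is
+ * assumed):
+ * {{{
+ *   // `df`, `id` and `m` are hypothetical
+ *   df.select($"id", posexplode_outer($"m"))
+ * }}}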
+ * + * @group collection_funcs + * @since 3.4.0 + */ + def posexplode_outer(e: Column): Column = Column.fn("posexplode_outer", e) + + /** + * Creates a new row for each element in the given array of structs. + * + * @group collection_funcs + * @since 3.4.0 + */ + def inline(e: Column): Column = Column.fn("inline", e) + + /** + * Creates a new row for each element in the given array of structs. Unlike inline, if the array + * is null or empty then null is produced for each nested column. + * + * @group collection_funcs + * @since 3.4.0 + */ + def inline_outer(e: Column): Column = Column.fn("inline_outer", e) + + /** + * Extracts json object from a json string based on json path specified, and returns json string + * of the extracted json object. It will return null if the input json string is invalid. + * + * @group collection_funcs + * @since 3.4.0 + */ + def get_json_object(e: Column, path: String): Column = + Column.fn("get_json_object", e, lit(path)) + + /** + * Creates a new row for a json column according to the given field names. + * + * @group collection_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def json_tuple(json: Column, fields: String*): Column = { + require(fields.nonEmpty, "at least 1 field name should be given.") + Column.fn("json_tuple", json +: fields.map(lit): _*) + } + + // scalastyle:off line.size.limit + /** + * (Scala-specific) Parses a column containing a JSON string into a `StructType` with the + * specified schema. Returns `null`, in the case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * @param options + * options to control how the json is parsed. Accepts the same options as the json data + * source. See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: StructType, options: Map[String, String]): Column = + from_json(e, schema.asInstanceOf[DataType], options) + + // scalastyle:off line.size.limit + /** + * (Scala-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` with the specified schema. Returns `null`, in the + * case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: DataType, options: Map[String, String]): Column = { + from_json(e, lit(schema.json), options.iterator) + } + + // scalastyle:off line.size.limit + /** + * (Java-specific) Parses a column containing a JSON string into a `StructType` with the + * specified schema. Returns `null`, in the case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. 
+ * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: StructType, options: java.util.Map[String, String]): Column = + from_json(e, schema, options.asScala.toMap) + + // scalastyle:off line.size.limit + /** + * (Java-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` with the specified schema. Returns `null`, in the + * case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: DataType, options: java.util.Map[String, String]): Column = { + from_json(e, schema, options.asScala.toMap) + } + + /** + * Parses a column containing a JSON string into a `StructType` with the specified schema. + * Returns `null`, in the case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * + * @group collection_funcs + * @since 3.4.0 + */ + def from_json(e: Column, schema: StructType): Column = + from_json(e, schema, Map.empty[String, String]) + + /** + * Parses a column containing a JSON string into a `MapType` with `StringType` as keys type, + * `StructType` or `ArrayType` with the specified schema. Returns `null`, in the case of an + * unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * + * @group collection_funcs + * @since 3.4.0 + */ + def from_json(e: Column, schema: DataType): Column = + from_json(e, schema, Map.empty[String, String]) + + // scalastyle:off line.size.limit + /** + * (Java-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` with the specified schema. Returns `null`, in the + * case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema as a DDL-formatted string. + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: String, options: java.util.Map[String, String]): Column = { + from_json(e, schema, options.asScala.toMap) + } + + // scalastyle:off line.size.limit + /** + * (Scala-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` with the specified schema. Returns `null`, in the + * case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema as a DDL-formatted string. + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. 
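For the DDL-string overloads, the schema can be passed as plain text instead of a `StructType`; a sketch under the same assumed column name:

```scala
import org.apache.spark.sql.functions._

val df = spark.sql("""SELECT '{"a": 1, "b": "x"}' AS js""")

// A DDL-formatted schema string plus an (empty) options map.
df.select(from_json(col("js"), "a INT, b STRING", Map.empty[String, String]).as("parsed"))
```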
+ * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: String, options: Map[String, String]): Column = { + val dataType = + parseTypeWithFallback(schema, DataType.fromJson, fallbackParser = DataType.fromDDL) + from_json(e, dataType, options) + } + + /** + * (Scala-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` of `StructType`s with the specified schema. Returns + * `null`, in the case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * + * @group collection_funcs + * @since 3.4.0 + */ + def from_json(e: Column, schema: Column): Column = { + from_json(e, schema, Iterator.empty) + } + + // scalastyle:off line.size.limit + /** + * (Java-specific) Parses a column containing a JSON string into a `MapType` with `StringType` + * as keys type, `StructType` or `ArrayType` of `StructType`s with the specified schema. Returns + * `null`, in the case of an unparseable string. + * + * @param e + * a string column containing JSON data. + * @param schema + * the schema to use when parsing the json string + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_json(e: Column, schema: Column, options: java.util.Map[String, String]): Column = { + from_json(e, schema, options.asScala.iterator) + } + + /** + * Invoke a function with an options map as its last argument. If there are no options, its + * column is dropped. + */ + private def fnWithOptions( + name: String, + options: Iterator[(String, String)], + arguments: Column*): Column = { + val augmentedArguments = if (options.hasNext) { + val flattenedKeyValueIterator = options.flatMap { case (k, v) => + Iterator(lit(k), lit(v)) + } + arguments :+ map(flattenedKeyValueIterator.toSeq: _*) + } else { + arguments + } + Column.fn(name, augmentedArguments: _*) + } + + private def from_json( + e: Column, + schema: Column, + options: Iterator[(String, String)]): Column = { + fnWithOptions("from_json", options, e, schema) + } + + /** + * Parses a JSON string and infers its schema in DDL format. + * + * @param json + * a JSON string. + * + * @group collection_funcs + * @since 3.4.0 + */ + def schema_of_json(json: String): Column = schema_of_json(lit(json)) + + /** + * Parses a JSON string and infers its schema in DDL format. + * + * @param json + * a foldable string column containing a JSON string. + * + * @group collection_funcs + * @since 3.4.0 + */ + def schema_of_json(json: Column): Column = Column.fn("schema_of_json", json) + + // scalastyle:off line.size.limit + /** + * Parses a JSON string and infers its schema in DDL format using options. + * + * @param json + * a foldable string column containing JSON data. + * @param options + * options to control how the json is parsed. accepts the same options and the json data + * source. See Data + * Source Option in the version you use. + * @return + * a column with string literal containing schema in DDL format. 
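`schema_of_json` pairs naturally with the `Column`-schema overload of `from_json`: infer a DDL schema from one representative document, then reuse it to parse the column. A sketch with an assumed sample document:

```scala
import org.apache.spark.sql.functions._

val sample = """{"a": 1, "b": [1, 2]}"""
val df = spark.sql(s"SELECT '$sample' AS js")

// Show the inferred DDL schema, then use it directly while parsing.
df.select(schema_of_json(lit(sample)).as("ddl")).show(false)
df.select(from_json(col("js"), schema_of_json(lit(sample))).as("parsed")).show(false)
```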
+ * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def schema_of_json(json: Column, options: java.util.Map[String, String]): Column = + fnWithOptions("schema_of_json", options.asScala.iterator, json) + + // scalastyle:off line.size.limit + /** + * (Scala-specific) Converts a column containing a `StructType`, `ArrayType` or a `MapType` into + * a JSON string with the specified schema. Throws an exception, in the case of an unsupported + * type. + * + * @param e + * a column containing a struct, an array or a map. + * @param options + * options to control how the struct column is converted into a json string. accepts the same + * options and the json data source. See Data + * Source Option in the version you use. Additionally the function supports the `pretty` + * option which enables pretty JSON generation. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def to_json(e: Column, options: Map[String, String]): Column = + fnWithOptions("to_json", options.iterator, e) + + // scalastyle:off line.size.limit + /** + * (Java-specific) Converts a column containing a `StructType`, `ArrayType` or a `MapType` into + * a JSON string with the specified schema. Throws an exception, in the case of an unsupported + * type. + * + * @param e + * a column containing a struct, an array or a map. + * @param options + * options to control how the struct column is converted into a json string. accepts the same + * options and the json data source. See Data + * Source Option in the version you use. Additionally the function supports the `pretty` + * option which enables pretty JSON generation. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def to_json(e: Column, options: java.util.Map[String, String]): Column = + to_json(e, options.asScala.toMap) + + /** + * Converts a column containing a `StructType`, `ArrayType` or a `MapType` into a JSON string + * with the specified schema. Throws an exception, in the case of an unsupported type. + * + * @param e + * a column containing a struct, an array or a map. + * + * @group collection_funcs + * @since 3.4.0 + */ + def to_json(e: Column): Column = + to_json(e, Map.empty[String, String]) + + /** + * Returns length of array or map. + * + * The function returns null for null input if spark.sql.legacy.sizeOfNull is set to false or + * spark.sql.ansi.enabled is set to true. Otherwise, the function returns -1 for null input. + * With the default settings, the function returns -1 for null input. + * + * @group collection_funcs + * @since 3.4.0 + */ + def size(e: Column): Column = Column.fn("size", e) + + /** + * Sorts the input array for the given column in ascending order, according to the natural + * ordering of the array elements. Null elements will be placed at the beginning of the returned + * array. + * + * @group collection_funcs + * @since 3.4.0 + */ + def sort_array(e: Column): Column = sort_array(e, asc = true) + + /** + * Sorts the input array for the given column in ascending or descending order, according to the + * natural ordering of the array elements. NaN is greater than any non-NaN elements for + * double/float type. Null elements will be placed at the beginning of the returned array in + * ascending order or at the end of the returned array in descending order. 
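`to_json` goes in the opposite direction, serializing a struct, array, or map column back into a JSON string; a hedged example with made-up values:

```scala
import org.apache.spark.sql.functions._

val df = spark.sql("SELECT named_struct('a', 1, 'b', 'x') AS s, array(3, 1, 2) AS arr")

df.select(to_json(col("s")))                           // {"a":1,"b":"x"}
df.select(to_json(col("s"), Map("pretty" -> "true")))  // pretty-printed output
df.select(size(col("arr")))                            // 3
```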
+ * + * @group collection_funcs + * @since 3.4.0 + */ + def sort_array(e: Column, asc: Boolean): Column = Column.fn("sort_array", e, lit(asc)) + + /** + * Returns the minimum value in the array. NaN is greater than any non-NaN elements for + * double/float type. NULL elements are skipped. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_min(e: Column): Column = Column.fn("array_min", e) + + /** + * Returns the maximum value in the array. NaN is greater than any non-NaN elements for + * double/float type. NULL elements are skipped. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_max(e: Column): Column = Column.fn("array_max", e) + + /** + * Returns a random permutation of the given array. + * + * @note + * The function is non-deterministic. + * + * @group collection_funcs + * @since 3.4.0 + */ + def shuffle(e: Column): Column = Column.fn("shuffle", e) + + /** + * Returns a reversed string or an array with reverse order of elements. + * @group collection_funcs + * @since 3.4.0 + */ + def reverse(e: Column): Column = Column.fn("reverse", e) + + /** + * Creates a single array from an array of arrays. If a structure of nested arrays is deeper + * than two levels, only one level of nesting is removed. + * @group collection_funcs + * @since 3.4.0 + */ + def flatten(e: Column): Column = Column.fn("flatten", e) + + /** + * Generate a sequence of integers from start to stop, incrementing by step. + * + * @group collection_funcs + * @since 3.4.0 + */ + def sequence(start: Column, stop: Column, step: Column): Column = + Column.fn("sequence", start, stop, step) + + /** + * Generate a sequence of integers from start to stop, incrementing by 1 if start is less than + * or equal to stop, otherwise -1. + * + * @group collection_funcs + * @since 3.4.0 + */ + def sequence(start: Column, stop: Column): Column = sequence(start, stop, lit(1L)) + + /** + * Creates an array containing the left argument repeated the number of times given by the right + * argument. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_repeat(left: Column, right: Column): Column = Column.fn("array_repeat", left, right) + + /** + * Creates an array containing the left argument repeated the number of times given by the right + * argument. + * + * @group collection_funcs + * @since 3.4.0 + */ + def array_repeat(e: Column, count: Int): Column = array_repeat(e, lit(count)) + + /** + * Returns true if the map contains the key. + * @group collection_funcs + * @since 3.4.0 + */ + def map_contains_key(column: Column, key: Any): Column = + Column.fn("map_contains_key", column, lit(key)) + + /** + * Returns an unordered array containing the keys of the map. + * @group collection_funcs + * @since 3.4.0 + */ + def map_keys(e: Column): Column = Column.fn("map_keys", e) + + /** + * Returns an unordered array containing the values of the map. + * @group collection_funcs + * @since 3.4.0 + */ + def map_values(e: Column): Column = Column.fn("map_values", e) + + /** + * Returns an unordered array of all entries in the given map. + * @group collection_funcs + * @since 3.4.0 + */ + def map_entries(e: Column): Column = Column.fn("map_entries", e) + + /** + * Returns a map created from the given array of entries. + * @group collection_funcs + * @since 3.4.0 + */ + def map_from_entries(e: Column): Column = Column.fn("map_from_entries", e) + + /** + * Returns a merged array of structs in which the N-th struct contains all N-th values of input + * arrays. 
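A quick sketch exercising several of the array and map helpers above (the input values are arbitrary):

```scala
import org.apache.spark.sql.functions._

val df = spark.sql("SELECT array(3, 1, 2) AS a, map('k1', 1, 'k2', 2) AS m")

df.select(
  sort_array(col("a")),                    // [1, 2, 3]
  array_min(col("a")), array_max(col("a")),
  reverse(col("a")),
  flatten(array(col("a"), col("a"))),      // [3, 1, 2, 3, 1, 2]
  sequence(lit(1), lit(5)),                // [1, 2, 3, 4, 5]
  array_repeat(lit("x"), 2),               // [x, x]
  map_contains_key(col("m"), "k1"),        // true
  map_keys(col("m")), map_values(col("m")), map_entries(col("m")))
```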
+ * @group collection_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def arrays_zip(e: Column*): Column = Column.fn("arrays_zip", e: _*) + + /** + * Returns the union of all the given maps. + * @group collection_funcs + * @since 3.4.0 + */ + @scala.annotation.varargs + def map_concat(cols: Column*): Column = Column.fn("map_concat", cols: _*) + + // scalastyle:off line.size.limit + /** + * Parses a column containing a CSV string into a `StructType` with the specified schema. + * Returns `null`, in the case of an unparseable string. + * + * @param e + * a string column containing CSV data. + * @param schema + * the schema to use when parsing the CSV string + * @param options + * options to control how the CSV is parsed. accepts the same options and the CSV data source. + * See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_csv(e: Column, schema: StructType, options: Map[String, String]): Column = + from_csv(e, lit(schema.toDDL), options.iterator) + + // scalastyle:off line.size.limit + /** + * (Java-specific) Parses a column containing a CSV string into a `StructType` with the + * specified schema. Returns `null`, in the case of an unparseable string. + * + * @param e + * a string column containing CSV data. + * @param schema + * the schema to use when parsing the CSV string + * @param options + * options to control how the CSV is parsed. accepts the same options and the CSV data source. + * See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def from_csv(e: Column, schema: Column, options: java.util.Map[String, String]): Column = + from_csv(e, schema, options.asScala.iterator) + + private def from_csv(e: Column, schema: Column, options: Iterator[(String, String)]): Column = + fnWithOptions("from_csv", options, e, schema) + + /** + * Parses a CSV string and infers its schema in DDL format. + * + * @param csv + * a CSV string. + * + * @group collection_funcs + * @since 3.4.0 + */ + def schema_of_csv(csv: String): Column = schema_of_csv(lit(csv)) + + /** + * Parses a CSV string and infers its schema in DDL format. + * + * @param csv + * a foldable string column containing a CSV string. + * + * @group collection_funcs + * @since 3.4.0 + */ + def schema_of_csv(csv: Column): Column = schema_of_csv(csv, Collections.emptyMap()) + + // scalastyle:off line.size.limit + /** + * Parses a CSV string and infers its schema in DDL format using options. + * + * @param csv + * a foldable string column containing a CSV string. + * @param options + * options to control how the CSV is parsed. accepts the same options and the CSV data source. + * See Data + * Source Option in the version you use. + * @return + * a column with string literal containing schema in DDL format. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def schema_of_csv(csv: Column, options: java.util.Map[String, String]): Column = + fnWithOptions("schema_of_csv", options.asScala.iterator, csv) + + // scalastyle:off line.size.limit + /** + * (Java-specific) Converts a column containing a `StructType` into a CSV string with the + * specified schema. Throws an exception, in the case of an unsupported type. + * + * @param e + * a column containing a struct. + * @param options + * options to control how the struct column is converted into a CSV string. It accepts the + * same options and the CSV data source. 
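The CSV helpers mirror the JSON ones; a sketch under assumed column names and data:

```scala
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._

val df = spark.sql("SELECT '1,abc' AS csv")

// Parse a CSV string into a struct using an explicit schema plus a parsing option.
val schema = new StructType().add("a", IntegerType).add("b", StringType)
df.select(from_csv(col("csv"), schema, Map("sep" -> ",")).as("parsed"))

// Infer the schema of a representative CSV line in DDL form.
df.select(schema_of_csv(lit("1,abc")))
```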
See Data + * Source Option in the version you use. + * + * @group collection_funcs + * @since 3.4.0 + */ + // scalastyle:on line.size.limit + def to_csv(e: Column, options: java.util.Map[String, String]): Column = + fnWithOptions("to_csv", options.asScala.iterator, e) + + /** + * Converts a column containing a `StructType` into a CSV string with the specified schema. + * Throws an exception, in the case of an unsupported type. + * + * @param e + * a column containing a struct. + * + * @group collection_funcs + * @since 3.4.0 + */ + def to_csv(e: Column): Column = to_csv(e, Collections.emptyMap()) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Partition Transforms functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + /** + * A transform for timestamps and dates to partition data into years. + * + * @group partition_transforms + * @since 3.4.0 + */ + def years(e: Column): Column = + Column.fn("years", e) + + /** + * A transform for timestamps and dates to partition data into months. + * + * @group partition_transforms + * @since 3.4.0 + */ + def months(e: Column): Column = + Column.fn("months", e) + + /** + * A transform for timestamps and dates to partition data into days. + * + * @group partition_transforms + * @since 3.4.0 + */ + def days(e: Column): Column = + Column.fn("days", e) + + /** + * A transform for timestamps to partition data into hours. + * + * @group partition_transforms + * @since 3.4.0 + */ + def hours(e: Column): Column = + Column.fn("hours", e) + + /** + * A transform for any type that partitions by a hash of the input column. + * + * @group partition_transforms + * @since 3.4.0 + */ + def bucket(numBuckets: Column, e: Column): Column = + Column.fn("bucket", numBuckets, e) + + /** + * A transform for any type that partitions by a hash of the input column. + * + * @group partition_transforms + * @since 3.4.0 + */ + def bucket(numBuckets: Int, e: Column): Column = + Column.fn("bucket", lit(numBuckets), e) + + ////////////////////////////////////////////////////////////////////////////////////////////// + // Scala UDF functions + ////////////////////////////////////////////////////////////////////////////////////////////// + + // scalastyle:off line.size.limit + + /** + * Defines a Scala closure of 0 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[RT: TypeTag](f: () => RT): UserDefinedFunction = { + ScalarUserDefinedFunction(f, typeTag[RT]) + } + + /** + * Defines a Scala closure of 1 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[RT: TypeTag, A1: TypeTag](f: A1 => RT): UserDefinedFunction = { + ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1]) + } + + /** + * Defines a Scala closure of 2 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. 
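The partition transform functions are intended for the `DataFrameWriterV2` API. A sketch only, assuming a V2 catalog named `testcat` is configured; the source table and column names are hypothetical:

```scala
import org.apache.spark.sql.functions._

// Partition the created table by year of the timestamp column and a 16-way hash bucket of id.
spark
  .table("source_events")
  .writeTo("testcat.events")
  .partitionedBy(years(col("ts")), bucket(16, col("id")))
  .create()
```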
To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag](f: (A1, A2) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2]) + } + + /** + * Defines a Scala closure of 3 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag]( + f: (A1, A2, A3) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3]) + } + + /** + * Defines a Scala closure of 4 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag]( + f: (A1, A2, A3, A4) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction(f, typeTag[RT], typeTag[A1], typeTag[A2], typeTag[A3], typeTag[A4]) + } + + /** + * Defines a Scala closure of 5 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[RT: TypeTag, A1: TypeTag, A2: TypeTag, A3: TypeTag, A4: TypeTag, A5: TypeTag]( + f: (A1, A2, A3, A4, A5) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction( + f, + typeTag[RT], + typeTag[A1], + typeTag[A2], + typeTag[A3], + typeTag[A4], + typeTag[A5]) + } + + /** + * Defines a Scala closure of 6 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[ + RT: TypeTag, + A1: TypeTag, + A2: TypeTag, + A3: TypeTag, + A4: TypeTag, + A5: TypeTag, + A6: TypeTag](f: (A1, A2, A3, A4, A5, A6) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction( + f, + typeTag[RT], + typeTag[A1], + typeTag[A2], + typeTag[A3], + typeTag[A4], + typeTag[A5], + typeTag[A6]) + } + + /** + * Defines a Scala closure of 7 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. 
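A minimal sketch of the typed `udf` builders (shown for a two-argument closure; the column expression is illustrative):

```scala
import org.apache.spark.sql.functions.{col, udf}

// Argument and return types are inferred from the closure's signature.
val add = udf((a: Long, b: Long) => a + b)

// UDFs are deterministic by default; opt out explicitly when needed.
val addNonDet = add.asNondeterministic()

spark.range(5).select(add(col("id"), col("id")).as("doubled")).show()
```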
+ * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[ + RT: TypeTag, + A1: TypeTag, + A2: TypeTag, + A3: TypeTag, + A4: TypeTag, + A5: TypeTag, + A6: TypeTag, + A7: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction( + f, + typeTag[RT], + typeTag[A1], + typeTag[A2], + typeTag[A3], + typeTag[A4], + typeTag[A5], + typeTag[A6], + typeTag[A7]) + } + + /** + * Defines a Scala closure of 8 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[ + RT: TypeTag, + A1: TypeTag, + A2: TypeTag, + A3: TypeTag, + A4: TypeTag, + A5: TypeTag, + A6: TypeTag, + A7: TypeTag, + A8: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7, A8) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction( + f, + typeTag[RT], + typeTag[A1], + typeTag[A2], + typeTag[A3], + typeTag[A4], + typeTag[A5], + typeTag[A6], + typeTag[A7], + typeTag[A8]) + } + + /** + * Defines a Scala closure of 9 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[ + RT: TypeTag, + A1: TypeTag, + A2: TypeTag, + A3: TypeTag, + A4: TypeTag, + A5: TypeTag, + A6: TypeTag, + A7: TypeTag, + A8: TypeTag, + A9: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7, A8, A9) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction( + f, + typeTag[RT], + typeTag[A1], + typeTag[A2], + typeTag[A3], + typeTag[A4], + typeTag[A5], + typeTag[A6], + typeTag[A7], + typeTag[A8], + typeTag[A9]) + } + + /** + * Defines a Scala closure of 10 arguments as user-defined function (UDF). The data types are + * automatically inferred based on the Scala closure's signature. By default the returned UDF is + * deterministic. To change it to nondeterministic, call the API + * `UserDefinedFunction.asNondeterministic()`. + * + * @group udf_funcs + * @since 3.4.0 + */ + def udf[ + RT: TypeTag, + A1: TypeTag, + A2: TypeTag, + A3: TypeTag, + A4: TypeTag, + A5: TypeTag, + A6: TypeTag, + A7: TypeTag, + A8: TypeTag, + A9: TypeTag, + A10: TypeTag](f: (A1, A2, A3, A4, A5, A6, A7, A8, A9, A10) => RT): UserDefinedFunction = { + ScalarUserDefinedFunction( + f, + typeTag[RT], + typeTag[A1], + typeTag[A2], + typeTag[A3], + typeTag[A4], + typeTag[A5], + typeTag[A6], + typeTag[A7], + typeTag[A8], + typeTag[A9], + typeTag[A10]) + } + // scalastyle:off line.size.limit + +} diff --git a/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/package.scala b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/package.scala new file mode 100644 index 0000000000000..556b472283a37 --- /dev/null +++ b/connector/connect/client/jvm/src/main/scala/org/apache/spark/sql/package.scala @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark + +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder + +package object sql { + type DataFrame = Dataset[Row] + + private[sql] def encoderFor[E: Encoder]: AgnosticEncoder[E] = { + implicitly[Encoder[E]].asInstanceOf[AgnosticEncoder[E]] + } +} diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/README.md b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/README.md new file mode 100644 index 0000000000000..df9af41064444 --- /dev/null +++ b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/README.md @@ -0,0 +1,5 @@ +The CRCs for a specific file are stored in a text file with the same name (excluding the original extension). + +The CRCs are calculated for data chunks of `32768 bytes` (individual CRCs) and are newline delimited. + +The CRCs were calculated using https://simplycalc.com/crc32-file.php \ No newline at end of file diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/junitLargeJar.txt b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/junitLargeJar.txt new file mode 100644 index 0000000000000..3e89631dea57c --- /dev/null +++ b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/junitLargeJar.txt @@ -0,0 +1,12 @@ +902183889 +2415704507 +1084811487 +1951510 +1158852476 +2003120166 +3026803842 +3850244775 +3409267044 +652109216 +104029242 +3019434266 \ No newline at end of file diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallClassFile.txt b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallClassFile.txt new file mode 100644 index 0000000000000..531f98ce9a225 --- /dev/null +++ b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallClassFile.txt @@ -0,0 +1 @@ +1935693963 \ No newline at end of file diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallClassFileDup.txt b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallClassFileDup.txt new file mode 100644 index 0000000000000..531f98ce9a225 --- /dev/null +++ b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallClassFileDup.txt @@ -0,0 +1 @@ +1935693963 \ No newline at end of file diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallJar.txt b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallJar.txt new file mode 100644 index 0000000000000..df32adcce7ab5 --- /dev/null +++ b/connector/connect/client/jvm/src/test/resources/artifact-tests/crc/smallJar.txt @@ -0,0 +1 @@ +1631702900 \ No newline at end of file diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/junitLargeJar.jar b/connector/connect/client/jvm/src/test/resources/artifact-tests/junitLargeJar.jar new file mode 100755 index 0000000000000..6da55d8b8520d Binary files /dev/null and b/connector/connect/client/jvm/src/test/resources/artifact-tests/junitLargeJar.jar differ diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/smallClassFile.class 
b/connector/connect/client/jvm/src/test/resources/artifact-tests/smallClassFile.class new file mode 100755 index 0000000000000..e796030e471b0 Binary files /dev/null and b/connector/connect/client/jvm/src/test/resources/artifact-tests/smallClassFile.class differ diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/smallClassFileDup.class b/connector/connect/client/jvm/src/test/resources/artifact-tests/smallClassFileDup.class new file mode 100755 index 0000000000000..e796030e471b0 Binary files /dev/null and b/connector/connect/client/jvm/src/test/resources/artifact-tests/smallClassFileDup.class differ diff --git a/connector/connect/client/jvm/src/test/resources/artifact-tests/smallJar.jar b/connector/connect/client/jvm/src/test/resources/artifact-tests/smallJar.jar new file mode 100755 index 0000000000000..3c4930e8e9549 Binary files /dev/null and b/connector/connect/client/jvm/src/test/resources/artifact-tests/smallJar.jar differ diff --git a/connector/connect/client/jvm/src/test/resources/log4j2.properties b/connector/connect/client/jvm/src/test/resources/log4j2.properties new file mode 100644 index 0000000000000..ab02104c69697 --- /dev/null +++ b/connector/connect/client/jvm/src/test/resources/log4j2.properties @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file target/unit-tests.log +rootLogger.level = info +rootLogger.appenderRef.file.ref = ${sys:test.appender:-File} + +appender.file.type = File +appender.file.name = File +appender.file.fileName = target/unit-tests.log +appender.file.layout.type = PatternLayout +appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n%ex + +# Tests that launch java subprocesses can set the "test.appender" system property to +# "console" to avoid having the child process's logs overwrite the unit test's +# log file. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %t: %m%n%ex + +# Ignore messages below warning level from Jetty, because it's a bit verbose +logger.jetty.name = org.sparkproject.jetty +logger.jetty.level = warn diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala new file mode 100644 index 0000000000000..ee7117552c89c --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ClientE2ETestSuite.scala @@ -0,0 +1,847 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.io.{ByteArrayOutputStream, PrintStream} +import java.nio.file.Files + +import scala.collection.JavaConverters._ + +import io.grpc.StatusRuntimeException +import java.util.Properties +import org.apache.commons.io.FileUtils +import org.apache.commons.io.output.TeeOutputStream +import org.apache.commons.lang3.{JavaVersion, SystemUtils} +import org.scalactic.TolerantNumerics + +import org.apache.spark.SPARK_VERSION +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.StringEncoder +import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema +import org.apache.spark.sql.catalyst.parser.ParseException +import org.apache.spark.sql.connect.client.util.{IntegrationTestUtils, RemoteSparkSession} +import org.apache.spark.sql.functions.{aggregate, array, broadcast, col, count, lit, rand, sequence, shuffle, struct, transform, udf} +import org.apache.spark.sql.types._ + +class ClientE2ETestSuite extends RemoteSparkSession with SQLHelper { + + // Spark Result + test("spark result schema") { + val df = spark.sql("select val from (values ('Hello'), ('World')) as t(val)") + df.withResult { result => + val schema = result.schema + assert(schema == StructType(StructField("val", StringType, nullable = false) :: Nil)) + } + } + + test("spark result array") { + val df = spark.sql("select val from (values ('Hello'), ('World')) as t(val)") + val result = df.collect() + assert(result.length == 2) + assert(result(0).getString(0) == "Hello") + assert(result(1).getString(0) == "World") + } + + test("eager execution of sql") { + assume(IntegrationTestUtils.isSparkHiveJarAvailable) + withTable("test_martin") { + // Fails, because table does not exist. + assertThrows[StatusRuntimeException] { + spark.sql("select * from test_martin").collect() + } + // Execute eager, DML + spark.sql("create table test_martin (id int)") + // Execute read again. 
+ val rows = spark.sql("select * from test_martin").collect() + assert(rows.length == 0) + spark.sql("insert into test_martin values (1), (2)") + val rows_new = spark.sql("select * from test_martin").collect() + assert(rows_new.length == 2) + } + } + + test("simple dataset") { + val df = spark.range(10).limit(3) + val result = df.collect() + assert(result.length == 3) + assert(result(0) == 0) + assert(result(1) == 1) + assert(result(2) == 2) + } + + ignore("SPARK-42665: Ignore simple udf test until the udf is fully implemented.") { + def dummyUdf(x: Int): Int = x + 5 + val myUdf = udf(dummyUdf _) + val df = spark.range(5).select(myUdf(Column("id"))) + val result = df.collect() + assert(result.length == 5) + result.zipWithIndex.foreach { case (v, idx) => + assert(v.getInt(0) == idx + 5) + } + } + + test("read and write") { + val testDataPath = java.nio.file.Paths + .get( + IntegrationTestUtils.sparkHome, + "connector", + "connect", + "common", + "src", + "test", + "resources", + "query-tests", + "test-data", + "people.csv") + .toAbsolutePath + val df = spark.read + .format("csv") + .option("path", testDataPath.toString) + .options(Map("header" -> "true", "delimiter" -> ";")) + .schema( + StructType( + StructField("name", StringType) :: + StructField("age", IntegerType) :: + StructField("job", StringType) :: Nil)) + .load() + val outputFolderPath = Files.createTempDirectory("output").toAbsolutePath + + df.write + .format("csv") + .mode("overwrite") + .options(Map("header" -> "true", "delimiter" -> ";")) + .save(outputFolderPath.toString) + + // We expect only one csv file saved. + val outputFile = outputFolderPath.toFile + .listFiles() + .filter(file => file.getPath.endsWith(".csv"))(0) + + assert(FileUtils.contentEquals(testDataPath.toFile, outputFile)) + } + + test("read path collision") { + val testDataPath = java.nio.file.Paths + .get( + IntegrationTestUtils.sparkHome, + "connector", + "connect", + "common", + "src", + "test", + "resources", + "query-tests", + "test-data", + "people.csv") + .toAbsolutePath + val df = spark.read + .format("csv") + .option("path", testDataPath.toString) + .options(Map("header" -> "true", "delimiter" -> ";")) + .schema( + StructType( + StructField("name", StringType) :: + StructField("age", IntegerType) :: + StructField("job", StringType) :: Nil)) + .csv(testDataPath.toString) + // Failed because the path cannot be provided both via option and load method (csv). 
+ assertThrows[StatusRuntimeException] { + df.collect() + } + } + + test("textFile") { + val testDataPath = java.nio.file.Paths + .get( + IntegrationTestUtils.sparkHome, + "connector", + "connect", + "common", + "src", + "test", + "resources", + "query-tests", + "test-data", + "people.txt") + .toAbsolutePath + val result = spark.read.textFile(testDataPath.toString).collect() + val expected = Array("Michael, 29", "Andy, 30", "Justin, 19") + assert(result.length == 3) + assert(result === expected) + } + + test("write table") { + withTable("myTable") { + val df = spark.range(10).limit(3) + df.write.mode(SaveMode.Overwrite).saveAsTable("myTable") + spark.range(2).write.insertInto("myTable") + val result = spark.sql("select * from myTable").sort("id").collect() + assert(result.length == 5) + assert(result(0).getLong(0) == 0) + assert(result(1).getLong(0) == 0) + assert(result(2).getLong(0) == 1) + assert(result(3).getLong(0) == 1) + assert(result(4).getLong(0) == 2) + } + } + + test("write without table or path") { + // Should receive no error to write noop + spark.range(10).write.format("noop").mode("append").save() + } + + test("write jdbc") { + if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_9)) { + val url = "jdbc:derby:memory:1234" + val table = "t1" + try { + spark.range(10).write.jdbc(url = s"$url;create=true", table, new Properties()) + val result = spark.read.jdbc(url = url, table, new Properties()).collect() + assert(result.length == 10) + } finally { + // clean up + assertThrows[StatusRuntimeException] { + spark.read.jdbc(url = s"$url;drop=true", table, new Properties()).collect() + } + } + } + } + + test("writeTo with create") { + withTable("testcat.myTableV2") { + + val rows = Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c")) + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + + spark.createDataFrame(rows.asJava, schema).writeTo("testcat.myTableV2").create() + + val outputRows = spark.table("testcat.myTableV2").collect() + assert(outputRows.length == 3) + } + } + + test("writeTo with create and using") { + withTable("testcat.myTableV2") { + val rows = Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c")) + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + + spark.createDataFrame(rows.asJava, schema).writeTo("testcat.myTableV2").create() + val outputRows = spark.table("testcat.myTableV2").collect() + assert(outputRows.length == 3) + + val columns = spark.table("testcat.myTableV2").columns + assert(columns.length == 2) + + val sqlOutputRows = spark.sql("select * from testcat.myTableV2").collect() + assert(outputRows.length == 3) + assert(sqlOutputRows(0).schema == schema) + assert(sqlOutputRows(1).getString(1) == "b") + } + } + + test("writeTo with create and append") { + withTable("testcat.myTableV2") { + + val rows = Seq(Row(1L, "a"), Row(2L, "b"), Row(3L, "c")) + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + + spark.sql("CREATE TABLE testcat.myTableV2 (id bigint, data string) USING foo") + + assert(spark.table("testcat.myTableV2").collect().isEmpty) + + spark.createDataFrame(rows.asJava, schema).writeTo("testcat.myTableV2").append() + val outputRows = spark.table("testcat.myTableV2").collect() + assert(outputRows.length == 3) + } + } + + test("WriteTo with overwrite") { + withTable("testcat.myTableV2") { + + val rows1 = (1L to 3L).map { i => + Row(i, "" + (i - 1 + 'a')) + } + val rows2 = (4L to 7L).map { i => + Row(i, "" + (i - 1 + 'a')) + } + 
+ val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + + spark.sql( + "CREATE TABLE testcat.myTableV2 (id bigint, data string) USING foo PARTITIONED BY (id)") + + assert(spark.table("testcat.myTableV2").collect().isEmpty) + + spark.createDataFrame(rows1.asJava, schema).writeTo("testcat.myTableV2").append() + val outputRows = spark.table("testcat.myTableV2").collect() + assert(outputRows.length == 3) + + spark + .createDataFrame(rows2.asJava, schema) + .writeTo("testcat.myTableV2") + .overwrite(functions.expr("true")) + val outputRows2 = spark.table("testcat.myTableV2").collect() + assert(outputRows2.length == 4) + + } + } + + test("WriteTo with overwritePartitions") { + withTable("testcat.myTableV2") { + + val rows = (4L to 7L).map { i => + Row(i, "" + (i - 1 + 'a')) + } + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + + spark.sql( + "CREATE TABLE testcat.myTableV2 (id bigint, data string) USING foo PARTITIONED BY (id)") + + assert(spark.table("testcat.myTableV2").collect().isEmpty) + + spark + .createDataFrame(rows.asJava, schema) + .writeTo("testcat.myTableV2") + .overwritePartitions() + val outputRows = spark.table("testcat.myTableV2").collect() + assert(outputRows.length == 4) + + } + } + + test("write path collision") { + val df = spark.range(10) + val outputFolderPath = Files.createTempDirectory("output").toAbsolutePath + // Failed because the path cannot be provided both via option and save method. + assertThrows[StatusRuntimeException] { + df.write.option("path", outputFolderPath.toString).save(outputFolderPath.toString) + } + } + + // TODO test large result when we can create table or view + // test("test spark large result") + private def captureStdOut(block: => Unit): String = { + val currentOut = Console.out + val capturedOut = new ByteArrayOutputStream() + val newOut = new PrintStream(new TeeOutputStream(currentOut, capturedOut)) + Console.withOut(newOut) { + block + } + capturedOut.toString + } + + private def checkFragments(result: String, fragmentsToCheck: Seq[String]): Unit = { + fragmentsToCheck.foreach { fragment => + assert(result.contains(fragment)) + } + } + + private def testCapturedStdOut(block: => Unit, fragmentsToCheck: String*): Unit = { + checkFragments(captureStdOut(block), fragmentsToCheck) + } + + private def testCapturedStdOut( + block: => Unit, + expectedNumLines: Int, + expectedMaxWidth: Int, + fragmentsToCheck: String*): Unit = { + val result = captureStdOut(block) + val lines = result.split('\n') + assert(lines.length === expectedNumLines) + assert(lines.map((s: String) => s.length).max <= expectedMaxWidth) + checkFragments(result, fragmentsToCheck) + } + + private val simpleSchema = new StructType().add("value", "long", nullable = true) + + // Dataset tests + test("Dataset inspection") { + val df = spark.range(10) + val local = spark.newDataFrame { builder => + builder.getLocalRelationBuilder.setSchema(simpleSchema.catalogString) + } + assert(!df.isLocal) + assert(local.isLocal) + assert(!df.isStreaming) + assert(df.toString.contains("[value: bigint]")) + assert(df.inputFiles.isEmpty) + } + + test("Dataset schema") { + val df = spark.range(10) + assert(df.schema === simpleSchema) + assert(df.dtypes === Array(("value", "LongType"))) + assert(df.columns === Array("value")) + testCapturedStdOut(df.printSchema(), simpleSchema.treeString) + testCapturedStdOut(df.printSchema(5), simpleSchema.treeString(5)) + } + + test("Dataframe schema") { + val df = spark.sql("select 
* from range(10)") + val expectedSchema = new StructType().add("id", "long", nullable = false) + assert(df.schema === expectedSchema) + assert(df.dtypes === Array(("id", "LongType"))) + assert(df.columns === Array("id")) + testCapturedStdOut(df.printSchema(), expectedSchema.treeString) + testCapturedStdOut(df.printSchema(5), expectedSchema.treeString(5)) + } + + test("Dataset explain") { + val df = spark.range(10) + val simpleExplainFragments = Seq("== Physical Plan ==") + testCapturedStdOut(df.explain(), simpleExplainFragments: _*) + testCapturedStdOut(df.explain(false), simpleExplainFragments: _*) + testCapturedStdOut(df.explain("simple"), simpleExplainFragments: _*) + val extendedExplainFragments = Seq( + "== Parsed Logical Plan ==", + "== Analyzed Logical Plan ==", + "== Optimized Logical Plan ==") ++ + simpleExplainFragments + testCapturedStdOut(df.explain(true), extendedExplainFragments: _*) + testCapturedStdOut(df.explain("extended"), extendedExplainFragments: _*) + testCapturedStdOut( + df.explain("cost"), + simpleExplainFragments :+ "== Optimized Logical Plan ==": _*) + testCapturedStdOut(df.explain("codegen"), "WholeStageCodegen subtrees.") + testCapturedStdOut(df.explain("formatted"), "Range", "Arguments: ") + } + + test("Dataset result collection") { + def checkResult(rows: TraversableOnce[java.lang.Long], expectedValues: Long*): Unit = { + rows.toIterator.zipAll(expectedValues.iterator, null, null).foreach { + case (actual, expected) => assert(actual === expected) + } + } + val df = spark.range(10) + checkResult(df.head() :: Nil, 0L) + checkResult(df.head(5), 0L, 1L, 2L, 3L, 4L) + checkResult(df.first() :: Nil, 0L) + assert(!df.isEmpty) + assert(df.filter("id > 100").isEmpty) + checkResult(df.take(3), 0L, 1L, 2L) + checkResult(df.tail(3), 7L, 8L, 9L) + checkResult(df.takeAsList(10).asScala, 0L, 1L, 2L, 3L, 4L, 5L, 6L, 7L, 8L, 9L) + checkResult(df.filter("id % 3 = 0").collect(), 0L, 3L, 6L, 9L) + checkResult(df.filter("id < 3").collectAsList().asScala, 0L, 1L, 2L) + val iterator = df.filter("id > 5 and id < 9").toLocalIterator() + try { + checkResult(iterator.asScala, 6L, 7L, 8L) + } finally { + iterator.asInstanceOf[AutoCloseable].close() + } + } + + test("Dataset show") { + val df = spark.range(20) + testCapturedStdOut(df.show(), 24, 5, "+---+", "| id|", "| 0|", "| 19|") + testCapturedStdOut( + df.show(10), + 15, + 24, + "+---+", + "| id|", + "| 0|", + "| 9|", + "only showing top 10 rows") + val wideDf = + spark.range(4).selectExpr("id", "concat('very_very_very_long_string', id) as val") + testCapturedStdOut( + wideDf.show(true), + 8, + 26, + "+---+--------------------+", + "| id| val|", + "| 0|very_very_very_lo...|") + testCapturedStdOut( + wideDf.show(false), + 8, + 33, + "+---+---------------------------+", + "|id |val |", + "|2 |very_very_very_long_string2|") + testCapturedStdOut( + wideDf.show(2, truncate = false), + 7, + 33, + "+---+---------------------------+", + "|id |val |", + "|1 |very_very_very_long_string1|", + "only showing top 2 rows") + testCapturedStdOut( + df.show(8, 10, vertical = true), + 17, + 23, + "-RECORD 3--", + "id | 7", + "only showing top 8 rows") + } + + test("Dataset randomSplit") { + implicit val tolerance = TolerantNumerics.tolerantDoubleEquality(0.01) + + val df = spark.range(100) + def checkSample( + ds: Dataset[java.lang.Long], + lower: Double, + upper: Double, + seed: Long): Unit = { + assert(ds.plan.getRoot.hasSample) + val sample = ds.plan.getRoot.getSample + assert(sample.getSeed === seed) + assert(sample.getLowerBound === lower) + 
assert(sample.getUpperBound === upper) + } + val Array(ds1, ds2, ds3) = df.randomSplit(Array(8, 9, 7), 123L) + checkSample(ds1, 0, 8.0 / 24.0, 123L) + checkSample(ds2, 8.0 / 24.0, 17.0 / 24.0, 123L) + checkSample(ds3, 17.0 / 24.0, 1.0, 123L) + + val datasets = df.randomSplitAsList(Array(1, 2, 3, 4), 9L) + assert(datasets.size() === 4) + checkSample(datasets.get(0), 0, 1.0 / 10.0, 9L) + checkSample(datasets.get(1), 1.0 / 10.0, 3.0 / 10.0, 9L) + checkSample(datasets.get(2), 3.0 / 10.0, 6.0 / 10.0, 9L) + checkSample(datasets.get(3), 6.0 / 10.0, 1.0, 9L) + } + + test("Dataset count") { + assert(spark.range(10).count() === 10) + } + + test("Dataset collect tuple") { + val session = spark + import session.implicits._ + val result = session + .range(3) + .select(col("id"), (col("id") % 2).cast("int").as("a"), (col("id") / lit(10.0d)).as("b")) + .as[(Long, Int, Double)] + .collect() + result.zipWithIndex.foreach { case ((id, a, b), i) => + assert(id == i) + assert(a == id % 2) + assert(b == id / 10.0d) + } + } + + private val generateMyTypeColumns = Seq( + (col("id") / lit(10.0d)).as("b"), + col("id"), + lit("world").as("d"), + (col("id") % 2).cast("int").as("a")) + + private def validateMyTypeResult(result: Array[MyType]): Unit = { + result.zipWithIndex.foreach { case (MyType(id, a, b), i) => + assert(id == i) + assert(a == id % 2) + assert(b == id / 10.0d) + } + } + + test("Dataset collect complex type") { + val session = spark + import session.implicits._ + val result = session + .range(3) + .select(generateMyTypeColumns: _*) + .as[MyType] + .collect() + validateMyTypeResult(result) + } + + test("Dataset typed select - simple column") { + val numRows = spark.range(1000).select(count("id")).first() + assert(numRows === 1000) + } + + test("Dataset typed select - complex column") { + val session = spark + import session.implicits._ + val ds = session + .range(3) + .select(struct(generateMyTypeColumns: _*).as[MyType]) + validateMyTypeResult(ds.collect()) + } + + test("lambda functions") { + // This test is mostly to validate lambda variables are properly resolved. + val result = spark + .range(3) + .select( + col("id"), + array(sequence(col("id"), lit(10)), sequence(col("id") * 2, lit(10))).as("data")) + .select(col("id"), transform(col("data"), x => transform(x, x => x + 1)).as("data")) + .select( + col("id"), + transform(col("data"), x => aggregate(x, lit(0L), (x, y) => x + y)).as("summaries")) + .collect() + val expected = Array(Row(0L, Seq(66L, 66L)), Row(1L, Seq(65L, 63L)), Row(2L, Seq(63L, 56L))) + assert(result === expected) + } + + test("shuffle array") { + // We cannot do structural tests for shuffle because its random seed will always change. 
+ val result = spark + .sql("select 1") + .select(shuffle(array(lit(1), lit(2), lit(3), lit(74)))) + .head() + .getSeq[Int](0) + assert(result.toSet === Set(1, 2, 3, 74)) + } + + test("ambiguous joins") { + val left = spark.range(100).select(col("id"), rand(10).as("a")) + val right = spark.range(100).select(col("id"), rand(12).as("a")) + val joined = left.join(right, left("id") === right("id")).select(left("id"), right("a")) + assert(joined.schema.catalogString === "struct") + + val joined2 = left + .join(right, left.colRegex("id") === right.colRegex("id")) + .select(left("id"), right("a")) + assert(joined2.schema.catalogString === "struct") + } + + test("broadcast join") { + withSQLConf("spark.sql.autoBroadcastJoinThreshold" -> "-1") { + val left = spark.range(100).select(col("id"), rand(10).as("a")) + val right = spark.range(100).select(col("id"), rand(12).as("a")) + val joined = + left.join(broadcast(right), left("id") === right("id")).select(left("id"), right("a")) + assert(joined.schema.catalogString === "struct") + testCapturedStdOut(joined.explain(), "BroadcastHashJoin") + } + } + + test("test temp view") { + try { + spark.range(100).createTempView("test1") + assert(spark.sql("SELECT * FROM test1").count() == 100) + spark.range(1000).createOrReplaceTempView("test1") + assert(spark.sql("SELECT * FROM test1").count() == 1000) + spark.range(100).createGlobalTempView("view1") + assert(spark.sql("SELECT * FROM global_temp.view1").count() == 100) + spark.range(1000).createOrReplaceGlobalTempView("view1") + assert(spark.sql("SELECT * FROM global_temp.view1").count() == 1000) + } finally { + spark.sql("DROP VIEW IF EXISTS test1") + spark.sql("DROP VIEW IF EXISTS global_temp.view1") + } + } + + test("version") { + assert(spark.version == SPARK_VERSION) + } + + test("time") { + val timeFragments = Seq("Time taken: ", " ms") + testCapturedStdOut(spark.time(spark.sql("select 1").collect()), timeFragments: _*) + } + + test("RuntimeConfig") { + intercept[NoSuchElementException](spark.conf.get("foo.bar")) + assert(spark.conf.getOption("foo.bar").isEmpty) + spark.conf.set("foo.bar", value = true) + assert(spark.conf.getOption("foo.bar") === Option("true")) + spark.conf.set("foo.bar.numBaz", 100L) + assert(spark.conf.get("foo.bar.numBaz") === "100") + spark.conf.set("foo.bar.name", "donkey") + assert(spark.conf.get("foo.bar.name") === "donkey") + spark.conf.unset("foo.bar.name") + val allKeyValues = spark.conf.getAll + assert(allKeyValues("foo.bar") === "true") + assert(allKeyValues("foo.bar.numBaz") === "100") + assert(!spark.conf.isModifiable("foo.bar")) // This is a bit odd. 
+ assert(spark.conf.isModifiable("spark.sql.ansi.enabled")) + assert(!spark.conf.isModifiable("spark.sql.globalTempDatabase")) + intercept[Exception](spark.conf.set("spark.sql.globalTempDatabase", "/dev/null")) + } + + test("SparkVersion") { + assert(!spark.version.isEmpty) + } + + private def checkSameResult[E](expected: scala.collection.Seq[E], dataset: Dataset[E]): Unit = { + dataset.withResult { result => + assert(expected === result.iterator.asScala.toBuffer) + } + } + + test("Local Relation implicit conversion") { + val session = spark + import session.implicits._ + + val simpleValues = Seq(1, 24, 3) + checkSameResult(simpleValues, simpleValues.toDS()) + checkSameResult(simpleValues.map(v => Row(v)), simpleValues.toDF()) + + val complexValues = Seq((5, "a"), (6, "b")) + checkSameResult(complexValues, complexValues.toDS()) + checkSameResult( + complexValues.map(kv => KV(kv._2, kv._1)), + complexValues.toDF("value", "key").as[KV]) + } + + test("SparkSession.createDataFrame - row") { + val rows = java.util.Arrays.asList(Row("bob", 99), Row("Club", 5), Row("Bag", 5)) + val schema = new StructType().add("key", "string").add("value", "int") + checkSameResult(rows.asScala, spark.createDataFrame(rows, schema)) + } + + test("SparkSession.createDataFrame - bean") { + def bean(v: String): SimpleBean = { + val bean = new SimpleBean + bean.setValue(v) + bean + } + val beans = java.util.Arrays.asList(bean("x"), bean("s"), bean("d")) + checkSameResult( + beans.asScala.map(b => Row(b.value)), + spark.createDataFrame(beans, classOf[SimpleBean])) + } + + test("SparkSession typed createDataSet/createDataframe") { + val session = spark + import session.implicits._ + val list = java.util.Arrays.asList(KV("bob", 99), KV("Club", 5), KV("Bag", 5)) + checkSameResult(list.asScala, session.createDataset(list)) + checkSameResult( + list.asScala.map(kv => Row(kv.key, kv.value)), + session.createDataFrame(list.asScala.toSeq)) + } + + test("SparkSession newSession") { + val oldId = spark.sql("SELECT 1").analyze.getSessionId + val newId = spark.newSession().sql("SELECT 1").analyze.getSessionId + assert(oldId != newId) + } + + test("createDataFrame from complex type schema") { + val schema = new StructType() + .add( + "c1", + new StructType() + .add("c1-1", StringType) + .add("c1-2", StringType)) + val data = Seq(Row(Row(null, "a2")), Row(Row("b1", "b2")), Row(null)) + val result = spark.createDataFrame(data.asJava, schema).collect() + assert(result === data) + } + + test("SameSemantics") { + val plan = spark.sql("select 1") + val otherPlan = spark.sql("select 1") + assert(plan.sameSemantics(otherPlan)) + } + + test("sameSemantics and semanticHash") { + val df1 = spark.createDataFrame(Seq((1, 2), (4, 5))) + val df2 = spark.createDataFrame(Seq((1, 2), (4, 5))) + val df3 = spark.createDataFrame(Seq((0, 2), (4, 5))) + val df4 = spark.createDataFrame(Seq((0, 2), (4, 5))) + + assert(df1.sameSemantics(df2) === true) + assert(df1.sameSemantics(df3) === false) + assert(df3.sameSemantics(df4) === true) + + assert(df1.semanticHash === df2.semanticHash) + assert(df1.semanticHash !== df3.semanticHash) + assert(df3.semanticHash === df4.semanticHash) + } + + test("toJSON") { + val expected = Array( + """{"b":0.0,"id":0,"d":"world","a":0}""", + """{"b":0.1,"id":1,"d":"world","a":1}""", + """{"b":0.2,"id":2,"d":"world","a":0}""") + val result = spark + .range(3) + .select(generateMyTypeColumns: _*) + .toJSON + .collect() + assert(result sameElements expected) + } + + test("json from Dataset[String] inferSchema") { + val session = 
spark + import session.implicits._ + val expected = Seq( + new GenericRowWithSchema( + Array(73, "Shandong", "Kong"), + new StructType().add("age", LongType).add("city", StringType).add("name", StringType))) + val ds = Seq("""{"name":"Kong","age":73,"city":'Shandong'}""").toDS() + val result = spark.read.option("allowSingleQuotes", "true").json(ds) + checkSameResult(expected, result) + } + + test("json from Dataset[String] with schema") { + val session = spark + import session.implicits._ + val schema = new StructType().add("city", StringType).add("name", StringType) + val expected = Seq(new GenericRowWithSchema(Array("Shandong", "Kong"), schema)) + val ds = Seq("""{"name":"Kong","age":73,"city":'Shandong'}""").toDS() + val result = spark.read.schema(schema).option("allowSingleQuotes", "true").json(ds) + checkSameResult(expected, result) + } + + test("json from Dataset[String] with invalid schema") { + val message = intercept[ParseException] { + spark.read.schema("123").json(spark.createDataset(Seq.empty[String])(StringEncoder)) + }.getMessage + assert(message.contains("PARSE_SYNTAX_ERROR")) + } + + test("csv from Dataset[String] inferSchema") { + val session = spark + import session.implicits._ + val expected = Seq( + new GenericRowWithSchema( + Array("Meng", 84, "Shandong"), + new StructType().add("name", StringType).add("age", LongType).add("city", StringType))) + val ds = Seq("name,age,city", """"Meng",84,"Shandong"""").toDS() + val result = spark.read + .option("header", "true") + .option("inferSchema", "true") + .csv(ds) + checkSameResult(expected, result) + } + + test("csv from Dataset[String] with schema") { + val session = spark + import session.implicits._ + val schema = new StructType().add("name", StringType).add("age", LongType) + val expected = Seq(new GenericRowWithSchema(Array("Meng", 84), schema)) + val ds = Seq(""""Meng",84,"Shandong"""").toDS() + val result = spark.read.schema(schema).csv(ds) + checkSameResult(expected, result) + } + + test("csv from Dataset[String] with invalid schema") { + val message = intercept[ParseException] { + spark.read.schema("123").csv(spark.createDataset(Seq.empty[String])(StringEncoder)) + }.getMessage + assert(message.contains("PARSE_SYNTAX_ERROR")) + } +} + +private[sql] case class MyType(id: Long, a: Double, b: Double) +private[sql] case class KV(key: String, value: Int) +private[sql] class SimpleBean { + @scala.beans.BeanProperty + var value: String = _ +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ColumnTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ColumnTestSuite.scala new file mode 100644 index 0000000000000..0d361fe1007f7 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/ColumnTestSuite.scala @@ -0,0 +1,211 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql
+
+import java.io.ByteArrayOutputStream
+
+import scala.collection.JavaConverters._
+
+import org.apache.spark.sql.{functions => fn}
+import org.apache.spark.sql.connect.client.util.ConnectFunSuite
+import org.apache.spark.sql.types._
+
+/**
+ * Tests for client local Column behavior.
+ */
+class ColumnTestSuite extends ConnectFunSuite {
+  test("equals & hashcode") {
+    def expr: Column = fn.when(fn.col("a") < 10, "a").otherwise("b")
+    val a = expr
+    val b = expr
+    val c = expr.as("nope")
+    assert(a == a)
+    assert(b == b)
+    assert(c == c)
+    assert(a == b)
+    assert(b == a)
+    assert(a != c)
+    assert(c != a)
+    assert(b != c)
+    assert(c != b)
+    assert(a.hashCode == b.hashCode)
+    assert(a.hashCode != c.hashCode)
+  }
+
+  test("invalid when usage") {
+    intercept[IllegalArgumentException] {
+      fn.col("a").when(fn.lit(true), 2)
+    }
+    intercept[IllegalArgumentException] {
+      fn.col("a").isNull.when(fn.lit(true), 2)
+    }
+    intercept[IllegalArgumentException] {
+      fn.when(fn.col("a") < 10, 1)
+        .otherwise(2)
+        .when(fn.col("b") > 8, 3)
+    }
+  }
+
+  test("invalid otherwise usage") {
+    intercept[IllegalArgumentException] {
+      fn.col("a").otherwise(2)
+    }
+    intercept[IllegalArgumentException] {
+      fn.col("a").isNull.otherwise(2)
+    }
+    intercept[IllegalArgumentException] {
+      fn.when(fn.col("a") < 10, 1)
+        .otherwise(2)
+        .otherwise(3)
+    }
+  }
+
+  test("invalid withField usage") {
+    intercept[IllegalArgumentException] {
+      fn.col("c").withField(null, fn.lit(1))
+    }
+    intercept[IllegalArgumentException] {
+      fn.col("c").withField("x", null)
+    }
+  }
+
+  def testSame(
+      name: String,
+      f1: (Column, Column) => Column,
+      f2: (Column, Column) => Column): Unit = test(name + " are the same") {
+    val a = fn.col("a")
+    val b = fn.col("b")
+    assert(f1(a, b) == f2(a, b))
+  }
+  testSame("=== and equalTo", _ === _, _.equalTo(_))
+  testSame("=!= and notEqual", _ =!= _, _.notEqual(_))
+  testSame("> and gt", _ > _, _.gt(_))
+  testSame("< and lt", _ < _, _.lt(_))
+  testSame(">= and geq", _ >= _, _.geq(_))
+  testSame("<= and leq", _ <= _, _.leq(_))
+  testSame("<=> and eqNullSafe", _ <=> _, _.eqNullSafe(_))
+  testSame("|| and or", _ || _, _.or(_))
+  testSame("&& and and", _ && _, _.and(_))
+  testSame("+ and plus", _ + _, _.plus(_))
+  testSame("- and minus", _ - _, _.minus(_))
+  testSame("* and multiply", _ * _, _.multiply(_))
+  testSame("/ and divide", _ / _, _.divide(_))
+  testSame("% and mod", _ % _, _.mod(_))
+
+  test("isIn") {
+    val a = fn.col("a")
+    val values = Seq(1, 5, 6)
+    assert(a.isin(values: _*) == a.isInCollection(values))
+    assert(a.isin(values: _*) == a.isInCollection(values.asJava))
+  }
+
+  test("getItem/apply/getField are the same") {
+    val a = fn.col("a")
+    assert(a("x") == a.getItem("x"))
+    assert(a("x") == a.getField("x"))
+  }
+
+  test("substr variations") {
+    val a = fn.col("a")
+    assert(a.substr(2, 10) == a.substr(fn.lit(2), fn.lit(10)))
+  }
+
+  test("startsWith variations") {
+    val a = fn.col("a")
+    assert(a.startsWith("p_") == a.startsWith(fn.lit("p_")))
+  }
+
+  test("endsWith variations") {
+    val a = fn.col("a")
+    assert(a.endsWith("world") == a.endsWith(fn.lit("world")))
+  }
+
+  test("alias/as/name are the same") {
+    val a = fn.col("a")
+    assert(a.as("x") == a.alias("x"))
+    assert(a.as("x") == a.name("x"))
+  }
+
+  test("multi-alias variations") {
+    val a = fn.col("a")
+    assert(a.as("x" :: "y" :: Nil) == a.as(Array("x", "y")))
+  }
+
+  test("cast variations") {
+    val a =
fn.col("a") + assert(a.cast("string") == a.cast(StringType)) + } + + test("desc and desc_nulls_last are the same") { + val a = fn.col("a") + assert(a.desc == a.desc_nulls_last) + } + + test("asc and asc_nulls_first are the same") { + val a = fn.col("a") + assert(a.asc == a.asc_nulls_first) + } + + private def captureStdOut(block: => Unit): String = { + val capturedOut = new ByteArrayOutputStream() + Console.withOut(capturedOut)(block) + capturedOut.toString() + } + + test("explain") { + val x = fn.col("a") + fn.col("b") + val explain1 = captureStdOut(x.explain(false)) + val explain2 = captureStdOut(x.explain(true)) + assert(explain1 == explain2) + val expectedFragments = Seq("unresolved_function", "function_name: \"+\"", "arguments") + expectedFragments.foreach { fragment => + assert(explain1.contains(fragment)) + } + } + + private def testColName(dataType: DataType, f: ColumnName => StructField): Unit = { + test("ColumnName " + dataType.catalogString) { + val actual = f(new ColumnName("col")) + val expected = StructField("col", dataType) + assert(actual === expected) + } + } + + testColName(BooleanType, _.boolean) + testColName(ByteType, _.byte) + testColName(ShortType, _.short) + testColName(IntegerType, _.int) + testColName(LongType, _.long) + testColName(FloatType, _.float) + testColName(DoubleType, _.double) + testColName(DecimalType.USER_DEFAULT, _.decimal) + testColName(DecimalType(20, 10), _.decimal(20, 10)) + testColName(DateType, _.date) + testColName(TimestampType, _.timestamp) + testColName(StringType, _.string) + testColName(BinaryType, _.binary) + testColName(ArrayType(IntegerType), _.array(IntegerType)) + + private val mapType = MapType(StringType, StringType) + testColName(mapType, _.map(mapType)) + testColName(MapType(StringType, IntegerType), _.map(StringType, IntegerType)) + + private val structType1 = new StructType().add("a", "int").add("b", "string") + private val structType2 = structType1.add("c", "binary") + testColName(structType1, _.struct(structType1)) + testColName(structType2, _.struct(structType2.fields: _*)) +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala new file mode 100644 index 0000000000000..1f6ea879248dc --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameNaFunctionSuite.scala @@ -0,0 +1,407 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql + +import scala.collection.JavaConverters._ + +import org.apache.spark.sql.connect.client.util.QueryTest +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types.{StringType, StructType} + +class DataFrameNaFunctionSuite extends QueryTest with SQLHelper { + private def createDF(): DataFrame = { + val sparkSession = spark + import sparkSession.implicits._ + Seq[(String, java.lang.Integer, java.lang.Double)]( + ("Bob", 16, 176.5), + ("Alice", null, 164.3), + ("David", 60, null), + ("Nina", 25, Double.NaN), + ("Amy", null, null), + (null, null, null)).toDF("name", "age", "height") + } + + def createNaNDF(): DataFrame = { + val sparkSession = spark + import sparkSession.implicits._ + Seq[( + java.lang.Integer, + java.lang.Long, + java.lang.Short, + java.lang.Byte, + java.lang.Float, + java.lang.Double)]( + (1, 1L, 1.toShort, 1.toByte, 1.0f, 1.0), + (0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN)).toDF( + "int", + "long", + "short", + "byte", + "float", + "double") + } + + def createDFWithNestedColumns: DataFrame = { + val schema = new StructType() + .add( + "c1", + new StructType() + .add("c1-1", StringType) + .add("c1-2", StringType)) + val data = Seq(Row(Row(null, "a2")), Row(Row("b1", "b2")), Row(null)) + spark.createDataFrame(data.asJava, schema) + } + + test("drop") { + val input = createDF() + val rows = input.collect() + + val result1 = input.na.drop("name" :: Nil).select("name") + val expected1 = Array(Row("Bob"), Row("Alice"), Row("David"), Row("Nina"), Row("Amy")) + checkAnswer(result1, expected1) + + val result2 = input.na.drop("age" :: Nil).select("name") + val expected2 = Array(Row("Bob"), Row("David"), Row("Nina")) + checkAnswer(result2, expected2) + + val result3 = input.na.drop("age" :: "height" :: Nil) + val expected3 = Array(rows(0)) + checkAnswer(result3, expected3) + + val result4 = input.na.drop() + checkAnswer(result4, expected3) + + // dropna on an a dataframe with no column should return an empty data frame. + val empty = input.filter("age > 100") + assert(empty.na.drop().count() === 0L) + + // Make sure the columns are properly named. + assert(input.na.drop().columns.toSeq === input.columns.toSeq) + } + + test("drop with how") { + val input = createDF() + val rows = input.collect() + + checkAnswer( + input.na.drop("all").select("name"), + Row("Bob") :: Row("Alice") :: Row("David") :: Row("Nina") :: Row("Amy") :: Nil) + + checkAnswer(input.na.drop("any"), rows(0) :: Nil) + + checkAnswer(input.na.drop("any", Seq("age", "height")), rows(0) :: Nil) + + checkAnswer( + input.na.drop("all", Seq("age", "height")).select("name"), + Row("Bob") :: Row("Alice") :: Row("David") :: Row("Nina") :: Nil) + } + + test("drop with threshold") { + val input = createDF() + val rows = input.collect() + + checkAnswer(input.na.drop(2, Seq("age", "height")), rows(0) :: Nil) + + checkAnswer(input.na.drop(3, Seq("name", "age", "height")), rows(0)) + + // Make sure the columns are properly named. 
+ assert(input.na.drop(2, Seq("age", "height")).columns.toSeq === input.columns.toSeq) + } + + test("fill") { + val sparkSession = spark + import sparkSession.implicits._ + + val input = createDF() + + val boolInput = Seq[(String, java.lang.Boolean)]( + ("Bob", false), + ("Alice", null), + ("Mallory", true), + (null, null)).toDF("name", "spy") + + val fillNumeric = input.na.fill(50.6) + checkAnswer( + fillNumeric, + Row("Bob", 16, 176.5) :: + Row("Alice", 50, 164.3) :: + Row("David", 60, 50.6) :: + Row("Nina", 25, 50.6) :: + Row("Amy", 50, 50.6) :: + Row(null, 50, 50.6) :: Nil) + + // Make sure the columns are properly named. + assert(fillNumeric.columns.toSeq === input.columns.toSeq) + + // string + checkAnswer( + input.na.fill("unknown").select("name"), + Row("Bob") :: Row("Alice") :: Row("David") :: + Row("Nina") :: Row("Amy") :: Row("unknown") :: Nil) + assert(input.na.fill("unknown").columns.toSeq === input.columns.toSeq) + + // boolean + checkAnswer( + boolInput.na.fill(true).select("spy"), + Row(false) :: Row(true) :: Row(true) :: Row(true) :: Nil) + assert(boolInput.na.fill(true).columns.toSeq === boolInput.columns.toSeq) + + // fill double with subset columns + checkAnswer( + input.na.fill(50.6, "age" :: Nil).select("name", "age"), + Row("Bob", 16) :: + Row("Alice", 50) :: + Row("David", 60) :: + Row("Nina", 25) :: + Row("Amy", 50) :: + Row(null, 50) :: Nil) + + // fill boolean with subset columns + checkAnswer( + boolInput.na.fill(true, "spy" :: Nil).select("name", "spy"), + Row("Bob", false) :: + Row("Alice", true) :: + Row("Mallory", true) :: + Row(null, true) :: Nil) + + // fill string with subset columns + checkAnswer( + Seq[(String, String)]((null, null)).toDF("col1", "col2").na.fill("test", "col1" :: Nil), + Row("test", null)) + + checkAnswer( + Seq[(Long, Long)]((1, 2), (-1, -2), (9123146099426677101L, 9123146560113991650L)) + .toDF("a", "b") + .na + .fill(0), + Row(1, 2) :: Row(-1, -2) :: Row(9123146099426677101L, 9123146560113991650L) :: Nil) + + checkAnswer( + Seq[(java.lang.Long, java.lang.Double)]( + (null, 3.14), + (9123146099426677101L, null), + (9123146560113991650L, 1.6), + (null, null)).toDF("a", "b").na.fill(0.2), + Row(0, 3.14) :: Row(9123146099426677101L, 0.2) :: Row(9123146560113991650L, 1.6) + :: Row(0, 0.2) :: Nil) + + checkAnswer( + Seq[(java.lang.Long, java.lang.Float)]( + (null, 3.14f), + (9123146099426677101L, null), + (9123146560113991650L, 1.6f), + (null, null)).toDF("a", "b").na.fill(0.2), + Row(0, 3.14f) :: Row(9123146099426677101L, 0.2f) :: Row(9123146560113991650L, 1.6f) + :: Row(0, 0.2f) :: Nil) + + checkAnswer( + Seq[(java.lang.Long, java.lang.Double)]((null, 1.23), (3L, null), (4L, 3.45)) + .toDF("a", "b") + .na + .fill(2.34), + Row(2, 1.23) :: Row(3, 2.34) :: Row(4, 3.45) :: Nil) + + checkAnswer( + Seq[(java.lang.Long, java.lang.Double)]((null, 1.23), (3L, null), (4L, 3.45)) + .toDF("a", "b") + .na + .fill(5), + Row(5, 1.23) :: Row(3, 5.0) :: Row(4, 3.45) :: Nil) + } + + test("fill with map") { + val sparkSession = spark + import sparkSession.implicits._ + + val df = Seq[( + String, + String, + java.lang.Integer, + java.lang.Long, + java.lang.Float, + java.lang.Double, + java.lang.Boolean)]((null, null, null, null, null, null, null)) + .toDF( + "stringFieldA", + "stringFieldB", + "integerField", + "longField", + "floatField", + "doubleField", + "booleanField") + + val fillMap = Map( + "stringFieldA" -> "test", + "integerField" -> 1, + "longField" -> 2L, + "floatField" -> 3.3f, + "doubleField" -> 4.4d, + "booleanField" -> false) + + val 
expectedRow = Row("test", null, 1, 2L, 3.3f, 4.4d, false) + checkAnswer(df.na.fill(fillMap), expectedRow) + checkAnswer(df.na.fill(fillMap.asJava), expectedRow) // Test Java version + + // Ensure replacement values are cast to the column data type. + checkAnswer( + df.na.fill( + Map("integerField" -> 1d, "longField" -> 2d, "floatField" -> 3d, "doubleField" -> 4d)), + Row(null, null, 1, 2L, 3f, 4d, null)) + + // Ensure column types do not change. Columns that have null values replaced + // will no longer be flagged as nullable, so do not compare schemas directly. + assert( + df.na.fill(fillMap).schema.fields.map(_.dataType) === + df.schema.fields.map(_.dataType)) + } + + test("fill with col(*)") { + val df = createDF() + // If columns are specified with "*", they are ignored. + checkAnswer(df.na.fill("new name", Seq("*")), df.collect()) + } + + test("drop with col(*)") { + val df = createDF() + val ex = intercept[RuntimeException] { + df.na.drop("any", Seq("*")).collect() + } + assert(ex.getMessage.contains("UNRESOLVED_COLUMN.WITH_SUGGESTION")) + } + + test("fill with nested columns") { + val df = createDFWithNestedColumns + checkAnswer(df.na.fill("a1", Seq("c1.c1-1")), df) + } + + test("drop with nested columns") { + val df = createDFWithNestedColumns + + // Rows with the specified nested columns whose null values are dropped. + assert(df.count == 3) + checkAnswer(df.na.drop("any", Seq("c1.c1-1")), Seq(Row(Row("b1", "b2")))) + } + + test("replace") { + val input = createDF() + + val result1 = input.na + .replace( + Seq("age", "height"), + Map( + 16 -> 61, + 60 -> 6, + 164.3 -> 461.3 // Alice is really tall + )) + .collect() + assert(result1(0) === Row("Bob", 61, 176.5)) + assert(result1(1) === Row("Alice", null, 461.3)) + assert(result1(2) === Row("David", 6, null)) + assert(result1(3).get(2).asInstanceOf[Double].isNaN) + assert(result1(4) === Row("Amy", null, null)) + assert(result1(5) === Row(null, null, null)) + + // Replace only the age column + val result2 = input.na + .replace( + "age", + Map( + 16 -> 61, + 60 -> 6, + 164.3 -> 461.3 // Alice is really tall + )) + .collect() + assert(result2(0) === Row("Bob", 61, 176.5)) + assert(result2(1) === Row("Alice", null, 164.3)) + assert(result2(2) === Row("David", 6, null)) + assert(result2(3).get(2).asInstanceOf[Double].isNaN) + assert(result2(4) === Row("Amy", null, null)) + assert(result2(5) === Row(null, null, null)) + } + + test("replace with null") { + val input = spark.sql( + "select name, height, married from (values " + + "('Bob', 176.5, true), " + + "('Alice', 164.3, false), " + + "('David', null, true))" + + "as t(name, height, married)") + + // Replace String with String and null + val result1 = input.na.replace("name", Map("Bob" -> "Bravo", "Alice" -> null)) + + checkAnswer( + result1, + Row("Bravo", 176.5, true) :: + Row(null, 164.3, false) :: + Row("David", null, true) :: Nil) + + // Replace Double with null + val result2 = input.na.replace("height", Map[Any, Any](164.3 -> null)) + checkAnswer( + result2, + Row("Bob", 176.5, true) :: + Row("Alice", null, false) :: + Row("David", null, true) :: Nil) + + // Replace Boolean with null + checkAnswer( + input.na.replace("*", Map[Any, Any](false -> null)), + Row("Bob", 176.5, true) :: + Row("Alice", 164.3, null) :: + Row("David", null, true) :: Nil) + + // Replace String with null and then drop rows containing null + checkAnswer( + input.na.replace("name", Map("Bob" -> null)).na.drop("name" :: Nil).select("name"), + Row("Alice") :: Row("David") :: Nil) + } + + test("replace nan with 
float") { + checkAnswer( + createNaNDF().na.replace("*", Map(Float.NaN -> 10.0f)), + Row(1, 1L, 1.toShort, 1.toByte, 1.0f, 1.0) :: + Row(0, 0L, 0.toShort, 0.toByte, 10.0f, 10.0) :: Nil) + } + + test("replace nan with double") { + checkAnswer( + createNaNDF().na.replace("*", Map(Double.NaN -> 10.0)), + Row(1, 1L, 1.toShort, 1.toByte, 1.0f, 1.0) :: + Row(0, 0L, 0.toShort, 0.toByte, 10.0f, 10.0) :: Nil) + } + + test("replace float with nan") { + withSQLConf(SQLConf.ANSI_ENABLED.key -> false.toString) { + checkAnswer( + createNaNDF().na.replace("*", Map(1.0f -> Float.NaN)), + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: Nil) + } + } + + test("replace double with nan") { + withSQLConf(SQLConf.ANSI_ENABLED.key -> false.toString) { + checkAnswer( + createNaNDF().na.replace("*", Map(1.0 -> Double.NaN)), + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: + Row(0, 0L, 0.toShort, 0.toByte, Float.NaN, Double.NaN) :: Nil) + } + } + +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala new file mode 100644 index 0000000000000..aea31005f3bd6 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DataFrameStatSuite.scala @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql + +import java.util.Random + +import io.grpc.StatusRuntimeException +import org.scalatest.matchers.must.Matchers._ + +import org.apache.spark.sql.connect.client.util.RemoteSparkSession + +class DataFrameStatSuite extends RemoteSparkSession { + private def toLetter(i: Int): String = (i + 97).toChar.toString + + test("approxQuantile") { + val session = spark + import session.implicits._ + + val n = 1000 + val df = Seq.tabulate(n + 1)(i => (i, 2.0 * i)).toDF("singles", "doubles") + + val q1 = 0.5 + val q2 = 0.8 + val epsilons = List(0.1, 0.05, 0.001) + + for (epsilon <- epsilons) { + val Array(single1) = df.stat.approxQuantile("singles", Array(q1), epsilon) + val Array(double2) = df.stat.approxQuantile("doubles", Array(q2), epsilon) + // Also make sure there is no regression by computing multiple quantiles at once. 
+ val Array(d1, d2) = df.stat.approxQuantile("doubles", Array(q1, q2), epsilon) + val Array(s1, s2) = df.stat.approxQuantile("singles", Array(q1, q2), epsilon) + + val errorSingle = 1000 * epsilon + val errorDouble = 2.0 * errorSingle + + assert(math.abs(single1 - q1 * n) <= errorSingle) + assert(math.abs(double2 - 2 * q2 * n) <= errorDouble) + assert(math.abs(s1 - q1 * n) <= errorSingle) + assert(math.abs(s2 - q2 * n) <= errorSingle) + assert(math.abs(d1 - 2 * q1 * n) <= errorDouble) + assert(math.abs(d2 - 2 * q2 * n) <= errorDouble) + + // Multiple columns + val Array(Array(ms1, ms2), Array(md1, md2)) = + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), epsilon) + + assert(math.abs(ms1 - q1 * n) <= errorSingle) + assert(math.abs(ms2 - q2 * n) <= errorSingle) + assert(math.abs(md1 - 2 * q1 * n) <= errorDouble) + assert(math.abs(md2 - 2 * q2 * n) <= errorDouble) + } + + // quantile should be in the range [0.0, 1.0] + val e = intercept[IllegalArgumentException] { + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2, -0.1), epsilons.head) + } + assert(e.getMessage.contains("percentile should be in the range [0.0, 1.0]")) + + // relativeError should be non-negative + val e2 = intercept[IllegalArgumentException] { + df.stat.approxQuantile(Array("singles", "doubles"), Array(q1, q2), -1.0) + } + assert(e2.getMessage.contains("Relative Error must be non-negative")) + } + + test("covariance") { + val session = spark + import session.implicits._ + + val df = + Seq.tabulate(10)(i => (i, 2.0 * i, toLetter(i))).toDF("singles", "doubles", "letters") + + val results = df.stat.cov("singles", "doubles") + assert(math.abs(results - 55.0 / 3) < 1e-12) + intercept[StatusRuntimeException] { + df.stat.cov("singles", "letters") // doesn't accept non-numerical dataTypes + } + val decimalData = Seq.tabulate(6)(i => (BigDecimal(i % 3), BigDecimal(i % 2))).toDF("a", "b") + val decimalRes = decimalData.stat.cov("a", "b") + assert(math.abs(decimalRes) < 1e-12) + } + + test("correlation") { + val session = spark + import session.implicits._ + + val df = Seq.tabulate(10)(i => (i, 2 * i, i * -1.0)).toDF("a", "b", "c") + val corr1 = df.stat.corr("a", "b", "pearson") + assert(math.abs(corr1 - 1.0) < 1e-12) + val corr2 = df.stat.corr("a", "c", "pearson") + assert(math.abs(corr2 + 1.0) < 1e-12) + val df2 = Seq.tabulate(20)(x => (x, x * x - 2 * x + 3.5)).toDF("a", "b") + val corr3 = df2.stat.corr("a", "b", "pearson") + assert(math.abs(corr3 - 0.95723391394758572) < 1e-12) + } + + test("crosstab") { + val session = spark + import session.implicits._ + + val rng = new Random() + val data = Seq.tabulate(25)(_ => (rng.nextInt(5), rng.nextInt(10))) + val df = data.toDF("a", "b") + val crosstab = df.stat.crosstab("a", "b") + val columnNames = crosstab.schema.fieldNames + assert(columnNames(0) === "a_b") + // reduce by key + val expected = data.map(t => (t, 1)).groupBy(_._1).mapValues(_.length) + val rows = crosstab.collect() + rows.foreach { row => + val i = row.getString(0).toInt + for (col <- 1 until columnNames.length) { + val j = columnNames(col).toInt + assert(row.getLong(col) === expected.getOrElse((i, j), 0).toLong) + } + } + } + + test("freqItems") { + val session = spark + import session.implicits._ + + val rows = Seq.tabulate(1000) { i => + if (i % 3 == 0) (1, toLetter(1), -1.0) else (i, toLetter(i), i * -1.0) + } + val df = rows.toDF("numbers", "letters", "negDoubles") + + val results = df.stat.freqItems(Array("numbers", "letters"), 0.1) + val items = results.collect().head + 
assert(items.getSeq[Int](0).contains(1)) + assert(items.getSeq[String](1).contains(toLetter(1))) + + val singleColResults = df.stat.freqItems(Array("negDoubles"), 0.1) + val items2 = singleColResults.collect().head + assert(items2.getSeq[Double](0).contains(-1.0)) + } + + test("sampleBy") { + val session = spark + import session.implicits._ + val df = Seq("Bob", "Alice", "Nico", "Bob", "Alice").toDF("name") + val fractions = Map("Alice" -> 0.3, "Nico" -> 1.0) + val sampled = df.stat.sampleBy("name", fractions, 36L) + val rows = sampled.groupBy("name").count().orderBy("name").collect() + assert(rows.length == 1) + val row0 = rows(0) + assert(row0.getString(0) == "Nico") + assert(row0.getLong(1) == 1L) + } + + test("countMinSketch") { + val df = spark.range(1000) + + val sketch1 = df.stat.countMinSketch("id", depth = 10, width = 20, seed = 42) + assert(sketch1.totalCount() === 1000) + assert(sketch1.depth() === 10) + assert(sketch1.width() === 20) + + val sketch = df.stat.countMinSketch("id", eps = 0.001, confidence = 0.99, seed = 42) + assert(sketch.totalCount() === 1000) + assert(sketch.relativeError() === 0.001) + assert(sketch.confidence() === 0.99 +- 5e-3) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DatasetSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DatasetSuite.scala new file mode 100644 index 0000000000000..e5738fe7acdc9 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/DatasetSuite.scala @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.util.concurrent.TimeUnit +import java.util.concurrent.atomic.AtomicLong + +import io.grpc.Server +import io.grpc.inprocess.{InProcessChannelBuilder, InProcessServerBuilder} +import java.util.Properties +import org.scalatest.BeforeAndAfterEach + +import org.apache.spark.connect.proto +import org.apache.spark.sql.connect.client.{DummySparkConnectService, SparkConnectClient} +import org.apache.spark.sql.connect.client.util.ConnectFunSuite +import org.apache.spark.sql.functions._ + +// Add sample tests. 
+// - sample fraction: simple.sample(0.1) +// - sample withReplacement_fraction: simple.sample(withReplacement = true, 0.11) +// Add tests for exceptions thrown +class DatasetSuite extends ConnectFunSuite with BeforeAndAfterEach { + + private var server: Server = _ + private var service: DummySparkConnectService = _ + private var ss: SparkSession = _ + + private def newSparkSession(): SparkSession = { + val client = new SparkConnectClient( + proto.UserContext.newBuilder().build(), + InProcessChannelBuilder.forName(getClass.getName).directExecutor(), + "test") + new SparkSession(client, cleaner = SparkSession.cleaner, planIdGenerator = new AtomicLong) + } + + private def startDummyServer(): Unit = { + service = new DummySparkConnectService() + server = InProcessServerBuilder + .forName(getClass.getName) + .addService(service) + .build() + server.start() + } + + override def beforeEach(): Unit = { + super.beforeEach() + startDummyServer() + ss = newSparkSession() + } + + override def afterEach(): Unit = { + if (server != null) { + server.shutdownNow() + assert(server.awaitTermination(5, TimeUnit.SECONDS), "server failed to shutdown") + } + } + + test("write") { + val df = ss.newDataFrame(_ => ()).limit(10) + + val builder = proto.WriteOperation.newBuilder() + builder + .setInput(df.plan.getRoot) + .setPath("my/test/path") + .setMode(proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS) + .setSource("parquet") + .addSortColumnNames("col1") + .addPartitioningColumns("col99") + .setBucketBy( + proto.WriteOperation.BucketBy + .newBuilder() + .setNumBuckets(2) + .addBucketColumnNames("col1") + .addBucketColumnNames("col2")) + + val expectedPlan = proto.Plan + .newBuilder() + .setCommand(proto.Command.newBuilder().setWriteOperation(builder)) + .build() + + df.write + .sortBy("col1") + .partitionBy("col99") + .bucketBy(2, "col1", "col2") + .parquet("my/test/path") + val actualPlan = service.getAndClearLatestInputPlan() + assert(actualPlan.equals(expectedPlan)) + } + + test("write jdbc") { + val df = ss.newDataFrame(_ => ()).limit(10) + + val builder = proto.WriteOperation.newBuilder() + builder + .setInput(df.plan.getRoot) + .setMode(proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS) + .setSource("jdbc") + .putOptions("a", "b") + .putOptions("1", "2") + .putOptions("url", "url") + .putOptions("dbtable", "table") + + val expectedPlan = proto.Plan + .newBuilder() + .setCommand(proto.Command.newBuilder().setWriteOperation(builder)) + .build() + + val connectionProperties = new Properties + connectionProperties.put("a", "b") + connectionProperties.put("1", "2") + df.write.jdbc("url", "table", connectionProperties) + + val actualPlan = service.getAndClearLatestInputPlan() + assert(actualPlan.equals(expectedPlan)) + } + + test("write V2") { + val df = ss.newDataFrame(_ => ()).limit(10) + + val builder = proto.WriteOperationV2.newBuilder() + builder + .setInput(df.plan.getRoot) + .setTableName("t1") + .addPartitioningColumns(col("col99").expr) + .setProvider("json") + .putTableProperties("key", "value") + .putOptions("key2", "value2") + .setMode(proto.WriteOperationV2.Mode.MODE_CREATE_OR_REPLACE) + + val expectedPlan = proto.Plan + .newBuilder() + .setCommand(proto.Command.newBuilder().setWriteOperationV2(builder)) + .build() + + df.writeTo("t1") + .partitionedBy(col("col99")) + .using("json") + .tableProperty("key", "value") + .options(Map("key2" -> "value2")) + .createOrReplace() + val actualPlan = service.getAndClearLatestInputPlan() + assert(actualPlan.equals(expectedPlan)) + } + + 
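A possible shape for the sample coverage requested in the TODO at the top of this suite is sketched below. It is only a sketch: it inspects the client-side plan rather than the server round trip, and the Sample accessors used (hasSample, getWithReplacement, getUpperBound) are assumed from the generated proto classes, not taken from this patch.

  // Sketch only: checks that Dataset.sample produces a Sample relation in the client-side plan.
  test("sample produces a Sample relation") {
    val df = ss.newDataFrame(_ => ())
    val sampled = df.sample(withReplacement = true, 0.11, 123L)
    val rel = sampled.plan.getRoot
    assert(rel.hasSample) // the root of the plan should be a Sample node (assumed accessor)
    assert(rel.getSample.getWithReplacement) // flag propagated from the API call (assumed accessor)
    assert(rel.getSample.getUpperBound === 0.11) // fraction maps to the upper bound (assumed accessor)
  }
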
test("Pivot") { + val df = ss.newDataFrame(_ => ()) + intercept[IllegalArgumentException] { + df.groupBy().pivot(Column("c"), Seq(Column("col"))) + } + } + + test("command extension") { + val extension = proto.ExamplePluginCommand.newBuilder().setCustomField("abc").build() + val command = proto.Command + .newBuilder() + .setExtension(com.google.protobuf.Any.pack(extension)) + .build() + val expectedPlan = proto.Plan.newBuilder().setCommand(command).build() + ss.execute(com.google.protobuf.Any.pack(extension)) + val actualPlan = service.getAndClearLatestInputPlan() + assert(actualPlan.equals(expectedPlan)) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala new file mode 100644 index 0000000000000..9e02eb1307896 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/FunctionTestSuite.scala @@ -0,0 +1,257 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +import java.util.Collections + +import org.apache.spark.sql.connect.client.util.ConnectFunSuite +import org.apache.spark.sql.functions._ +import org.apache.spark.sql.types.{DataType, StructType} + +/** + * Tests for client local function behavior. + * + * This mostly tests is various function variants produce the same columns. 
+ */ +class FunctionTestSuite extends ConnectFunSuite { + private def testEquals(name: String, columns: Column*): Unit = { + test(name) { + assert(columns.nonEmpty) + val unique = columns.distinct + assert(unique.size == 1) + } + } + + private val a = col("a") + private val b = col("b") + private val c = col("c") + + private val schema = new StructType() + .add("key", "long") + .add("value", "string") + + testEquals("col/column", a, column("a")) + testEquals("asc/asc_nulls_first", asc("a"), asc_nulls_first("a")) + testEquals("desc/desc_nulls_last", desc("a"), desc_nulls_last("a")) + testEquals( + "approx_count_distinct", + approxCountDistinct(a), + approxCountDistinct("a"), + approx_count_distinct("a"), + approx_count_distinct(a)) + testEquals( + "approx_count_distinct rsd", + approxCountDistinct(a, 0.1), + approxCountDistinct("a", 0.1), + approx_count_distinct("a", 0.1), + approx_count_distinct(a, 0.1)) + testEquals("avg/mean", avg("a"), avg(a), mean(a), mean("a")) + testEquals("collect_list", collect_list("a"), collect_list(a)) + testEquals("collect_set", collect_set("a"), collect_set(a)) + testEquals("corr", corr("a", "b"), corr(a, b)) + testEquals( + "count_distinct", + countDistinct(a, b, c), + countDistinct("a", "b", "c"), + count_distinct(a, b, c)) + testEquals("covar_pop", covar_pop(a, b), covar_pop("a", "b")) + testEquals("covar_samp", covar_samp(a, b), covar_samp("a", "b")) + testEquals( + "first", + first("a"), + first(a), + first("a", ignoreNulls = false), + first(a, ignoreNulls = false)) + testEquals("grouping", grouping("a"), grouping(a)) + testEquals("grouping_id", grouping_id("a", "b"), grouping_id(a, b)) + testEquals("kurtosis", kurtosis("a"), kurtosis(a)) + testEquals( + "last", + last("a"), + last(a), + last("a", ignoreNulls = false), + last(a, ignoreNulls = false)) + testEquals("max", max("a"), max(a)) + testEquals("min", min("a"), min(a)) + testEquals("skewness", skewness("a"), skewness(a)) + testEquals("stddev", stddev("a"), stddev(a)) + testEquals("stddev_samp", stddev_samp("a"), stddev_samp(a)) + testEquals("stddev_pop", stddev_pop("a"), stddev_pop(a)) + testEquals("sum", sum("a"), sum(a)) + testEquals("sum_distinct", sumDistinct("a"), sumDistinct(a), sum_distinct(a)) + testEquals("variance", variance("a"), variance(a)) + testEquals("var_samp", var_samp("a"), var_samp(a)) + testEquals("var_pop", var_pop("a"), var_pop(a)) + testEquals("array", array(a, b, c), array("a", "b", "c")) + testEquals( + "monotonicallyIncreasingId", + monotonicallyIncreasingId(), + monotonically_increasing_id()) + testEquals("sqrt", sqrt("a"), sqrt(a)) + testEquals("struct", struct(a, c, b), struct("a", "c", "b")) + testEquals("bitwise_not", bitwiseNOT(a), bitwise_not(a)) + testEquals("acos", acos("a"), acos(a)) + testEquals("acosh", acosh("a"), acosh(a)) + testEquals("asin", asin("a"), asin(a)) + testEquals("asinh", asinh("a"), asinh(a)) + testEquals("atan", atan("a"), atan(a)) + testEquals("atan2", atan2(a, b), atan2(a, "b"), atan2("a", b), atan2("a", "b")) + testEquals("atanh", atanh("a"), atanh(a)) + testEquals("bin", bin("a"), bin(a)) + testEquals("cbrt", cbrt("a"), cbrt(a)) + testEquals("ceil", ceil(a), ceil("a")) + testEquals("cos", cos("a"), cos(a)) + testEquals("cosh", cosh("a"), cosh(a)) + testEquals("exp", exp("a"), exp(a)) + testEquals("expm1", expm1("a"), expm1(a)) + testEquals("floor", floor(a), floor("a")) + testEquals("greatest", greatest(a, b, c), greatest("a", "b", "c")) + testEquals("hypot", hypot(a, b), hypot("a", b), hypot(a, "b"), hypot("a", "b")) + testEquals( + "hypot 
right fixed", + hypot(lit(3d), a), + hypot(lit(3d), "a"), + hypot(3d, a), + hypot(3d, "a")) + testEquals( + "hypot left fixed", + hypot(a, lit(4d)), + hypot(a, 4d), + hypot("a", lit(4d)), + hypot("a", 4d)) + testEquals("least", least(a, b, c), least("a", "b", "c")) + testEquals("log", log("a"), log(a)) + testEquals("log base", log(2.0, "a"), log(2.0, a)) + testEquals("log10", log10("a"), log10(a)) + testEquals("log1p", log1p("a"), log1p(a)) + testEquals("log2", log2("a"), log2(a)) + testEquals("pow", pow(a, b), pow(a, "b"), pow("a", b), pow("a", "b")) + testEquals("pow left fixed", pow(lit(7d), b), pow(lit(7d), "b"), pow(7d, b), pow(7d, "b")) + testEquals("pow right fixed", pow(a, lit(9d)), pow(a, 9d), pow("a", lit(9d)), pow("a", 9d)) + testEquals("rint", rint(a), rint("a")) + testEquals("round", round(a), round(a, 0)) + testEquals("bround", bround(a), bround(a, 0)) + testEquals("shiftleft", shiftLeft(a, 2), shiftleft(a, 2)) + testEquals("shiftright", shiftRight(a, 3), shiftright(a, 3)) + testEquals("shiftrightunsigned", shiftRightUnsigned(a, 3), shiftrightunsigned(a, 3)) + testEquals("signum", signum("a"), signum(a)) + testEquals("sin", sin("a"), sin(a)) + testEquals("sinh", sinh("a"), sinh(a)) + testEquals("tan", tan("a"), tan(a)) + testEquals("tanh", tanh("a"), tanh(a)) + testEquals("degrees", toDegrees(a), toDegrees("a"), degrees(a), degrees("a")) + testEquals("radians", toRadians(a), toRadians("a"), radians(a), radians("a")) + testEquals( + "regexp_replace", + regexp_replace(a, lit("foo"), lit("bar")), + regexp_replace(a, "foo", "bar")) + testEquals("add_months", add_months(a, lit(1)), add_months(a, 1)) + testEquals("date_add", date_add(a, lit(2)), date_add(a, 2)) + testEquals("date_sub", date_sub(a, lit(2)), date_sub(a, 2)) + testEquals("next_day", next_day(a, lit("Mon")), next_day(a, lit("Mon"))) + testEquals("unix_timestamp", unix_timestamp(), unix_timestamp(current_timestamp())) + testEquals( + "from_utc_timestamp", + from_utc_timestamp(a, "GMT"), + from_utc_timestamp(a, lit("GMT"))) + testEquals("to_utc_timestamp", to_utc_timestamp(a, "GMT"), to_utc_timestamp(a, lit("GMT"))) + testEquals( + "window", + window(a, "10 seconds", "10 seconds", "0 second"), + window(a, "10 seconds", "10 seconds"), + window(a, "10 seconds")) + testEquals("session_window", session_window(a, "1 second"), session_window(a, lit("1 second"))) + testEquals("slice", slice(a, 1, 2), slice(a, lit(1), lit(2))) + testEquals("bucket", bucket(lit(3), a), bucket(3, a)) + testEquals( + "lag", + lag(a, 1), + lag("a", 1), + lag(a, 1, null), + lag("a", 1, null), + lag(a, 1, null, false)) + testEquals( + "lead", + lead(a, 2), + lead("a", 2), + lead(a, 2, null), + lead("a", 2, null), + lead(a, 2, null, false)) + testEquals( + "aggregate", + aggregate(a, lit(0), (l, r) => l + r), + aggregate(a, lit(0), (l, r) => l + r, id => id)) + testEquals( + "from_json", + from_json(a, schema.asInstanceOf[DataType]), + from_json(a, schema), + from_json(a, lit(schema.json)), + from_json(a, schema.json, Map.empty[String, String]), + from_json(a, schema.json, Collections.emptyMap[String, String]), + from_json(a, schema.asInstanceOf[DataType], Map.empty[String, String]), + from_json(a, schema.asInstanceOf[DataType], Collections.emptyMap[String, String]), + from_json(a, schema, Map.empty[String, String]), + from_json(a, schema, Collections.emptyMap[String, String]), + from_json(a, lit(schema.json), Collections.emptyMap[String, String])) + testEquals("schema_of_json", schema_of_json(lit("x,y")), schema_of_json("x,y")) + testEquals( + 
"to_json", + to_json(a), + to_json(a, Collections.emptyMap[String, String]), + to_json(a, Map.empty[String, String])) + testEquals("sort_array", sort_array(a), sort_array(a, asc = true)) + testEquals("sequence", sequence(lit(1), lit(10)), sequence(lit(1), lit(10), lit(1L))) + testEquals( + "from_csv", + from_csv(a, lit(schema.toDDL), Collections.emptyMap[String, String]), + from_csv(a, schema, Map.empty[String, String])) + testEquals( + "schema_of_csv", + schema_of_csv(lit("x,y")), + schema_of_csv("x,y"), + schema_of_csv(lit("x,y"), Collections.emptyMap())) + testEquals("to_csv", to_csv(a), to_csv(a, Collections.emptyMap[String, String])) + + test("assert_true no message") { + val e = assert_true(a).expr + assert(e.hasUnresolvedFunction) + val fn = e.getUnresolvedFunction + assert(fn.getFunctionName == "assert_true") + assert(fn.getArgumentsCount == 1) + assert(fn.getArguments(0) == a.expr) + } + + test("json_tuple zero args") { + intercept[IllegalArgumentException](json_tuple(a)) + } + + test("rand no seed") { + val e = rand().expr + assert(e.hasUnresolvedFunction) + val fn = e.getUnresolvedFunction + assert(fn.getFunctionName == "rand") + assert(fn.getArgumentsCount == 0) + } + + test("randn no seed") { + val e = randn().expr + assert(e.hasUnresolvedFunction) + val fn = e.getUnresolvedFunction + assert(fn.getFunctionName == "randn") + assert(fn.getArgumentsCount == 0) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala new file mode 100644 index 0000000000000..a57c6b390124e --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/PlanGenerationTestSuite.scala @@ -0,0 +1,2131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql + +import java.nio.file.{Files, Path} +import java.util.{Collections, Properties} +import java.util.concurrent.atomic.AtomicLong + +import scala.collection.mutable +import scala.util.{Failure, Success, Try} + +import com.google.protobuf.util.JsonFormat +import com.google.protobuf.util.JsonFormat.TypeRegistry +import io.grpc.inprocess.InProcessChannelBuilder +import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach} + +import org.apache.spark.connect.proto +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{functions => fn} +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.StringEncoder +import org.apache.spark.sql.connect.client.SparkConnectClient +import org.apache.spark.sql.connect.client.util.ConnectFunSuite +import org.apache.spark.sql.expressions.Window +import org.apache.spark.sql.functions.lit +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval + +// scalastyle:off +/** + * Test the plans generated by the client. This serves two purposes: + * + * 1. Make sure the generated plan matches our expectations. The generated JSON file can be used + * for this during review. + * 1. Make sure the generated plans are stable. Changes to the generated plans should be rare. + * The generated plan is compared to the (previously) generated proto file; the test fails + * when they are different. + * + * If you need to re-generate the golden files, you need to set the SPARK_GENERATE_GOLDEN_FILES=1 + * environment variable before running this test, e.g.: + * {{{ + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "connect-client-jvm/testOnly org.apache.spark.sql.PlanGenerationTestSuite" + * }}} + * + * Note that the plan protos are used as the input for the `ProtoToParsedPlanTestSuite` in the + * `connector/connect/server` module + */ +// scalastyle:on +class PlanGenerationTestSuite + extends ConnectFunSuite + with BeforeAndAfterAll + with BeforeAndAfterEach + with Logging { + + // Borrowed from SparkFunSuite + private val regenerateGoldenFiles: Boolean = System.getenv("SPARK_GENERATE_GOLDEN_FILES") == "1" + + protected val queryFilePath: Path = commonResourcePath.resolve("queries") + + // A relative path to /connector/connect/server, used by `ProtoToParsedPlanTestSuite` to run + // with the datasource. 
+ protected val testDataPath: Path = java.nio.file.Paths.get( + "../", + "common", + "src", + "test", + "resources", + "query-tests", + "test-data") + + private val registry = TypeRegistry + .newBuilder() + .add(proto.ExamplePluginRelation.getDescriptor) + .add(proto.ExamplePluginExpression.getDescriptor) + .add(proto.ExamplePluginCommand.getDescriptor) + .build() + + private val printer = JsonFormat.printer().usingTypeRegistry(registry) + + private var session: SparkSession = _ + + override protected def beforeAll(): Unit = { + super.beforeAll() + val client = SparkConnectClient( + proto.UserContext.newBuilder().build(), + InProcessChannelBuilder.forName("/dev/null")) + session = + new SparkSession(client, cleaner = SparkSession.cleaner, planIdGenerator = new AtomicLong) + } + + override protected def beforeEach(): Unit = { + session.resetPlanIdGenerator() + } + + override protected def afterAll(): Unit = { + session.close() + super.afterAll() + } + + private def test(name: String)(f: => Dataset[_]): Unit = super.test(name) { + val actual = f.plan.getRoot + val goldenFile = queryFilePath.resolve(name.replace(' ', '_') + ".proto.bin") + Try(readRelation(goldenFile)) match { + case Success(expected) if expected == actual => + // Ok! + case Success(_) if regenerateGoldenFiles => + logInfo("Rewriting Golden File") + writeGoldenFile(goldenFile, actual) + case Success(expected) => + fail(s""" + |Expected and actual plans do not match: + | + |=== Expected Plan === + |$expected + | + |=== Actual Plan === + |$actual + |""".stripMargin) + case Failure(_) if regenerateGoldenFiles => + logInfo("Writing Golden File") + writeGoldenFile(goldenFile, actual) + case Failure(_) => + fail( + "No golden file found. Please re-run this test with the " + + "SPARK_GENERATE_GOLDEN_FILES=1 environment variable set") + } + } + + private def readRelation(path: Path): proto.Relation = { + val input = Files.newInputStream(path) + try proto.Relation.parseFrom(input) + finally { + input.close() + } + } + + private def writeGoldenFile(path: Path, relation: proto.Relation): Unit = { + val output = Files.newOutputStream(path) + try relation.writeTo(output) + finally { + output.close() + } + // Write the json file for verification. 
+ val jsonPath = + path.getParent.resolve(path.getFileName.toString.stripSuffix(".proto.bin") + ".json") + val writer = Files.newBufferedWriter(jsonPath) + try writer.write(printer.print(relation)) + finally { + writer.close() + } + } + + private val urlWithUserAndPass = "jdbc:h2:mem:testdb0;user=testUser;password=testPass" + + private val simpleSchema = new StructType() + .add("id", "long") + .add("a", "int") + .add("b", "double") + + private val simpleSchemaString = simpleSchema.catalogString + + private val otherSchema = new StructType() + .add("a", "int") + .add("id", "long") + .add("payload", "binary") + + private val otherSchemaString = otherSchema.catalogString + + private val complexSchema = simpleSchema + .add("d", simpleSchema) + .add("e", "array") + .add("f", MapType(StringType, simpleSchema)) + .add("g", "string") + + private val complexSchemaString = complexSchema.catalogString + + private val binarySchema = new StructType() + .add("id", "long") + .add("bytes", "binary") + + private val binarySchemaString = binarySchema.catalogString + + private val temporalsSchema = new StructType() + .add("d", "date") + .add("t", "timestamp") + .add("s", "string") + .add("x", "bigint") + .add( + "wt", + new StructType() + .add("start", "timestamp") + .add("end", "timestamp")) + + private val temporalsSchemaString = temporalsSchema.catalogString + + private def createLocalRelation(schema: String): DataFrame = session.newDataFrame { builder => + // TODO API is not consistent. Now we have two different ways of working with schemas! + builder.getLocalRelationBuilder.setSchema(schema) + } + + // A few helper dataframes. + private def simple: DataFrame = createLocalRelation(simpleSchemaString) + private def left: DataFrame = simple + private def right: DataFrame = createLocalRelation(otherSchemaString) + private def complex = createLocalRelation(complexSchemaString) + private def binary = createLocalRelation(binarySchemaString) + private def temporals = createLocalRelation(temporalsSchemaString) + + /* Spark Session API */ + test("range") { + session.range(1, 10, 1, 2) + } + + test("read") { + session.read + .format("csv") + .schema( + StructType( + StructField("name", StringType) :: + StructField("age", IntegerType) :: + StructField("job", StringType) :: Nil)) + .option("header", "true") + .options(Map("delimiter" -> ";")) + .load(testDataPath.resolve("people.csv").toString) + } + + test("read jdbc") { + session.read.jdbc(urlWithUserAndPass, "TEST.TIMETYPES", new Properties()) + } + + test("read jdbc with partition") { + session.read.jdbc(urlWithUserAndPass, "TEST.EMP", "THEID", 0, 4, 3, new Properties()) + } + + test("read jdbc with predicates") { + val parts = Array[String]("THEID < 2", "THEID >= 2") + session.read.jdbc(urlWithUserAndPass, "TEST.PEOPLE", parts, new Properties()) + } + + test("read json") { + session.read.json(testDataPath.resolve("people.json").toString) + } + + test("json from dataset") { + session.read + .schema(new StructType().add("c1", StringType).add("c2", IntegerType)) + .option("allowSingleQuotes", "true") + .json(session.emptyDataset(StringEncoder)) + } + + test("toJSON") { + complex.toJSON + } + + test("read csv") { + session.read.csv(testDataPath.resolve("people.csv").toString) + } + + test("csv from dataset") { + session.read + .schema(new StructType().add("c1", StringType).add("c2", IntegerType)) + .option("header", "true") + .csv(session.emptyDataset(StringEncoder)) + } + + test("read parquet") { + 
session.read.parquet(testDataPath.resolve("users.parquet").toString) + } + + test("read orc") { + session.read.orc(testDataPath.resolve("users.orc").toString) + } + + test("read table") { + session.read.table("myTable") + } + + test("table") { + session.table("myTable") + } + + test("read text") { + session.read.text(testDataPath.resolve("people.txt").toString) + } + + /* Dataset API */ + test("select") { + simple.select(fn.col("id")) + } + + test("select typed 1-arg") { + val encoder = ScalaReflection.encoderFor[(Long, Int)] + simple.select(fn.struct(fn.col("id"), fn.col("a")).as(encoder)) + } + + test("limit") { + simple.limit(10) + } + + test("filter") { + simple.filter(fn.col("id") === fn.lit(10L)) + } + + test("toDF") { + simple.toDF("x1", "x2", "x3") + } + + test("to") { + simple.to( + new StructType() + .add("b", "double") + .add("id", "int")) + } + + test("join inner_no_condition") { + left.join(right) + } + + test("join inner_using_single_col") { + left.join(right, "id") + } + + test("join inner_using_multiple_col_array") { + left.join(right, Array("id", "a")) + } + + test("join inner_using_multiple_col_seq") { + left.join(right, Seq("id", "a")) + } + + test("join using_single_col") { + left.join(right, "id", "left_semi") + } + + test("join using_multiple_col_array") { + left.join(right, Array("id", "a"), "full_outer") + } + + test("join using_multiple_col_seq") { + left.join(right, Seq("id", "a"), "right_outer") + } + + test("join inner_condition") { + left.alias("l").join(right.alias("r"), fn.col("l.a") === fn.col("r.a")) + } + + test("join condition") { + left.as("l").join(right.as("r"), fn.col("l.id") === fn.col("r.id"), "left_anti") + } + + test("crossJoin") { + left.crossJoin(right) + } + + test("sortWithinPartitions strings") { + simple.sortWithinPartitions("a", "id") + } + + test("sortWithinPartitions columns") { + simple.sortWithinPartitions(fn.col("id"), fn.col("b")) + } + + test("sort strings") { + simple.sort("b", "a") + } + + test("sort columns") { + simple.sort(fn.col("id"), fn.col("b")) + } + + test("orderBy strings") { + simple.sort("b", "id", "a") + } + + test("orderBy columns") { + simple.sort(fn.col("id"), fn.col("b"), fn.col("a")) + } + + test("apply") { + val stable = simple + stable.select(stable("a")) + } + + test("hint") { + simple.hint("coalesce", 100) + } + + test("col") { + val stable = simple + stable.select(stable.col("id"), stable.col("b")) + } + + test("colRegex") { + simple.select(simple.colRegex("`a|id`")) + } + + test("as string") { + simple.as("foo") + } + + test("as symbol") { + simple.as('bar) + } + test("alias string") { + simple.alias("fooz") + } + + test("alias symbol") { + simple.alias("bob") + } + + test("select strings") { + simple.select("id", "a") + } + + test("selectExpr") { + simple.selectExpr("a + 10 as x", "id % 10 as grp") + } + + test("filter expr") { + simple.filter("exp(a) < 10.0") + } + + test("where column") { + simple.where(fn.col("id") === fn.lit(1L)) + } + + test("where expr") { + simple.where("a + id < 1000") + } + + test("unpivot values") { + simple.unpivot( + ids = Array(fn.col("id"), fn.col("a")), + values = Array(fn.col("b")), + variableColumnName = "name", + valueColumnName = "value") + } + + test("unpivot no_values") { + simple.unpivot( + ids = Array(fn.col("id")), + variableColumnName = "name", + valueColumnName = "value") + } + + test("melt values") { + simple.unpivot( + ids = Array(fn.col("a")), + values = Array(fn.col("id")), + variableColumnName = "name", + valueColumnName = "value") + } + + test("melt 
no_values") { + simple.melt( + ids = Array(fn.col("id"), fn.col("a")), + variableColumnName = "name", + valueColumnName = "value") + } + + test("offset") { + simple.offset(1000) + } + + test("union") { + simple.union(simple) + } + + test("unionAll") { + simple.union(simple) + } + + test("unionByName") { + simple.drop("b").unionByName(right.drop("payload")) + } + + test("unionByName allowMissingColumns") { + simple.unionByName(right, allowMissingColumns = true) + } + + test("intersect") { + simple.intersect(simple) + } + + test("intersectAll") { + simple.intersectAll(simple) + } + + test("except") { + simple.except(simple) + } + + test("exceptAll") { + simple.exceptAll(simple) + } + + test("sample fraction_seed") { + simple.sample(0.43, 9890823L) + } + + test("sample withReplacement_fraction_seed") { + simple.sample(withReplacement = true, 0.23, 898L) + } + + test("withColumn single") { + simple.withColumn("z", fn.expr("a + 100")) + } + + test("withColumns scala_map") { + simple.withColumns(Map(("b", fn.lit("redacted")), ("z", fn.expr("a + 100")))) + } + + test("withColumns java_map") { + val map = new java.util.HashMap[String, Column] + map.put("g", fn.col("id")) + map.put("a", fn.lit("123")) + simple.withColumns(map) + } + + test("withColumnRenamed single") { + simple.withColumnRenamed("id", "nid") + } + + test("withColumnRenamed scala_map") { + simple.withColumnsRenamed(Map(("a", "alpha"), ("b", "beta"))) + } + + test("withColumnRenamed java_map") { + val map = new java.util.HashMap[String, String] + map.put("id", "nid") + map.put("b", "bravo") + simple.withColumnsRenamed(map) + } + + test("withMetadata") { + val builder = new MetadataBuilder + builder.putString("description", "unique identifier") + simple.withMetadata("id", builder.build()) + } + + test("drop single string") { + simple.drop("a") + } + + test("drop multiple strings") { + simple.drop("id", "a", "b") + } + + test("drop single column") { + simple.drop(fn.col("b")) + } + + test("drop multiple column") { + simple.drop(fn.col("b"), fn.col("id")) + } + + test("dropDuplicates") { + simple.dropDuplicates() + } + + test("dropDuplicates names seq") { + simple.dropDuplicates("a" :: "b" :: Nil) + } + + test("dropDuplicates names array") { + simple.dropDuplicates(Array("a", "id")) + } + + test("dropDuplicates varargs") { + simple.dropDuplicates("a", "b", "id") + } + + test("describe") { + simple.describe("id", "b") + } + + test("summary") { + simple.summary("mean", "min") + } + + test("repartition") { + simple.repartition(24) + } + + test("repartition num_partitions_expressions") { + simple.repartition(22, fn.col("a"), fn.col("id")) + } + + test("repartition expressions") { + simple.repartition(fn.col("id"), fn.col("b")) + } + + test("repartitionByRange num_partitions_expressions") { + simple.repartitionByRange(33, fn.col("b"), fn.col("id").desc_nulls_first) + } + + test("repartitionByRange expressions") { + simple.repartitionByRange(fn.col("a").asc, fn.col("id").desc_nulls_first) + } + + test("coalesce") { + simple.coalesce(5) + } + + test("distinct") { + simple.distinct() + } + + /* Column API */ + private def columnTest(name: String)(f: => Column): Unit = { + test("column " + name) { + complex.select(f) + } + } + + private def orderColumnTest(name: String)(f: => Column): Unit = { + test("column " + name) { + complex.orderBy(f) + } + } + + columnTest("apply") { + fn.col("f").apply("super_duper_key") + } + + columnTest("unary minus") { + -fn.lit(1) + } + + columnTest("not") { + !fn.lit(true) + } + + columnTest("equals") { + 
fn.col("a") === fn.col("b") + } + + columnTest("not equals") { + fn.col("a") =!= fn.col("b") + } + + columnTest("gt") { + fn.col("a") > fn.col("b") + } + + columnTest("lt") { + fn.col("a") < fn.col("b") + } + + columnTest("geq") { + fn.col("a") >= fn.col("b") + } + + columnTest("leq") { + fn.col("a") <= fn.col("b") + } + + columnTest("eqNullSafe") { + fn.col("a") <=> fn.col("b") + } + + columnTest("when otherwise") { + val a = fn.col("a") + fn.when(a < 10, "low").when(a < 20, "medium").otherwise("high") + } + + columnTest("between") { + fn.col("a").between(10, 20) + } + + columnTest("isNaN") { + fn.col("b").isNaN + } + + columnTest("isNull") { + fn.col("g").isNull + } + + columnTest("isNotNull") { + fn.col("g").isNotNull + } + + columnTest("and") { + fn.col("a") > 10 && fn.col("b") < 0.5d + } + + columnTest("or") { + fn.col("a") > 10 || fn.col("b") < 0.5d + } + + columnTest("add") { + fn.col("a") + fn.col("b") + } + + columnTest("subtract") { + fn.col("a") - fn.col("b") + } + + columnTest("multiply") { + fn.col("a") * fn.col("b") + } + + columnTest("divide") { + fn.col("a") / fn.col("b") + } + + columnTest("modulo") { + fn.col("a") % 10 + } + + columnTest("isin") { + fn.col("g").isin("hello", "world", "foo") + } + + columnTest("like") { + fn.col("g").like("%bob%") + } + + columnTest("rlike") { + fn.col("g").like("^[0-9]*$") + } + + columnTest("ilike") { + fn.col("g").like("%fOb%") + } + + columnTest("getItem") { + fn.col("e").getItem(3) + } + + columnTest("withField") { + fn.col("d").withField("x", fn.lit("xq")) + } + + columnTest("dropFields") { + fn.col("d").dropFields("a", "c") + } + + columnTest("getField") { + fn.col("d").getItem("b") + } + + columnTest("substr") { + fn.col("g").substr(8, 3) + } + + columnTest("contains") { + fn.col("g").contains("baz") + } + + columnTest("startsWith") { + fn.col("g").startsWith("prefix_") + } + + columnTest("endsWith") { + fn.col("g").endsWith("suffix_") + } + + columnTest("alias") { + fn.col("a").name("b") + } + + columnTest("as multi") { + fn.expr("inline(map_values(f))").as(Array("v1", "v2", "v3")) + } + + columnTest("as with metadata") { + val builder = new MetadataBuilder + builder.putString("comment", "modified E field") + fn.col("e").as("e_mod", builder.build()) + } + + columnTest("cast") { + fn.col("a").cast("long") + } + + orderColumnTest("desc") { + fn.col("b").desc + } + + orderColumnTest("desc_nulls_first") { + fn.col("b").desc_nulls_first + } + + orderColumnTest("desc_nulls_last") { + fn.col("b").desc_nulls_last + } + + orderColumnTest("asc") { + fn.col("a").asc + } + + orderColumnTest("asc_nulls_first") { + fn.col("a").asc_nulls_first + } + + orderColumnTest("asc_nulls_last") { + fn.col("a").asc_nulls_last + } + + columnTest("bitwiseOR") { + fn.col("a").bitwiseOR(7) + } + + columnTest("bitwiseAND") { + fn.col("a").bitwiseAND(255) + } + + columnTest("bitwiseXOR") { + fn.col("a").bitwiseXOR(78) + } + + columnTest("star") { + fn.col("*") + } + + columnTest("star with target") { + fn.col("d.*") + } + + /* Function API */ + private def functionTest(name: String)(f: => Column): Unit = { + test("function " + name) { + complex.select(f) + } + } + + functionTest("col") { + fn.col("id") + } + + functionTest("asc") { + fn.asc("a") + } + + functionTest("asc_nulls_first") { + fn.asc_nulls_first("a") + } + + functionTest("asc_nulls_last") { + fn.asc_nulls_last("a") + } + + functionTest("desc") { + fn.desc("a") + } + + functionTest("desc_nulls_first") { + fn.desc_nulls_first("a") + } + + functionTest("desc_nulls_last") { + fn.desc_nulls_last("a") + } 
+ + functionTest("approx_count_distinct") { + fn.approx_count_distinct("a") + } + + functionTest("approx_count_distinct rsd") { + fn.approx_count_distinct("a", 0.1) + } + + functionTest("avg") { + fn.avg("a") + } + + functionTest("collect_list") { + fn.collect_list("a") + } + + functionTest("collect_set") { + fn.collect_set("a") + } + + functionTest("corr") { + fn.corr("a", "b") + } + + functionTest("count") { + fn.count(fn.col("a")) + } + + test("function count typed") { + simple.select(fn.count("a")) + } + + functionTest("countDistinct") { + fn.countDistinct("a", "g") + } + + functionTest("covar_pop") { + fn.covar_pop("a", "b") + } + + functionTest("covar_samp") { + fn.covar_samp("a", "b") + } + + functionTest("first") { + fn.first("a", ignoreNulls = true) + } + + functionTest("kurtosis") { + fn.kurtosis("a") + } + + functionTest("last") { + fn.last("a", ignoreNulls = false) + } + + functionTest("mode") { + fn.mode(fn.col("a")) + } + + test("function max") { + simple.select(fn.max("id")) + } + + functionTest("max_by") { + fn.max_by(fn.col("a"), fn.col("b")) + } + + functionTest("median") { + fn.median(fn.col("a")) + } + + functionTest("min") { + fn.min("a") + } + + functionTest("min_by") { + fn.min_by(fn.col("a"), fn.col("b")) + } + + functionTest("percentile_approx") { + fn.percentile_approx(fn.col("a"), fn.lit(0.3), fn.lit(20)) + } + + functionTest("product") { + fn.product(fn.col("a")) + } + + functionTest("skewness") { + fn.skewness("a") + } + + functionTest("stddev") { + fn.stddev("a") + } + + functionTest("stddev_samp") { + fn.stddev_samp("a") + } + + functionTest("stddev_pop") { + fn.stddev_pop("a") + } + + functionTest("sum") { + fn.sum("a") + } + + functionTest("sum_distinct") { + fn.sum_distinct(fn.col("a")) + } + + functionTest("variance") { + fn.variance("a") + } + + functionTest("var_samp") { + fn.var_samp("a") + } + + functionTest("var_pop") { + fn.var_pop("a") + } + + functionTest("array") { + fn.array("a", "a") + } + + functionTest("map") { + fn.map(fn.col("a"), fn.col("g"), lit(22), lit("dummy")) + } + + functionTest("map_from_arrays") { + fn.map_from_arrays(fn.array(lit(1), lit(2)), fn.array(lit("one"), lit("two"))) + } + + functionTest("coalesce") { + fn.coalesce(fn.col("a"), lit(3)) + } + + functionTest("input_file_name") { + fn.input_file_name() + } + + functionTest("isnan") { + fn.isnan(fn.col("b")) + } + + functionTest("isnull") { + fn.isnull(fn.col("a")) + } + + functionTest("monotonically_increasing_id") { + fn.monotonically_increasing_id() + } + + functionTest("nanvl") { + fn.nanvl(lit(Double.NaN), fn.col("a")) + } + + functionTest("negate") { + fn.negate(fn.col("a")) + } + + functionTest("rand with seed") { + fn.rand(133) + } + + functionTest("randn with seed") { + fn.randn(133) + } + + functionTest("spark_partition_id") { + fn.spark_partition_id() + } + + functionTest("sqrt") { + fn.sqrt("b") + } + + functionTest("struct") { + fn.struct("a", "d") + } + + functionTest("bitwise_not") { + fn.bitwise_not(fn.col("a")) + } + + functionTest("expr") { + fn.expr("a + 1") + } + + functionTest("abs") { + fn.abs(fn.col("a")) + } + + functionTest("acos") { + fn.acos("b") + } + + functionTest("acosh") { + fn.acosh("b") + } + + functionTest("asin") { + fn.asin("b") + } + + functionTest("asinh") { + fn.asinh("b") + } + + functionTest("atan") { + fn.atan("b") + } + + functionTest("atan2") { + fn.atan2(fn.col("a").cast("double"), "b") + } + + functionTest("atanh") { + fn.atanh("b") + } + + functionTest("bin") { + fn.bin("b") + } + + functionTest("ceil") { + fn.ceil("b") + } + + 
functionTest("ceil scale") { + fn.ceil(fn.col("b"), lit(2)) + } + + functionTest("conv") { + fn.conv(fn.col("b"), 10, 16) + } + + functionTest("cos") { + fn.cos("b") + } + + functionTest("cosh") { + fn.cosh("b") + } + + functionTest("cot") { + fn.cot(fn.col("b")) + } + + functionTest("csc") { + fn.csc(fn.col("b")) + } + + functionTest("exp") { + fn.exp("b") + } + + functionTest("expm1") { + fn.expm1("b") + } + + functionTest("factorial") { + fn.factorial(fn.col("a") % 10) + } + + functionTest("floor") { + fn.floor("b") + } + + functionTest("floor scale") { + fn.floor(fn.col("b"), lit(2)) + } + + functionTest("greatest") { + fn.greatest(fn.col("a"), fn.col("d").getItem("a")) + } + + functionTest("hex") { + fn.hex(fn.col("a")) + } + + functionTest("unhex") { + fn.unhex(fn.col("a")) + } + + functionTest("hypot") { + fn.hypot(fn.col("a"), fn.col("b")) + } + + functionTest("least") { + fn.least(fn.col("a"), fn.col("d").getItem("a")) + } + + functionTest("log") { + fn.log("b") + } + + functionTest("log with base") { + fn.log(2, "b") + } + + functionTest("log10") { + fn.log10("b") + } + + functionTest("log1p") { + fn.log1p("a") + } + + functionTest("log2") { + fn.log2("a") + } + + functionTest("pow") { + fn.pow("a", "b") + } + + functionTest("pmod") { + fn.pmod(fn.col("a"), fn.lit(10)) + } + + functionTest("rint") { + fn.rint("b") + } + + functionTest("round") { + fn.round(fn.col("b"), 2) + } + + functionTest("bround") { + fn.round(fn.col("b"), 2) + } + + functionTest("sec") { + fn.sec(fn.col("b")) + } + + functionTest("shiftleft") { + fn.shiftleft(fn.col("b"), 2) + } + + functionTest("shiftright") { + fn.shiftright(fn.col("b"), 2) + } + + functionTest("shiftrightunsigned") { + fn.shiftrightunsigned(fn.col("b"), 2) + } + + functionTest("signum") { + fn.signum("b") + } + + functionTest("sin") { + fn.sin("b") + } + + functionTest("sinh") { + fn.sinh("b") + } + + functionTest("tan") { + fn.tan("b") + } + + functionTest("tanh") { + fn.tanh("b") + } + + functionTest("degrees") { + fn.degrees("b") + } + + functionTest("radians") { + fn.radians("b") + } + + functionTest("md5") { + fn.md5(fn.col("g").cast("binary")) + } + + functionTest("sha1") { + fn.sha1(fn.col("g").cast("binary")) + } + + functionTest("sha2") { + fn.sha2(fn.col("g").cast("binary"), 512) + } + + functionTest("crc32") { + fn.crc32(fn.col("g").cast("binary")) + } + + functionTest("hash") { + fn.hash(fn.col("b"), fn.col("id")) + } + + functionTest("xxhash64") { + fn.xxhash64(fn.col("id"), fn.col("a"), fn.col("d"), fn.col("g")) + } + + functionTest("assert_true with message") { + fn.assert_true(fn.col("id") > 0, lit("id negative!")) + } + + functionTest("raise_error") { + fn.raise_error(fn.lit("kaboom")) + } + + functionTest("ascii") { + fn.ascii(fn.col("g")) + } + + functionTest("base64") { + fn.base64(fn.col("g").cast("binary")) + } + + functionTest("bit_length") { + fn.bit_length(fn.col("g")) + } + + functionTest("concat_ws") { + fn.concat_ws("-", fn.col("b"), lit("world"), fn.col("id")) + } + + functionTest("decode") { + fn.decode(fn.col("g").cast("binary"), "UTF-8") + } + + functionTest("encode") { + fn.encode(fn.col("g"), "UTF-8") + } + + functionTest("format_number") { + fn.format_number(fn.col("b"), 1) + } + + functionTest("initcap") { + fn.initcap(fn.col("g")) + } + + functionTest("length") { + fn.length(fn.col("g")) + } + + functionTest("lower") { + fn.lower(fn.col("g")) + } + + functionTest("levenshtein") { + fn.levenshtein(fn.col("g"), lit("bob")) + } + + functionTest("locate") { + fn.locate("jar", fn.col("g")) + } + + 
functionTest("locate with pos") { + fn.locate("jar", fn.col("g"), 10) + } + + functionTest("lpad") { + fn.lpad(fn.col("g"), 10, "-") + } + + test("function lpad binary") { + binary.select(fn.lpad(fn.col("bytes"), 5, Array(0xc, 0xa, 0xf, 0xe).map(_.toByte))) + } + + functionTest("ltrim") { + fn.ltrim(fn.col("g")) + } + + functionTest("ltrim with pattern") { + fn.ltrim(fn.col("g"), "xxx") + } + + functionTest("octet_length") { + fn.octet_length(fn.col("g")) + } + + functionTest("regexp_extract") { + fn.regexp_extract(fn.col("g"), "(\\d+)-(\\d+)", 1) + } + + functionTest("regexp_replace") { + fn.regexp_replace(fn.col("g"), "(\\d+)", "XXX") + } + + functionTest("unbase64") { + fn.unbase64(fn.col("g")) + } + + functionTest("rpad") { + fn.rpad(fn.col("g"), 10, "-") + } + + test("function rpad binary") { + binary.select(fn.rpad(fn.col("bytes"), 5, Array(0xb, 0xa, 0xb, 0xe).map(_.toByte))) + } + + functionTest("rtrim") { + fn.rtrim(fn.col("g")) + } + + functionTest("rtrim with pattern") { + fn.rtrim(fn.col("g"), "yyy") + } + + functionTest("split") { + fn.split(fn.col("g"), ";") + } + + functionTest("split with limit") { + fn.split(fn.col("g"), ";", 10) + } + + functionTest("substring") { + fn.substring(fn.col("g"), 4, 5) + } + + functionTest("substring_index") { + fn.substring_index(fn.col("g"), ";", 5) + } + + functionTest("overlay") { + fn.overlay(fn.col("b"), lit("foo"), lit(4)) + } + + functionTest("overlay with len") { + fn.overlay(fn.col("b"), lit("foo"), lit(4), lit("3")) + } + + functionTest("sentences") { + fn.sentences(fn.col("g")) + } + + functionTest("sentences with locale") { + fn.sentences(fn.col("g"), lit("en"), lit("US")) + } + + functionTest("translate") { + fn.translate(fn.col("g"), "foo", "bar") + } + + functionTest("trim") { + fn.trim(fn.col("g")) + } + + functionTest("trim with pattern") { + fn.trim(fn.col("g"), "---") + } + + functionTest("upper") { + fn.upper(fn.col("g")) + } + + functionTest("years") { + fn.years(Column("a")) + } + + functionTest("months") { + fn.months(Column("a")) + } + + functionTest("days") { + fn.days(Column("a")) + } + + functionTest("hours") { + fn.hours(Column("a")) + } + + functionTest("bucket") { + fn.bucket(3, Column("a")) + } + + functionTest("cume_dist") { + fn.cume_dist().over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("dense_rank") { + fn.dense_rank().over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("lag") { + fn.lag(Column("g"), 1, null, ignoreNulls = true) + .over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("lead") { + fn.lead(Column("g"), 2, "dv", ignoreNulls = true) + .over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("nth_value") { + fn.nth_value(Column("g"), 3, ignoreNulls = true) + .over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("ntile") { + fn.ntile(4).over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("percent_rank") { + fn.percent_rank().over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("rank") { + fn.rank().over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + functionTest("row_number") { + fn.row_number().over(Window.partitionBy(Column("a")).orderBy(Column("id"))) + } + + private def temporalFunctionTest(name: String)(f: => Column): Unit = { + test("function " + name) { + temporals.select(f) + } + } + + temporalFunctionTest("add_months") { + fn.add_months(fn.col("d"), 2) + } + + 
temporalFunctionTest("current_date") { + fn.current_date() + } + + temporalFunctionTest("current_timestamp") { + fn.current_timestamp() + } + + temporalFunctionTest("localtimestamp") { + fn.localtimestamp() + } + + temporalFunctionTest("date_format") { + fn.date_format(fn.col("d"), "yyyy-MM-dd") + } + + temporalFunctionTest("date_add") { + fn.date_add(fn.col("d"), 2) + } + + temporalFunctionTest("date_sub") { + fn.date_sub(fn.col("d"), 2) + } + + temporalFunctionTest("datediff") { + fn.datediff(fn.col("d"), fn.make_date(lit(2020), lit(10), lit(10))) + } + + temporalFunctionTest("year") { + fn.year(fn.col("d")) + } + + temporalFunctionTest("quarter") { + fn.quarter(fn.col("d")) + } + + temporalFunctionTest("month") { + fn.month(fn.col("d")) + } + + temporalFunctionTest("dayofweek") { + fn.dayofweek(fn.col("d")) + } + + temporalFunctionTest("dayofmonth") { + fn.dayofmonth(fn.col("d")) + } + + temporalFunctionTest("dayofyear") { + fn.dayofyear(fn.col("d")) + } + + temporalFunctionTest("hour") { + fn.hour(fn.col("t")) + } + + temporalFunctionTest("last_day") { + fn.last_day(fn.col("t")) + } + + temporalFunctionTest("minute") { + fn.minute(fn.col("t")) + } + + temporalFunctionTest("make_date") { + fn.make_date(fn.lit(2018), fn.lit(5), fn.lit(14)) + } + + temporalFunctionTest("months_between") { + fn.months_between(fn.current_date(), fn.col("d")) + } + + temporalFunctionTest("months_between with roundoff") { + fn.months_between(fn.current_date(), fn.col("d"), roundOff = true) + } + + temporalFunctionTest("next_day") { + fn.next_day(fn.col("d"), "Mon") + } + + temporalFunctionTest("second") { + fn.second(fn.col("t")) + } + + temporalFunctionTest("weekofyear") { + fn.weekofyear(fn.col("d")) + } + + temporalFunctionTest("from_unixtime") { + fn.from_unixtime(lit(1L)) + } + + temporalFunctionTest("unix_timestamp") { + fn.unix_timestamp() + } + + temporalFunctionTest("unix_timestamp with format") { + fn.unix_timestamp(fn.col("s"), "yyyy-MM-dd HH:mm:ss.SSSS") + } + + temporalFunctionTest("to_timestamp") { + fn.to_timestamp(fn.col("s")) + } + + temporalFunctionTest("to_timestamp with format") { + fn.to_timestamp(fn.col("s"), "yyyy-MM-dd HH:mm:ss.SSSS") + } + + temporalFunctionTest("to_date") { + fn.to_date(fn.col("s")) + } + + temporalFunctionTest("to_date with format") { + fn.to_date(fn.col("s"), "yyyy-MM-dd") + } + + temporalFunctionTest("trunc") { + fn.trunc(fn.col("d"), "mm") + } + + temporalFunctionTest("date_trunc") { + fn.trunc(fn.col("t"), "minute") + } + + temporalFunctionTest("from_utc_timestamp") { + fn.from_utc_timestamp(fn.col("t"), "-08:00") + } + + temporalFunctionTest("to_utc_timestamp") { + fn.to_utc_timestamp(fn.col("t"), "-04:00") + } + + temporalFunctionTest("window") { + fn.window(fn.col("t"), "1 second") + } + + test("function window_time") { + val metadata = new MetadataBuilder().putBoolean("spark.timeWindow", value = true).build() + temporals + .withMetadata("wt", metadata) + .select(fn.window_time(fn.col("wt"))) + } + + temporalFunctionTest("session_window") { + fn.session_window(fn.col("t"), "10 minutes") + } + + temporalFunctionTest("timestamp_seconds") { + fn.timestamp_seconds(fn.col("x")) + } + + // Array of Long + // Array of Long + // Array of Array of Long + // Map string, Long + // Map string, Long + + functionTest("array_contains") { + fn.array_contains(fn.col("e"), lit(1)) + } + + functionTest("array_append") { + fn.array_append(fn.col("e"), lit(1)) + } + + functionTest("arrays_overlap") { + fn.arrays_overlap(fn.col("e"), fn.array(lit(1), lit(2))) + } + + 
functionTest("slice") { + fn.slice(fn.col("e"), 0, 5) + } + + functionTest("array_join") { + fn.array_join(fn.col("e"), ";") + } + + functionTest("array_join with null replacement") { + fn.array_join(fn.col("e"), ";", "null") + } + + functionTest("concat") { + fn.concat(fn.col("e"), fn.array(lit(1), lit(2)), fn.sequence(lit(33), lit(40))) + } + + functionTest("array_position") { + fn.array_position(fn.col("e"), 10) + } + + functionTest("element_at") { + fn.element_at(fn.col("f"), "bob") + } + + functionTest("get") { + fn.get(fn.col("e"), lit(2)) + } + + functionTest("array_sort") { + fn.array_sort(fn.col("e")) + } + + functionTest("array_sort with comparator") { + fn.array_sort(fn.col("e"), (l, r) => l - r) + } + + functionTest("array_remove") { + fn.array_remove(fn.col("e"), 314) + } + + functionTest("array_compact") { + fn.array_compact(fn.col("e")) + } + + functionTest("array_distinct") { + fn.array_distinct(fn.col("e")) + } + + functionTest("array_intersect") { + fn.array_intersect(fn.col("e"), fn.array(lit(10), lit(4))) + } + + functionTest("array_insert") { + fn.array_insert(fn.col("e"), lit(0), lit(1)) + } + + functionTest("array_union") { + fn.array_union(fn.col("e"), fn.array(lit(1), lit(2), lit(3))) + } + + functionTest("array_except") { + fn.array_except(fn.col("e"), fn.array(lit(1), lit(2), lit(4))) + } + + functionTest("transform") { + fn.transform(fn.col("e"), x => x + 1) + } + + functionTest("transform with index") { + fn.transform(fn.col("e"), (x, i) => x + i) + } + + functionTest("exists") { + fn.exists(fn.col("e"), x => x > 10) + } + + functionTest("forall") { + fn.forall(fn.col("e"), x => x > 10) + } + + functionTest("filter") { + fn.filter(fn.col("e"), x => x > 10) + } + + functionTest("filter with pair input") { + fn.filter(fn.col("e"), (x, i) => x > 10 && i > 2) + } + + functionTest("aggregate") { + fn.aggregate(fn.col("e"), lit(0), (x, y) => x + y) + } + + functionTest("zip_with") { + fn.zip_with(fn.col("e"), fn.col("e"), (x, y) => x + y) + } + + functionTest("transform_keys") { + fn.transform_keys(fn.col("f"), (k, v) => fn.concat(k, v.getItem("id"))) + } + + functionTest("transform_values") { + fn.transform_values(fn.col("f"), (k, v) => v.withField("key", k)) + } + + functionTest("map_filter") { + fn.map_filter(fn.col("f"), (k, _) => k.contains(lit("baz"))) + } + + functionTest("map_zip_with") { + fn.map_zip_with(fn.col("f"), fn.col("f"), (_, v1, v2) => v1.getItem("id") + v2.getItem("id")) + } + + functionTest("explode") { + fn.explode(fn.col("e")) + } + + functionTest("explode_outer") { + fn.explode_outer(fn.col("e")) + } + + functionTest("posexplode") { + fn.posexplode(fn.col("e")) + } + + functionTest("posexplode_outer") { + fn.posexplode_outer(fn.col("e")) + } + + functionTest("inline") { + fn.inline(fn.map_values(fn.col("f"))) + } + + functionTest("inline_outer") { + fn.inline_outer(fn.map_values(fn.col("f"))) + } + + functionTest("get_json_object") { + fn.get_json_object(fn.col("g"), "$.device_type") + } + + functionTest("json_tuple") { + fn.json_tuple(fn.col("g"), "a", "b", "id") + } + + functionTest("from_json") { + fn.from_json(fn.col("g"), simpleSchema) + } + + functionTest("schema_of_json") { + fn.schema_of_json(lit("""[{"col":01}]""")) + } + + functionTest("schema_of_json with options") { + fn.schema_of_json( + lit("""[{"col":01}]"""), + Collections.singletonMap("allowNumericLeadingZeros", "true")) + } + + functionTest("to_json") { + fn.to_json(fn.col("d"), Map(("timestampFormat", "dd/MM/yyyy"))) + } + + functionTest("size") { + fn.size(fn.col("f")) + } 
+ + functionTest("sort_array") { + fn.sort_array(fn.col("e")) + } + + functionTest("array_min") { + fn.array_min(fn.col("e")) + } + + functionTest("array_max") { + fn.array_max(fn.col("e")) + } + + functionTest("reverse") { + fn.reverse(fn.col("e")) + } + + functionTest("flatten") { + fn.flatten(fn.array(fn.col("e"), fn.sequence(fn.lit(1), fn.lit(10)))) + } + + functionTest("sequence") { + fn.sequence(fn.lit(1), fn.lit(10)) + } + + functionTest("array_repeat") { + fn.array_repeat(fn.col("a"), 10) + } + + functionTest("map_contains_key") { + fn.map_contains_key(fn.col("f"), "xyz") + } + + functionTest("map_keys") { + fn.map_keys(fn.col("f")) + } + + functionTest("map_values") { + fn.map_values(fn.col("f")) + } + + functionTest("map_entries") { + fn.map_entries(fn.col("f")) + } + + functionTest("map_from_entries") { + fn.map_from_entries(fn.transform(fn.col("e"), (x, i) => fn.struct(i, x))) + } + + functionTest("arrays_zip") { + fn.arrays_zip(fn.col("e"), fn.sequence(lit(1), lit(20))) + } + + functionTest("map_concat") { + fn.map_concat( + fn.col("f"), + fn.map(lit("foo"), fn.struct(lit(12L).as("id"), lit(68).as("a"), lit(Math.E).as("b")))) + } + + functionTest("from_csv") { + fn.from_csv(fn.col("g"), simpleSchema, Map(("mode", "FAILFAST"))) + } + + functionTest("schema_of_csv") { + fn.schema_of_csv(lit("1|abc"), Collections.singletonMap("sep", "|")) + } + + functionTest("to_csv") { + fn.to_csv(fn.col("d"), Collections.singletonMap("sep", "|")) + } + + test("groupby agg") { + simple + .groupBy(Column("id")) + .agg( + "a" -> "max", + "b" -> "stddev", + "b" -> "std", + "b" -> "mean", + "b" -> "average", + "b" -> "avg", + "*" -> "size", + "a" -> "count") + } + + test("groupby agg string") { + simple + .groupBy("id", "b") + .agg("a" -> "max", "a" -> "count") + } + + test("groupby agg columns") { + simple + .groupBy(Column("id")) + .agg(functions.max("a"), functions.sum("b")) + } + + test("groupby max") { + simple + .groupBy(Column("id")) + .max("a", "b") + } + + test("groupby min") { + simple + .groupBy(Column("id")) + .min("a", "b") + } + + test("groupby mean") { + simple + .groupBy(Column("id")) + .mean("a", "b") + } + + test("groupby avg") { + simple + .groupBy(Column("id")) + .avg("a", "b") + } + + test("groupby sum") { + simple + .groupBy(Column("id")) + .sum("a", "b") + } + + test("groupby count") { + simple + .groupBy(Column("id")) + .count() + } + + test("rollup column") { + simple.rollup(Column("a"), Column("b")).count() + } + + test("cube column") { + simple.cube(Column("a"), Column("b")).count() + } + + test("rollup string") { + simple.rollup("a", "b").count() + } + + test("cube string") { + simple.cube("a", "b").count() + } + + test("grouping and grouping_id") { + simple + .cube("a", "b") + .agg(fn.grouping("a"), fn.grouping("b"), fn.grouping_id("a", "b")) + } + + test("pivot") { + simple.groupBy(Column("id")).pivot("a", Seq(1, 2, 3)).agg(functions.count(Column("b"))) + } + + test("pivot without column values") { + simple.groupBy(Column("id")).pivot("a").agg(functions.count(Column("b"))) + } + + test("test broadcast") { + left.join(fn.broadcast(right), "id") + } + + test("function lit") { + simple.select( + fn.lit(fn.col("id")), + fn.lit('id), + fn.lit(true), + fn.lit(68.toByte), + fn.lit(9872.toShort), + fn.lit(-8726532), + fn.lit(7834609328726532L), + fn.lit(Math.E), + fn.lit(-0.8f), + fn.lit(BigDecimal(8997620, 5)), + fn.lit(BigDecimal(898897667231L, 7).bigDecimal), + fn.lit("connect!"), + fn.lit('T'), + fn.lit(Array.tabulate(10)(i => ('A' + i).toChar)), + 
fn.lit(Array.tabulate(23)(i => (i + 120).toByte)), + fn.lit(mutable.WrappedArray.make(Array[Byte](8.toByte, 6.toByte))), + fn.lit(null), + fn.lit(java.time.LocalDate.of(2020, 10, 10)), + fn.lit(Decimal.apply(BigDecimal(8997620, 6))), + fn.lit(java.time.Instant.ofEpochMilli(1677155519808L)), + fn.lit(new java.sql.Timestamp(12345L)), + fn.lit(java.time.LocalDateTime.of(2023, 2, 23, 20, 36)), + fn.lit(java.sql.Date.valueOf("2023-02-23")), + fn.lit(java.time.Duration.ofSeconds(200L)), + fn.lit(java.time.Period.ofDays(100)), + fn.lit(new CalendarInterval(2, 20, 100L))) + } + + test("function lit array") { + simple.select( + fn.lit(Array.emptyDoubleArray), + fn.lit(Array(Array(1), Array(2), Array(3))), + fn.lit(Array(Array(Array(1)), Array(Array(2)), Array(Array(3)))), + fn.lit(Array(true, false)), + fn.lit(Array(67.toByte, 68.toByte, 69.toByte)), + fn.lit(Array(9872.toShort, 9873.toShort, 9874.toShort)), + fn.lit(Array(-8726532, 8726532, -8726533)), + fn.lit(Array(7834609328726531L, 7834609328726532L, 7834609328726533L)), + fn.lit(Array(Math.E, 1.toDouble, 2.toDouble)), + fn.lit(Array(-0.8f, -0.7f, -0.9f)), + fn.lit(Array(BigDecimal(8997620, 5), BigDecimal(8997621, 5))), + fn.lit( + Array(BigDecimal(898897667231L, 7).bigDecimal, BigDecimal(898897667231L, 7).bigDecimal)), + fn.lit(Array("connect!", "disconnect!")), + fn.lit(Array('T', 'F')), + fn.lit( + Array( + Array.tabulate(10)(i => ('A' + i).toChar), + Array.tabulate(10)(i => ('B' + i).toChar))), + fn.lit(Array(java.time.LocalDate.of(2020, 10, 10), java.time.LocalDate.of(2020, 10, 11))), + fn.lit( + Array( + java.time.Instant.ofEpochMilli(1677155519808L), + java.time.Instant.ofEpochMilli(1677155519809L))), + fn.lit(Array(new java.sql.Timestamp(12345L), new java.sql.Timestamp(23456L))), + fn.lit( + Array( + java.time.LocalDateTime.of(2023, 2, 23, 20, 36), + java.time.LocalDateTime.of(2023, 2, 23, 21, 36))), + fn.lit(Array(java.sql.Date.valueOf("2023-02-23"), java.sql.Date.valueOf("2023-03-01"))), + fn.lit(Array(java.time.Duration.ofSeconds(100L), java.time.Duration.ofSeconds(200L))), + fn.lit(Array(java.time.Period.ofDays(100), java.time.Period.ofDays(200))), + fn.lit(Array(new CalendarInterval(2, 20, 100L), new CalendarInterval(2, 21, 200L)))) + } + + /* Window API */ + test("window") { + simple.select( + fn.min("id").over(Window.partitionBy(Column("a"), Column("b"))), + fn.min("id").over(Window.partitionBy("a", "b")), + fn.min("id").over(Window.orderBy(Column("a"), Column("b"))), + fn.min("id").over(Window.orderBy("a", "b")), + fn.min("id").over(Window.orderBy("a").rowsBetween(2L, 3L)), + fn.min("id").over(Window.orderBy("a").rangeBetween(2L, 3L)), + fn.count(Column("id")).over()) + } + + /* Extensions */ + test("relation extension") { + val input = proto.ExamplePluginRelation + .newBuilder() + .setInput(simple.plan.getRoot) + .build() + session.newDataFrame(com.google.protobuf.Any.pack(input)) + } + + test("expression extension") { + val extension = proto.ExamplePluginExpression + .newBuilder() + .setChild( + proto.Expression + .newBuilder() + .setUnresolvedAttribute(proto.Expression.UnresolvedAttribute + .newBuilder() + .setUnparsedIdentifier("id"))) + .setCustomField("abc") + .build() + simple.select(Column(com.google.protobuf.Any.pack(extension))) + } + + test("crosstab") { + simple.stat.crosstab("a", "b") + } + + test("freqItems") { + simple.stat.freqItems(Array("id", "a"), 0.1) + } + + test("sampleBy") { + simple.stat.sampleBy("id", Map(0 -> 0.1, 1 -> 0.2), 0L) + } + + test("drop") { + simple.na.drop(5, Seq("id", "a")) + } + + 
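The window test above passes literal frame offsets to `rowsBetween` and `rangeBetween`. Assuming the client mirrors the frame-boundary constants of the sql module's `Window` object, the same API also expresses the common running-aggregate frames; an illustrative sketch, not part of the suite:

  // Running minimum of `id` per partition of `a`, ordered by `id`.
  fn.min("id").over(
    Window
      .partitionBy("a")
      .orderBy("id")
      .rowsBetween(Window.unboundedPreceding, Window.currentRow))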
test("fill") { + simple.na.fill(8L, Seq("id")) + } + + test("replace") { + simple.na.replace[Long]("id", Map(1L -> 8L)) + } + + /* Reader API */ + test("table API with options") { + session.read.options(Map("p1" -> "v1", "p2" -> "v2")).table("tempdb.myTable") + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLHelper.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLHelper.scala new file mode 100644 index 0000000000000..002785a57c006 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLHelper.scala @@ -0,0 +1,52 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +trait SQLHelper { + + def spark: SparkSession + + /** + * Sets all SQL configurations specified in `pairs`, calls `f`, and then restores all SQL + * configurations. + */ + protected def withSQLConf(pairs: (String, String)*)(f: => Unit): Unit = { + val (keys, values) = pairs.unzip + val currentValues = keys.map { key => + if (spark.conf.getOption(key).isDefined) { + Some(spark.conf.get(key)) + } else { + None + } + } + (keys, values).zipped.foreach { (k, v) => + if (spark.conf.isModifiable(k)) { + spark.conf.set(k, v) + } else { + throw new AnalysisException(s"Cannot modify the value of a static config: $k") + } + + } + try f + finally { + keys.zip(currentValues).foreach { + case (key, Some(value)) => spark.conf.set(key, value) + case (key, None) => spark.conf.unset(key) + } + } + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala new file mode 100644 index 0000000000000..470736fbebe2e --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/SQLImplicitsTestSuite.scala @@ -0,0 +1,156 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql + +import java.sql.{Date, Timestamp} +import java.time.{Duration, Instant, LocalDate, LocalDateTime, Period} +import java.time.temporal.ChronoUnit +import java.util.concurrent.atomic.AtomicLong + +import io.grpc.inprocess.InProcessChannelBuilder +import org.apache.commons.lang3.{JavaVersion, SystemUtils} +import org.scalatest.BeforeAndAfterAll + +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.encoders.{AgnosticEncoder, ExpressionEncoder} +import org.apache.spark.sql.connect.client.SparkConnectClient +import org.apache.spark.sql.connect.client.util.ConnectFunSuite + +/** + * Test suite for SQL implicits. + */ +class SQLImplicitsTestSuite extends ConnectFunSuite with BeforeAndAfterAll { + private var session: SparkSession = _ + + override protected def beforeAll(): Unit = { + super.beforeAll() + val client = SparkConnectClient( + proto.UserContext.newBuilder().build(), + InProcessChannelBuilder.forName("/dev/null")) + session = + new SparkSession(client, cleaner = SparkSession.cleaner, planIdGenerator = new AtomicLong) + } + + test("column resolution") { + val spark = session + import spark.implicits._ + def assertEqual(left: Column, right: Column): Unit = assert(left == right) + assertEqual($"x", Column("x")) + assertEqual('y, Column("y")) + } + + test("test implicit encoder resolution") { + val spark = session + import spark.implicits._ + def testImplicit[T: Encoder](expected: T): Unit = { + val encoder = implicitly[Encoder[T]].asInstanceOf[AgnosticEncoder[T]] + val expressionEncoder = ExpressionEncoder(encoder).resolveAndBind() + val serializer = expressionEncoder.createSerializer() + val deserializer = expressionEncoder.createDeserializer() + val actual = deserializer(serializer(expected)) + assert(actual === expected) + } + + val booleans = Array(false, true, false, false) + testImplicit(booleans.head) + testImplicit(java.lang.Boolean.valueOf(booleans.head)) + testImplicit(booleans) + testImplicit(booleans.toSeq) + testImplicit(booleans.toSeq)(newBooleanSeqEncoder) + + val bytes = Array(76.toByte, 59.toByte, 121.toByte) + testImplicit(bytes.head) + testImplicit(java.lang.Byte.valueOf(bytes.head)) + testImplicit(bytes) + testImplicit(bytes.toSeq) + testImplicit(bytes.toSeq)(newByteSeqEncoder) + + val shorts = Array(21.toShort, (-213).toShort, 14876.toShort) + testImplicit(shorts.head) + testImplicit(java.lang.Short.valueOf(shorts.head)) + testImplicit(shorts) + testImplicit(shorts.toSeq) + testImplicit(shorts.toSeq)(newShortSeqEncoder) + + val ints = Array(4, 6, 5) + testImplicit(ints.head) + testImplicit(java.lang.Integer.valueOf(ints.head)) + testImplicit(ints) + testImplicit(ints.toSeq) + testImplicit(ints.toSeq)(newIntSeqEncoder) + + val longs = Array(System.nanoTime(), System.currentTimeMillis()) + testImplicit(longs.head) + testImplicit(java.lang.Long.valueOf(longs.head)) + testImplicit(longs) + testImplicit(longs.toSeq) + testImplicit(longs.toSeq)(newLongSeqEncoder) + + val floats = Array(3f, 10.9f) + testImplicit(floats.head) + testImplicit(java.lang.Float.valueOf(floats.head)) + testImplicit(floats) + testImplicit(floats.toSeq) + testImplicit(floats.toSeq)(newFloatSeqEncoder) + + val doubles = Array(23.78d, -329.6d) + testImplicit(doubles.head) + testImplicit(java.lang.Double.valueOf(doubles.head)) + testImplicit(doubles) + testImplicit(doubles.toSeq) + testImplicit(doubles.toSeq)(newDoubleSeqEncoder) + + val strings = Array("foo", "baz", "bar") + testImplicit(strings.head) + testImplicit(strings) + 
testImplicit(strings.toSeq) + testImplicit(strings.toSeq)(newStringSeqEncoder) + + val myTypes = Array(MyType(12L, Math.E, Math.PI), MyType(0, 0, 0)) + testImplicit(myTypes.head) + testImplicit(myTypes) + testImplicit(myTypes.toSeq) + testImplicit(myTypes.toSeq)(newProductSeqEncoder[MyType]) + + // Others. + val decimal = java.math.BigDecimal.valueOf(3141527000000000000L, 18) + testImplicit(decimal) + testImplicit(BigDecimal(decimal)) + testImplicit(Date.valueOf(LocalDate.now())) + testImplicit(LocalDate.now()) + // SPARK-42770: Run `LocalDateTime.now()` and `Instant.now()` with Java 8 & 11 always + // get microseconds on both Linux and MacOS, but there are some differences when + // using Java 17, it will get accurate nanoseconds on Linux, but still get the microseconds + // on MacOS. At present, Spark always converts them to microseconds, this will cause the + // test fail when using Java 17 on Linux, so add `truncatedTo(ChronoUnit.MICROS)` when + // testing on Linux using Java 17 to ensure the accuracy of input data is microseconds. + if (SystemUtils.isJavaVersionAtLeast(JavaVersion.JAVA_17) && SystemUtils.IS_OS_LINUX) { + testImplicit(LocalDateTime.now().truncatedTo(ChronoUnit.MICROS)) + testImplicit(Instant.now().truncatedTo(ChronoUnit.MICROS)) + testImplicit(Timestamp.from(Instant.now().truncatedTo(ChronoUnit.MICROS))) + } else { + testImplicit(LocalDateTime.now()) + testImplicit(Instant.now()) + testImplicit(Timestamp.from(Instant.now())) + } + testImplicit(Period.ofYears(2)) + testImplicit(Duration.ofMinutes(77)) + testImplicit(SaveMode.Append) + testImplicit(Map(("key", "value"), ("foo", "baz"))) + testImplicit(Set(1, 2, 4)) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionSuite.scala new file mode 100644 index 0000000000000..1c4ee21773749 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/UserDefinedFunctionSuite.scala @@ -0,0 +1,51 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql + +import scala.reflect.runtime.universe.typeTag + +import org.scalatest.BeforeAndAfterEach + +import org.apache.spark.sql.catalyst.ScalaReflection +import org.apache.spark.sql.connect.client.util.ConnectFunSuite +import org.apache.spark.sql.connect.common.UdfPacket +import org.apache.spark.sql.functions.udf +import org.apache.spark.util.Utils + +class UserDefinedFunctionSuite extends ConnectFunSuite with BeforeAndAfterEach { + + test("udf and encoder serialization") { + def func(x: Int): Int = x + 1 + + val myUdf = udf(func _) + val colWithUdf = myUdf(Column("dummy")) + + val udfExpr = colWithUdf.expr.getCommonInlineUserDefinedFunction + assert(udfExpr.getDeterministic) + assert(udfExpr.getArgumentsCount == 1) + assert(udfExpr.getArguments(0) == Column("dummy").expr) + val udfObj = udfExpr.getScalarScalaUdf + + assert(udfObj.getNullable) + + val deSer = Utils.deserialize[UdfPacket](udfObj.getPayload.toByteArray) + + assert(deSer.function.asInstanceOf[Int => Int](5) == func(5)) + assert(deSer.outputEncoder == ScalaReflection.encoderFor(typeTag[Int])) + assert(deSer.inputEncoders == Seq(ScalaReflection.encoderFor(typeTag[Int]))) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/ArtifactSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/ArtifactSuite.scala new file mode 100644 index 0000000000000..f3d2e5be954db --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/ArtifactSuite.scala @@ -0,0 +1,250 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.connect.client + +import java.io.InputStream +import java.nio.file.{Files, Path, Paths} +import java.util.concurrent.TimeUnit + +import collection.JavaConverters._ +import com.google.protobuf.ByteString +import io.grpc.{ManagedChannel, Server} +import io.grpc.inprocess.{InProcessChannelBuilder, InProcessServerBuilder} +import org.scalatest.BeforeAndAfterEach + +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.AddArtifactsRequest +import org.apache.spark.sql.connect.client.util.ConnectFunSuite + +class ArtifactSuite extends ConnectFunSuite with BeforeAndAfterEach { + + private var client: SparkConnectClient = _ + private var service: DummySparkConnectService = _ + private var server: Server = _ + private var artifactManager: ArtifactManager = _ + private var channel: ManagedChannel = _ + + private def startDummyServer(): Unit = { + service = new DummySparkConnectService() + server = InProcessServerBuilder + .forName(getClass.getName) + .addService(service) + .build() + server.start() + } + + private def createArtifactManager(): Unit = { + channel = InProcessChannelBuilder.forName(getClass.getName).directExecutor().build() + artifactManager = new ArtifactManager(proto.UserContext.newBuilder().build(), channel) + } + + override def beforeEach(): Unit = { + super.beforeEach() + startDummyServer() + createArtifactManager() + client = null + } + + override def afterEach(): Unit = { + if (server != null) { + server.shutdownNow() + assert(server.awaitTermination(5, TimeUnit.SECONDS), "server failed to shutdown") + } + + if (channel != null) { + channel.shutdownNow() + } + + if (client != null) { + client.shutdown() + } + } + + private val CHUNK_SIZE: Int = 32 * 1024 + protected def artifactFilePath: Path = baseResourcePath.resolve("artifact-tests") + protected def artifactCrcPath: Path = artifactFilePath.resolve("crc") + + private def getCrcValues(filePath: Path): Seq[Long] = { + val fileName = filePath.getFileName.toString + val crcFileName = fileName.split('.').head + ".txt" + Files + .readAllLines(artifactCrcPath.resolve(crcFileName)) + .asScala + .map(_.toLong) + .toSeq + } + + /** + * Check if the data sent to the server (stored in `artifactChunk`) is equivalent to the local + * data at `localPath`. 
+ * @param artifactChunk + * @param localPath + */ + private def assertFileDataEquality( + artifactChunk: AddArtifactsRequest.ArtifactChunk, + localPath: Path): Unit = { + val localData = ByteString.readFrom(Files.newInputStream(localPath)) + val expectedCrc = getCrcValues(localPath).head + assert(artifactChunk.getData == localData) + assert(artifactChunk.getCrc == expectedCrc) + } + + private def singleChunkArtifactTest(path: String): Unit = { + test(s"Single Chunk Artifact - $path") { + val artifactPath = artifactFilePath.resolve(path) + artifactManager.addArtifact(artifactPath.toString) + + val receivedRequests = service.getAndClearLatestAddArtifactRequests() + // Single `AddArtifactRequest` + assert(receivedRequests.size == 1) + + val request = receivedRequests.head + assert(request.hasBatch) + + val batch = request.getBatch + // Single artifact in batch + assert(batch.getArtifactsList.size() == 1) + + val singleChunkArtifact = batch.getArtifacts(0) + val namePrefix = artifactPath.getFileName.toString match { + case jar if jar.endsWith(".jar") => "jars" + case cf if cf.endsWith(".class") => "classes" + } + assert(singleChunkArtifact.getName.equals(namePrefix + "/" + path)) + assertFileDataEquality(singleChunkArtifact.getData, artifactPath) + } + } + + singleChunkArtifactTest("smallClassFile.class") + + singleChunkArtifactTest("smallJar.jar") + + private def readNextChunk(in: InputStream): ByteString = { + val buf = new Array[Byte](CHUNK_SIZE) + var bytesRead = 0 + var count = 0 + while (count != -1 && bytesRead < CHUNK_SIZE) { + count = in.read(buf, bytesRead, CHUNK_SIZE - bytesRead) + if (count != -1) { + bytesRead += count + } + } + if (bytesRead == 0) ByteString.empty() + else ByteString.copyFrom(buf, 0, bytesRead) + } + + /** + * Reads data in a chunk of `CHUNK_SIZE` bytes from `in` and verify equality with server-side + * data stored in `chunk`. + * @param in + * @param chunk + * @return + */ + private def checkChunksDataAndCrc( + filePath: Path, + chunks: Seq[AddArtifactsRequest.ArtifactChunk]): Unit = { + val in = Files.newInputStream(filePath) + val crcs = getCrcValues(filePath) + chunks.zip(crcs).foreach { case (chunk, expectedCrc) => + val expectedData = readNextChunk(in) + chunk.getData == expectedData && chunk.getCrc == expectedCrc + } + } + + test("Chunked Artifact - junitLargeJar.jar") { + val artifactPath = artifactFilePath.resolve("junitLargeJar.jar") + artifactManager.addArtifact(artifactPath.toString) + // Expected chunks = roundUp( file_size / chunk_size) = 12 + // File size of `junitLargeJar.jar` is 384581 bytes. 
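+    // With CHUNK_SIZE = 32 * 1024 = 32768 bytes, the integer ceiling division below
+    // yields (384581 + 32767) / 32768 = 12, matching the expected chunk count above.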
+ val expectedChunks = (384581 + (CHUNK_SIZE - 1)) / CHUNK_SIZE + val receivedRequests = service.getAndClearLatestAddArtifactRequests() + assert(384581 == Files.size(artifactPath)) + assert(receivedRequests.size == expectedChunks) + assert(receivedRequests.head.hasBeginChunk) + val beginChunkRequest = receivedRequests.head.getBeginChunk + assert(beginChunkRequest.getName == "jars/junitLargeJar.jar") + assert(beginChunkRequest.getTotalBytes == 384581) + assert(beginChunkRequest.getNumChunks == expectedChunks) + val dataChunks = Seq(beginChunkRequest.getInitialChunk) ++ + receivedRequests.drop(1).map(_.getChunk) + checkChunksDataAndCrc(artifactPath, dataChunks) + } + + test("Batched SingleChunkArtifacts") { + val file1 = artifactFilePath.resolve("smallClassFile.class").toUri + val file2 = artifactFilePath.resolve("smallJar.jar").toUri + artifactManager.addArtifacts(Seq(file1, file2)) + val receivedRequests = service.getAndClearLatestAddArtifactRequests() + // Single request containing 2 artifacts. + assert(receivedRequests.size == 1) + + val request = receivedRequests.head + assert(request.hasBatch) + + val batch = request.getBatch + assert(batch.getArtifactsList.size() == 2) + + val artifacts = batch.getArtifactsList + assert(artifacts.get(0).getName == "classes/smallClassFile.class") + assert(artifacts.get(1).getName == "jars/smallJar.jar") + + assertFileDataEquality(artifacts.get(0).getData, Paths.get(file1)) + assertFileDataEquality(artifacts.get(1).getData, Paths.get(file2)) + } + + test("Mix of SingleChunkArtifact and chunked artifact") { + val file1 = artifactFilePath.resolve("smallClassFile.class").toUri + val file2 = artifactFilePath.resolve("junitLargeJar.jar").toUri + val file3 = artifactFilePath.resolve("smallClassFileDup.class").toUri + val file4 = artifactFilePath.resolve("smallJar.jar").toUri + artifactManager.addArtifacts(Seq(file1, file2, file3, file4)) + val receivedRequests = service.getAndClearLatestAddArtifactRequests() + // There are a total of 14 requests. + // The 1st request contains a single artifact - smallClassFile.class (There are no + // other artifacts batched with it since the next one is large multi-chunk artifact) + // Requests 2-13 (1-indexed) belong to the transfer of junitLargeJar.jar. This includes + // the first "beginning chunk" and the subsequent data chunks. + // The last request (14) contains both smallClassFileDup.class and smallJar.jar batched + // together. + assert(receivedRequests.size == 1 + 12 + 1) + + val firstReqBatch = receivedRequests.head.getBatch.getArtifactsList + assert(firstReqBatch.size() == 1) + assert(firstReqBatch.get(0).getName == "classes/smallClassFile.class") + assertFileDataEquality(firstReqBatch.get(0).getData, Paths.get(file1)) + + val secondReq = receivedRequests(1) + assert(secondReq.hasBeginChunk) + val beginChunkRequest = secondReq.getBeginChunk + assert(beginChunkRequest.getName == "jars/junitLargeJar.jar") + assert(beginChunkRequest.getTotalBytes == 384581) + assert(beginChunkRequest.getNumChunks == 12) + // Large artifact data chunks are requests number 3 to 13. 
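+    // In 0-based terms those data-chunk requests are receivedRequests(2) through
+    // receivedRequests(12), hence the drop(2).dropRight(1) below; prepending the
+    // begin-chunk request's initial chunk gives all 12 chunks of the large jar in order.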
+ val dataChunks = Seq(beginChunkRequest.getInitialChunk) ++ + receivedRequests.drop(2).dropRight(1).map(_.getChunk) + checkChunksDataAndCrc(Paths.get(file2), dataChunks) + + val lastBatch = receivedRequests.last.getBatch + assert(lastBatch.getArtifactsCount == 2) + val remainingArtifacts = lastBatch.getArtifactsList + assert(remainingArtifacts.get(0).getName == "classes/smallClassFileDup.class") + assert(remainingArtifacts.get(1).getName == "jars/smallJar.jar") + + assertFileDataEquality(remainingArtifacts.get(0).getData, Paths.get(file3)) + assertFileDataEquality(remainingArtifacts.get(1).getData, Paths.get(file4)) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala new file mode 100644 index 0000000000000..68369512fb7e3 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/CheckConnectJvmClientCompatibility.scala @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import java.io.{File, Writer} +import java.net.URLClassLoader +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, Paths} +import java.util.regex.Pattern + +import scala.reflect.runtime.universe.runtimeMirror + +import com.typesafe.tools.mima.core.{Problem, ProblemFilter, ProblemFilters} +import com.typesafe.tools.mima.lib.MiMaLib + +import org.apache.spark.sql.connect.client.util.IntegrationTestUtils._ +import org.apache.spark.util.ChildFirstURLClassLoader + +/** + * A tool for checking the binary compatibility of the connect client API against the spark SQL + * API using MiMa. We did not write this check using a SBT build rule as the rule cannot provide + * the same level of freedom as a test. With a test we can: + * 1. Specify any two jars to run the compatibility check. + * 1. Easily make the test automatically pick up all new methods added while the client is being + * built. + * + * We can run this check by executing the `dev/connect-jvm-client-mima-check`. 
+ */ +// scalastyle:off println +object CheckConnectJvmClientCompatibility { + + private lazy val sparkHome: String = { + if (!sys.env.contains("SPARK_HOME")) { + throw new IllegalArgumentException("SPARK_HOME is not set.") + } + sys.env("SPARK_HOME") + } + + def main(args: Array[String]): Unit = { + var resultWriter: Writer = null + try { + resultWriter = Files.newBufferedWriter( + Paths.get(s"$sparkHome/.connect-mima-check-result"), + StandardCharsets.UTF_8) + val clientJar: File = + findJar( + "connector/connect/client/jvm", + "spark-connect-client-jvm-assembly", + "spark-connect-client-jvm") + val sqlJar: File = findJar("sql/core", "spark-sql", "spark-sql") + val problems = checkMiMaCompatibility(clientJar, sqlJar) + if (problems.nonEmpty) { + resultWriter.write(s"ERROR: Comparing client jar: $clientJar and and sql jar: $sqlJar \n") + resultWriter.write(s"problems: \n") + resultWriter.write(s"${problems.map(p => p.description("client")).mkString("\n")}") + resultWriter.write("\n") + resultWriter.write( + "Exceptions to binary compatibility can be added in " + + "'CheckConnectJvmClientCompatibility#checkMiMaCompatibility'\n") + } + val incompatibleApis = checkDatasetApiCompatibility(clientJar, sqlJar) + if (incompatibleApis.nonEmpty) { + resultWriter.write( + "ERROR: The Dataset apis only exist in the connect client " + + "module and not belong to the sql module include: \n") + resultWriter.write(incompatibleApis.mkString("\n")) + resultWriter.write("\n") + resultWriter.write( + "Exceptions can be added to exceptionMethods in " + + "'CheckConnectJvmClientCompatibility#checkDatasetApiCompatibility'\n") + } + } catch { + case e: Throwable => + println(e.getMessage) + resultWriter.write(s"ERROR: ${e.getMessage}") + } finally { + if (resultWriter != null) { + resultWriter.close() + } + } + } + + /** + * MiMa takes an old jar (sql jar) and a new jar (client jar) as inputs and then reports all + * incompatibilities found in the new jar. The incompatibility result is then filtered using + * include and exclude rules. Include rules are first applied to find all client classes that + * need to be checked. Then exclude rules are applied to filter out all unsupported methods in + * the client classes. + */ + private def checkMiMaCompatibility(clientJar: File, sqlJar: File): List[Problem] = { + val mima = new MiMaLib(Seq(clientJar, sqlJar)) + val allProblems = mima.collectProblems(sqlJar, clientJar, List.empty) + val includedRules = Seq( + IncludeByName("org.apache.spark.sql.Column.*"), + IncludeByName("org.apache.spark.sql.ColumnName.*"), + IncludeByName("org.apache.spark.sql.DataFrame.*"), + IncludeByName("org.apache.spark.sql.DataFrameReader.*"), + IncludeByName("org.apache.spark.sql.DataFrameNaFunctions.*"), + IncludeByName("org.apache.spark.sql.DataFrameStatFunctions.*"), + IncludeByName("org.apache.spark.sql.DataFrameWriter.*"), + IncludeByName("org.apache.spark.sql.DataFrameWriterV2.*"), + IncludeByName("org.apache.spark.sql.Dataset.*"), + IncludeByName("org.apache.spark.sql.functions.*"), + IncludeByName("org.apache.spark.sql.RelationalGroupedDataset.*"), + IncludeByName("org.apache.spark.sql.SparkSession.*"), + IncludeByName("org.apache.spark.sql.RuntimeConfig.*"), + IncludeByName("org.apache.spark.sql.TypedColumn.*"), + IncludeByName("org.apache.spark.sql.SQLImplicits.*"), + IncludeByName("org.apache.spark.sql.DatasetHolder.*")) + val excludeRules = Seq( + // Filter unsupported rules: + // Note when muting errors for a method, checks on all overloading methods are also muted. 
+ + // Skip all shaded dependencies and proto files in the client. + ProblemFilters.exclude[Problem]("org.sparkproject.*"), + ProblemFilters.exclude[Problem]("org.apache.spark.connect.proto.*"), + + // DataFrame Reader & Writer + ProblemFilters.exclude[Problem]("org.apache.spark.sql.DataFrameReader.json"), // deprecated + + // DataFrameNaFunctions + ProblemFilters.exclude[Problem]("org.apache.spark.sql.DataFrameNaFunctions.this"), + + // DataFrameStatFunctions + ProblemFilters.exclude[Problem]("org.apache.spark.sql.DataFrameStatFunctions.bloomFilter"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.DataFrameStatFunctions.this"), + + // Dataset + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.ofRows"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.DATASET_ID_TAG"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.COL_POS_KEY"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.DATASET_ID_KEY"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.curId"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.observe"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.queryExecution"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.encoder"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.sqlContext"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.joinWith"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.select"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.selectUntyped"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.reduce"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.groupByKey"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.explode"), // deprecated + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.filter"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.map"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.mapPartitions"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.flatMap"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.foreach"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.foreachPartition"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.rdd"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.toJavaRDD"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.javaRDD"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.writeStream"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.Dataset.this"), + + // functions + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.udf"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.call_udf"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.callUDF"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.unwrap_udt"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.udaf"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.typedlit"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.functions.typedLit"), + + // RelationalGroupedDataset + ProblemFilters.exclude[Problem]("org.apache.spark.sql.RelationalGroupedDataset.apply"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.RelationalGroupedDataset.as"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.RelationalGroupedDataset.this"), + + // SparkSession + 
ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.clearDefaultSession"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.setDefaultSession"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sparkContext"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sharedState"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sessionState"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.sqlContext"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.listenerManager"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.experimental"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.udf"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.streams"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.createDataFrame"), + ProblemFilters.exclude[Problem]( + "org.apache.spark.sql.SparkSession.baseRelationToDataFrame"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.createDataset"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.catalog"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.executeCommand"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.readStream"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SparkSession.this"), + + // RuntimeConfig + ProblemFilters.exclude[Problem]("org.apache.spark.sql.RuntimeConfig.this"), + + // TypedColumn + ProblemFilters.exclude[Problem]("org.apache.spark.sql.TypedColumn.this"), + + // SQLImplicits + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits.this"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits.rddToDatasetHolder"), + ProblemFilters.exclude[Problem]("org.apache.spark.sql.SQLImplicits._sqlContext")) + val problems = allProblems + .filter { p => + includedRules.exists(rule => rule(p)) + } + .filter { p => + excludeRules.forall(rule => rule(p)) + } + problems + } + + private def checkDatasetApiCompatibility(clientJar: File, sqlJar: File): Seq[String] = { + + def methods(jar: File, className: String): Seq[String] = { + val classLoader: URLClassLoader = + new ChildFirstURLClassLoader(Seq(jar.toURI.toURL).toArray, this.getClass.getClassLoader) + val mirror = runtimeMirror(classLoader) + // scalastyle:off classforname + val classSymbol = + mirror.classSymbol(Class.forName(className, false, classLoader)) + // scalastyle:on classforname + classSymbol.typeSignature.members + .filter(_.isMethod) + .map(_.asMethod) + .filter(m => m.isPublic) + .map(_.fullName) + .toSeq + } + + val className = "org.apache.spark.sql.Dataset" + val clientMethods = methods(clientJar, className) + val sqlMethods = methods(sqlJar, className) + // Exclude some public methods that must be added through `exceptionMethods` + val exceptionMethods = + Seq("org.apache.spark.sql.Dataset.collectResult", "org.apache.spark.sql.Dataset.plan") + + // Find new public functions that are not in sql module `Dataset`. 
+ clientMethods.diff(sqlMethods).diff(exceptionMethods) + } + + private case class IncludeByName(name: String) extends ProblemFilter { + private[this] val pattern = + Pattern.compile(name.split("\\*", -1).map(Pattern.quote).mkString(".*")) + + override def apply(problem: Problem): Boolean = { + pattern.matcher(problem.matchName.getOrElse("")).matches + } + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientBuilderParseTestSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientBuilderParseTestSuite.scala new file mode 100644 index 0000000000000..2c6886d0386c5 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientBuilderParseTestSuite.scala @@ -0,0 +1,131 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import org.apache.spark.sql.connect.client.util.ConnectFunSuite + +/** + * Test suite for [[SparkConnectClient.Builder]] parsing and configuration. 
+ */ +class SparkConnectClientBuilderParseTestSuite extends ConnectFunSuite { + private def build(args: String*): SparkConnectClient.Builder = { + SparkConnectClient.builder().parse(args.toArray) + } + + private def argumentTest( + name: String, + value: String, + extractor: SparkConnectClient.Builder => String): Unit = { + test("Argument - " + name) { + val builder = build("--" + name, value) + assert(value === extractor(builder)) + val e = intercept[IllegalArgumentException] { + build("--" + name) + } + assert(e.getMessage.contains("option requires a value")) + } + } + + argumentTest("host", "www.apache.org", _.host) + argumentTest("port", "1506", _.port.toString) + argumentTest("token", "azbycxdwev1234567890", _.token.get) + argumentTest("user_id", "U1238", _.userId.get) + argumentTest("user_name", "alice", _.userName.get) + argumentTest("user_agent", "MY APP", _.userAgent) + + test("Argument - remote") { + val builder = + build("--remote", "sc://srv.apache.org/;user_id=x127;user_name=Q;token=nahnah;param1=x") + assert(builder.host === "srv.apache.org") + assert(builder.port === 15002) + assert(builder.token.contains("nahnah")) + assert(builder.userId.contains("x127")) + assert(builder.options === Map(("user_name", "Q"), ("param1", "x"))) + } + + test("Argument - use_ssl") { + val builder = build("--use_ssl") + assert(builder.sslEnabled) + } + + test("Argument - option") { + val builder = + build("--option", "foo=bar", "--option", "c1=s8", "--option", "ns.sns.setting=baz") + assert(builder.options === Map(("foo", "bar"), ("c1", "s8"), ("ns.sns.setting", "baz"))) + val e1 = intercept[NoSuchElementException](build("--option")) + // assert(e1.getMessage.contains("requires a key-value pair")) + intercept[MatchError](build("--option", "not_a_config")) + val e2 = intercept[IllegalArgumentException](build("--option", "bar=baz=bak")) + assert(e2.getMessage.contains("should contain key=value")) + } + + test("Argument - unsupported") { + val e = intercept[IllegalArgumentException](build("--unknown")) + assert(e.getMessage.contains("is an unsupported argument")) + } + + test("SparkSession - create") { + { + val builder = build( + "--remote", + "sc://localhost:15033", + "--port", + "1507", + "--user_agent", + "U8912", + "--user_id", + "Q12") + assert(builder.host === "localhost") + assert(builder.port === 1507) + assert(builder.userAgent === "U8912") + assert(!builder.sslEnabled) + assert(builder.token.isEmpty) + assert(builder.userId.contains("Q12")) + assert(builder.userName.isEmpty) + assert(builder.options.isEmpty) + } + { + val builder = build( + "--use_ssl", + "--user_name", + "Nico", + "--option", + "mode=turbo", + "--option", + "cluster=mycl") + assert(builder.host === "localhost") + assert(builder.port === 15002) + assert(builder.userAgent == "_SPARK_CONNECT_SCALA") + assert(builder.sslEnabled) + assert(builder.token.isEmpty) + assert(builder.userId.isEmpty) + assert(builder.userName.contains("Nico")) + assert(builder.options === Map(("mode", "turbo"), ("cluster", "mycl"))) + } + { + val builder = build("--token", "thisismysecret") + assert(builder.host === "localhost") + assert(builder.port === 15002) + assert(builder.userAgent === "_SPARK_CONNECT_SCALA") + assert(builder.sslEnabled) + assert(builder.token.contains("thisismysecret")) + assert(builder.userId.isEmpty) + assert(builder.userName.isEmpty) + assert(builder.options.isEmpty) + } + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala 
b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala new file mode 100755 index 0000000000000..bc600e5a07168 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/SparkConnectClientSuite.scala @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client + +import java.util.concurrent.TimeUnit + +import io.grpc.{Server, StatusRuntimeException} +import io.grpc.netty.NettyServerBuilder +import io.grpc.stub.StreamObserver +import org.scalatest.BeforeAndAfterEach +import scala.collection.mutable + +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.{AddArtifactsRequest, AddArtifactsResponse, AnalyzePlanRequest, AnalyzePlanResponse, ExecutePlanRequest, ExecutePlanResponse, SparkConnectServiceGrpc} +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connect.client.util.ConnectFunSuite +import org.apache.spark.sql.connect.common.config.ConnectCommon + +class SparkConnectClientSuite extends ConnectFunSuite with BeforeAndAfterEach { + + private var client: SparkConnectClient = _ + private var service: DummySparkConnectService = _ + private var server: Server = _ + + private def startDummyServer(port: Int): Unit = { + service = new DummySparkConnectService + server = NettyServerBuilder + .forPort(port) + .addService(service) + .build() + server.start() + } + + override def beforeEach(): Unit = { + super.beforeEach() + client = null + server = null + service = null + } + + override def afterEach(): Unit = { + if (server != null) { + server.shutdownNow() + assert(server.awaitTermination(5, TimeUnit.SECONDS), "server failed to shutdown") + } + + if (client != null) { + client.shutdown() + } + } + + test("Placeholder test: Create SparkConnectClient") { + client = SparkConnectClient.builder().userId("abc123").build() + assert(client.userId == "abc123") + } + + // Use 0 to start the server at a random port + private def testClientConnection(serverPort: Int = 0)( + clientBuilder: Int => SparkConnectClient): Unit = { + startDummyServer(serverPort) + client = clientBuilder(server.getPort) + val request = AnalyzePlanRequest + .newBuilder() + .setSessionId("abc123") + .build() + + val response = client.analyze(request) + assert(response.getSessionId === "abc123") + } + + test("Test connection") { + testClientConnection() { testPort => SparkConnectClient.builder().port(testPort).build() } + } + + test("Test connection string") { + testClientConnection() { testPort => + SparkConnectClient.builder().connectionString(s"sc://localhost:$testPort").build() + } + } + + test("Test encryption") { + startDummyServer(0) + client = SparkConnectClient + .builder() + 
.connectionString(s"sc://localhost:${server.getPort}/;use_ssl=true") + .build() + + val request = AnalyzePlanRequest.newBuilder().setSessionId("abc123").build() + + // Failed the ssl handshake as the dummy server does not have any server credentials installed. + assertThrows[StatusRuntimeException] { + client.analyze(request) + } + } + + test("SparkSession initialisation with connection string") { + val testPort = 16002 + client = SparkConnectClient.builder().connectionString(s"sc://localhost:$testPort").build() + startDummyServer(testPort) + val session = SparkSession.builder().client(client).build() + val df = session.range(10) + df.analyze // Trigger RPC + assert(df.plan === service.getAndClearLatestInputPlan()) + } + + private case class TestPackURI( + connectionString: String, + isCorrect: Boolean, + extraChecks: SparkConnectClient => Unit = _ => {}) + + private val URIs = Seq[TestPackURI]( + TestPackURI("sc://host", isCorrect = true), + TestPackURI( + "sc://localhost/", + isCorrect = true, + client => testClientConnection(ConnectCommon.CONNECT_GRPC_BINDING_PORT)(_ => client)), + TestPackURI( + "sc://localhost:1234/", + isCorrect = true, + client => testClientConnection(1234)(_ => client)), + TestPackURI( + "sc://localhost/;", + isCorrect = true, + client => testClientConnection(ConnectCommon.CONNECT_GRPC_BINDING_PORT)(_ => client)), + TestPackURI("sc://host:123", isCorrect = true), + TestPackURI( + "sc://host:123/;user_id=a94", + isCorrect = true, + client => assert(client.userId == "a94")), + TestPackURI( + "sc://host:123/;user_agent=a945", + isCorrect = true, + client => assert(client.userAgent == "a945")), + TestPackURI("scc://host:12", isCorrect = false), + TestPackURI("http://host", isCorrect = false), + TestPackURI("sc:/host:1234/path", isCorrect = false), + TestPackURI("sc://host/path", isCorrect = false), + TestPackURI("sc://host/;parm1;param2", isCorrect = false), + TestPackURI("sc://host:123;user_id=a94", isCorrect = false), + TestPackURI("sc:///user_id=123", isCorrect = false), + TestPackURI("sc://host:-4", isCorrect = false), + TestPackURI("sc://:123/", isCorrect = false), + TestPackURI("sc://host:123/;use_ssl=true", isCorrect = true), + TestPackURI("sc://host:123/;token=mySecretToken", isCorrect = true), + TestPackURI("sc://host:123/;token=", isCorrect = false), + TestPackURI("sc://host:123/;use_ssl=true;token=mySecretToken", isCorrect = true), + TestPackURI("sc://host:123/;token=mySecretToken;use_ssl=true", isCorrect = true), + TestPackURI("sc://host:123/;use_ssl=false;token=mySecretToken", isCorrect = false), + TestPackURI("sc://host:123/;token=mySecretToken;use_ssl=false", isCorrect = false), + TestPackURI("sc://host:123/;param1=value1;param2=value2", isCorrect = true)) + + private def checkTestPack(testPack: TestPackURI): Unit = { + val client = SparkConnectClient.builder().connectionString(testPack.connectionString).build() + testPack.extraChecks(client) + } + + URIs.foreach { testPack => + test(s"Check URI: ${testPack.connectionString}, isCorrect: ${testPack.isCorrect}") { + if (!testPack.isCorrect) { + assertThrows[IllegalArgumentException](checkTestPack(testPack)) + } else { + checkTestPack(testPack) + } + } + } +} + +class DummySparkConnectService() extends SparkConnectServiceGrpc.SparkConnectServiceImplBase { + + private var inputPlan: proto.Plan = _ + private val inputArtifactRequests: mutable.ListBuffer[AddArtifactsRequest] = + mutable.ListBuffer.empty + + private[sql] def getAndClearLatestInputPlan(): proto.Plan = { + val plan = inputPlan + inputPlan = null 
+ plan + } + + private[sql] def getAndClearLatestAddArtifactRequests(): Seq[AddArtifactsRequest] = { + val requests = inputArtifactRequests.toSeq + inputArtifactRequests.clear() + requests + } + + override def executePlan( + request: ExecutePlanRequest, + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { + // Reply with a dummy response using the same client ID + val requestSessionId = request.getSessionId + inputPlan = request.getPlan + val response = ExecutePlanResponse + .newBuilder() + .setSessionId(requestSessionId) + .build() + responseObserver.onNext(response) + responseObserver.onCompleted() + } + + override def analyzePlan( + request: AnalyzePlanRequest, + responseObserver: StreamObserver[AnalyzePlanResponse]): Unit = { + // Reply with a dummy response using the same client ID + val requestSessionId = request.getSessionId + request.getAnalyzeCase match { + case proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA => + inputPlan = request.getSchema.getPlan + case proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN => + inputPlan = request.getExplain.getPlan + case proto.AnalyzePlanRequest.AnalyzeCase.TREE_STRING => + inputPlan = request.getTreeString.getPlan + case proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL => + inputPlan = request.getIsLocal.getPlan + case proto.AnalyzePlanRequest.AnalyzeCase.IS_STREAMING => + inputPlan = request.getIsStreaming.getPlan + case proto.AnalyzePlanRequest.AnalyzeCase.INPUT_FILES => + inputPlan = request.getInputFiles.getPlan + case _ => inputPlan = null + } + val response = AnalyzePlanResponse + .newBuilder() + .setSessionId(requestSessionId) + .build() + responseObserver.onNext(response) + responseObserver.onCompleted() + } + + override def addArtifacts(responseObserver: StreamObserver[AddArtifactsResponse]) + : StreamObserver[AddArtifactsRequest] = new StreamObserver[AddArtifactsRequest] { + override def onNext(v: AddArtifactsRequest): Unit = inputArtifactRequests.append(v) + + override def onError(throwable: Throwable): Unit = responseObserver.onError(throwable) + + override def onCompleted(): Unit = { + responseObserver.onNext(proto.AddArtifactsResponse.newBuilder().build()) + responseObserver.onCompleted() + } + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/ConnectFunSuite.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/ConnectFunSuite.scala new file mode 100755 index 0000000000000..1ece0838b1bf4 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/ConnectFunSuite.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.connect.client.util + +import java.nio.file.Path + +import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite + +/** + * The basic testsuite the client tests should extend from. + */ +trait ConnectFunSuite extends AnyFunSuite { // scalastyle:ignore funsuite + + // Borrowed from SparkFunSuite + protected def getWorkspaceFilePath(first: String, more: String*): Path = { + if (!(sys.props.contains("spark.test.home") || sys.env.contains("SPARK_HOME"))) { + fail("spark.test.home or SPARK_HOME is not set.") + } + val sparkHome = sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) + java.nio.file.Paths.get(sparkHome, first +: more: _*) + } + + protected val baseResourcePath: Path = { + getWorkspaceFilePath( + "connector", + "connect", + "client", + "jvm", + "src", + "test", + "resources").toAbsolutePath + } + + protected val commonResourcePath: Path = { + getWorkspaceFilePath( + "connector", + "connect", + "common", + "src", + "test", + "resources", + "query-tests").toAbsolutePath + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala new file mode 100644 index 0000000000000..408caa585342c --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/IntegrationTestUtils.scala @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client.util + +import java.io.File +import java.nio.file.{Files, Paths} + +import scala.util.Properties.versionNumberString + +import org.scalatest.Assertions.fail + +object IntegrationTestUtils { + + // System properties used for testing and debugging + private val DEBUG_SC_JVM_CLIENT = "spark.debug.sc.jvm.client" + + private[sql] lazy val scalaVersion = { + versionNumberString.split('.') match { + case Array(major, minor, _*) => major + "." 
+ minor + case _ => versionNumberString + } + } + + private[sql] lazy val scalaDir = s"scala-$scalaVersion" + + private[sql] lazy val sparkHome: String = { + if (!(sys.props.contains("spark.test.home") || sys.env.contains("SPARK_HOME"))) { + fail("spark.test.home or SPARK_HOME is not set.") + } + sys.props.getOrElse("spark.test.home", sys.env("SPARK_HOME")) + } + private[connect] val isDebug = System.getProperty(DEBUG_SC_JVM_CLIENT, "false").toBoolean + + // Log server start stop debug info into console + // scalastyle:off println + private[connect] def debug(msg: String): Unit = if (isDebug) println(msg) + // scalastyle:on println + private[connect] def debug(error: Throwable): Unit = if (isDebug) error.printStackTrace() + + private[sql] lazy val isSparkHiveJarAvailable: Boolean = { + val filePath = s"$sparkHome/assembly/target/$scalaDir/jars/" + + s"spark-hive_$scalaVersion-${org.apache.spark.SPARK_VERSION}.jar" + Files.exists(Paths.get(filePath)) + } + + /** + * Find a jar in the Spark project artifacts. It requires a build first (e.g. build/sbt package, + * build/mvn clean install -DskipTests) so that this method can find the jar in the target + * folders. + * + * @return + * the jar + */ + private[sql] def findJar( + path: String, + sbtName: String, + mvnName: String, + test: Boolean = false): File = { + val targetDir = new File(new File(sparkHome, path), "target") + assert( + targetDir.exists(), + s"Fail to locate the target folder: '${targetDir.getCanonicalPath}'. " + + s"SPARK_HOME='${new File(sparkHome).getCanonicalPath}'. " + + "Make sure the spark project jars has been built (e.g. using build/sbt package)" + + "and the env variable `SPARK_HOME` is set correctly.") + val suffix = if (test) "-tests.jar" else ".jar" + val jars = recursiveListFiles(targetDir).filter { f => + // SBT jar + (f.getParentFile.getName == scalaDir && + f.getName.startsWith(sbtName) && f.getName.endsWith(suffix)) || + // Maven Jar + (f.getParent.endsWith("target") && + f.getName.startsWith(mvnName) && + f.getName.endsWith(s"${org.apache.spark.SPARK_VERSION}$suffix")) + } + // It is possible we found more than one: one built by maven, and another by SBT + assert(jars.nonEmpty, s"Failed to find the jar inside folder: ${targetDir.getCanonicalPath}") + debug("Using jar: " + jars(0).getCanonicalPath) + jars(0) // return the first jar found + } + + private def recursiveListFiles(f: File): Array[File] = { + val these = f.listFiles + these ++ these.filter(_.isDirectory).flatMap(recursiveListFiles) + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/QueryTest.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/QueryTest.scala new file mode 100644 index 0000000000000..1c3f49f897f52 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/QueryTest.scala @@ -0,0 +1,209 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.client.util + +import java.util.TimeZone + +import org.scalatest.Assertions + +import org.apache.spark.sql.{DataFrame, Row} +import org.apache.spark.sql.catalyst.util.sideBySide + +abstract class QueryTest extends RemoteSparkSession { + + /** + * Runs the plan and makes sure the answer matches the expected result. + * + * @param df + * the [[DataFrame]] to be executed + * @param expectedAnswer + * the expected result in a [[Seq]] of [[Row]]s. + */ + protected def checkAnswer(df: => DataFrame, expectedAnswer: Seq[Row]): Unit = { + QueryTest.checkAnswer(df, expectedAnswer) + } + + protected def checkAnswer(df: => DataFrame, expectedAnswer: Row): Unit = { + checkAnswer(df, Seq(expectedAnswer)) + } + + protected def checkAnswer(df: => DataFrame, expectedAnswer: DataFrame): Unit = { + checkAnswer(df, expectedAnswer.collect()) + } +} + +object QueryTest extends Assertions { + + /** + * Runs the plan and makes sure the answer matches the expected result. + * + * @param df + * the DataFrame to be executed + * @param expectedAnswer + * the expected result in a Seq of Rows. + */ + def checkAnswer(df: DataFrame, expectedAnswer: Seq[Row], isSorted: Boolean = false): Unit = { + getErrorMessageInCheckAnswer(df, expectedAnswer, isSorted) match { + case Some(errorMessage) => fail(errorMessage) + case None => + } + } + + /** + * Runs the plan and makes sure the answer matches the expected result. If there was exception + * during the execution or the contents of the DataFrame does not match the expected result, an + * error message will be returned. Otherwise, a None will be returned. + * + * @param df + * the DataFrame to be executed + * @param expectedAnswer + * the expected result in a Seq of Rows. + */ + def getErrorMessageInCheckAnswer( + df: DataFrame, + expectedAnswer: Seq[Row], + isSorted: Boolean = false): Option[String] = { + val sparkAnswer = + try df.collect().toSeq + catch { + case e: Exception => + val errorMessage = + s""" + |Exception thrown while executing query: + |${df.analyze} + |== Exception == + |$e + |${org.apache.spark.sql.catalyst.util.stackTraceToString(e)} + """.stripMargin + return Some(errorMessage) + } + + sameRows(expectedAnswer, sparkAnswer, isSorted).map { results => + s""" + |Results do not match for query: + |Timezone: ${TimeZone.getDefault} + |Timezone Env: ${sys.env.getOrElse("TZ", "")} + | + |${df.analyze} + |== Results == + |$results + """.stripMargin + } + } + + def prepareAnswer(answer: Seq[Row], isSorted: Boolean): Seq[Row] = { + // Converts data to types that we can do equality comparison using Scala collections. + // For BigDecimal type, the Scala type has a better definition of equality test (similar to + // Java's java.math.BigDecimal.compareTo). + // For binary arrays, we convert it to Seq to avoid of calling java.util.Arrays.equals for + // equality test. + val converted: Seq[Row] = answer.map(prepareRow) + if (!isSorted) converted.sortBy(_.toString()) else converted + } + + // We need to call prepareRow recursively to handle schemas with struct types. 
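  // To make the normalization concrete (values chosen for illustration only): after prepareRow,
  // a row holding a java.math.BigDecimal and a primitive array compares cleanly with plain
  // Scala collection/BigDecimal equality, e.g.
  //   prepareRow(Row(new java.math.BigDecimal("1.5"), Array(1, 2))) == Row(BigDecimal("1.5"), Seq(1, 2))
  // which would not hold for the raw row, since Array uses reference equality and
  // java.math.BigDecimal is a different type than Scala's BigDecimal.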
+ def prepareRow(row: Row): Row = { + Row.fromSeq(row.toSeq.map { + case null => null + case bd: java.math.BigDecimal => BigDecimal(bd) + // Equality of WrappedArray differs for AnyVal and AnyRef in Scala 2.12.2+ + case seq: Seq[_] => + seq.map { + case b: java.lang.Byte => b.byteValue + case s: java.lang.Short => s.shortValue + case i: java.lang.Integer => i.intValue + case l: java.lang.Long => l.longValue + case f: java.lang.Float => f.floatValue + case d: java.lang.Double => d.doubleValue + case x => x + } + // Convert array to Seq for easy equality check. + case b: Array[_] => b.toSeq + case r: Row => prepareRow(r) + case o => o + }) + } + + private def genError( + expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): String = { + val getRowType: Option[Row] => String = row => + row + .map(row => + if (row.schema == null) { + "struct<>" + } else { + s"${row.schema.catalogString}" + }) + .getOrElse("struct<>") + + s""" + |== Results == + |${sideBySide( + s"== Correct Answer - ${expectedAnswer.size} ==" +: + getRowType(expectedAnswer.headOption) +: + prepareAnswer(expectedAnswer, isSorted).map(_.toString()), + s"== Spark Answer - ${sparkAnswer.size} ==" +: + getRowType(sparkAnswer.headOption) +: + prepareAnswer(sparkAnswer, isSorted).map(_.toString())).mkString("\n")} + """.stripMargin + } + + def includesRows(expectedRows: Seq[Row], sparkAnswer: Seq[Row]): Option[String] = { + if (!prepareAnswer(expectedRows, true).toSet.subsetOf( + prepareAnswer(sparkAnswer, true).toSet)) { + return Some(genError(expectedRows, sparkAnswer, true)) + } + None + } + + def compare(obj1: Any, obj2: Any): Boolean = (obj1, obj2) match { + case (null, null) => true + case (null, _) => false + case (_, null) => false + case (a: Array[_], b: Array[_]) => + a.length == b.length && a.zip(b).forall { case (l, r) => compare(l, r) } + case (a: Map[_, _], b: Map[_, _]) => + a.size == b.size && a.keys.forall { aKey => + b.keys.find(bKey => compare(aKey, bKey)).exists(bKey => compare(a(aKey), b(bKey))) + } + case (a: Iterable[_], b: Iterable[_]) => + a.size == b.size && a.zip(b).forall { case (l, r) => compare(l, r) } + case (a: Product, b: Product) => + compare(a.productIterator.toSeq, b.productIterator.toSeq) + case (a: Row, b: Row) => + compare(a.toSeq, b.toSeq) + // 0.0 == -0.0, turn float/double to bits before comparison, to distinguish 0.0 and -0.0. + case (a: Double, b: Double) => + java.lang.Double.doubleToRawLongBits(a) == java.lang.Double.doubleToRawLongBits(b) + case (a: Float, b: Float) => + java.lang.Float.floatToRawIntBits(a) == java.lang.Float.floatToRawIntBits(b) + case (a, b) => a == b + } + + def sameRows( + expectedAnswer: Seq[Row], + sparkAnswer: Seq[Row], + isSorted: Boolean = false): Option[String] = { + if (!compare(prepareAnswer(expectedAnswer, isSorted), prepareAnswer(sparkAnswer, isSorted))) { + return Some(genError(expectedAnswer, sparkAnswer, isSorted)) + } + None + } +} diff --git a/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala new file mode 100644 index 0000000000000..43bf722020cd7 --- /dev/null +++ b/connector/connect/client/jvm/src/test/scala/org/apache/spark/sql/connect/client/util/RemoteSparkSession.scala @@ -0,0 +1,190 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.client.util + +import java.io.{BufferedOutputStream, File} +import java.util.concurrent.TimeUnit + +import scala.io.Source + +import org.scalatest.BeforeAndAfterAll +import sys.process._ + +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connect.client.SparkConnectClient +import org.apache.spark.sql.connect.client.util.IntegrationTestUtils._ +import org.apache.spark.sql.connect.common.config.ConnectCommon +import org.apache.spark.util.Utils + +/** + * An util class to start a local spark connect server in a different process for local E2E tests. + * Pre-running the tests, the spark connect artifact needs to be built using e.g. `build/sbt + * package`. It is designed to start the server once but shared by all tests. It is equivalent to + * use the following command to start the connect server via command line: + * + * {{{ + * bin/spark-shell \ + * --jars `ls connector/connect/server/target/**/spark-connect*SNAPSHOT.jar | paste -sd ',' -` \ + * --conf spark.plugins=org.apache.spark.sql.connect.SparkConnectPlugin + * }}} + * + * Set system property `spark.test.home` or env variable `SPARK_HOME` if the test is not executed + * from the Spark project top folder. Set system property `spark.debug.sc.jvm.client=true` to + * print the server process output in the console to debug server start stop problems. + */ +object SparkConnectServerUtils { + + // Server port + private[connect] val port = ConnectCommon.CONNECT_GRPC_BINDING_PORT + util.Random.nextInt(1000) + + @volatile private var stopped = false + + private var consoleOut: BufferedOutputStream = _ + private val serverStopCommand = "q" + + private lazy val sparkConnect: Process = { + debug("Starting the Spark Connect Server...") + val connectJar = findJar( + "connector/connect/server", + "spark-connect-assembly", + "spark-connect").getCanonicalPath + val driverClassPath = connectJar + ":" + + findJar("sql/catalyst", "spark-catalyst", "spark-catalyst", test = true).getCanonicalPath + val catalogImplementation = if (IntegrationTestUtils.isSparkHiveJarAvailable) { + "hive" + } else { + // scalastyle:off println + println( + "Will start Spark Connect server with `spark.sql.catalogImplementation=in-memory`, " + + "some tests that rely on Hive will be ignored. If you don't want to skip them:\n" + + "1. Test with maven: run `build/mvn install -DskipTests -Phive` before testing\n" + + "2. 
Test with sbt: run test with `-Phive` profile") + // scalastyle:on println + "in-memory" + } + val builder = Process( + Seq( + "bin/spark-submit", + "--driver-class-path", + driverClassPath, + "--conf", + s"spark.connect.grpc.binding.port=$port", + "--conf", + "spark.sql.catalog.testcat=org.apache.spark.sql.connector.catalog.InMemoryTableCatalog", + "--conf", + s"spark.sql.catalogImplementation=$catalogImplementation", + "--class", + "org.apache.spark.sql.connect.SimpleSparkConnectService", + connectJar), + new File(sparkHome)) + + val io = new ProcessIO( + in => consoleOut = new BufferedOutputStream(in), + out => Source.fromInputStream(out).getLines.foreach(debug), + err => Source.fromInputStream(err).getLines.foreach(debug)) + val process = builder.run(io) + + // Adding JVM shutdown hook + sys.addShutdownHook(stop()) + process + } + + def start(): Unit = { + assert(!stopped) + sparkConnect + } + + def stop(): Int = { + stopped = true + debug("Stopping the Spark Connect Server...") + try { + consoleOut.write(serverStopCommand.getBytes) + consoleOut.flush() + consoleOut.close() + } catch { + case e: Throwable => + debug(e) + sparkConnect.destroy() + } + + val code = sparkConnect.exitValue() + debug(s"Spark Connect Server is stopped with exit code: $code") + code + } +} + +trait RemoteSparkSession extends ConnectFunSuite with BeforeAndAfterAll { + import SparkConnectServerUtils._ + var spark: SparkSession = _ + + override def beforeAll(): Unit = { + super.beforeAll() + SparkConnectServerUtils.start() + spark = SparkSession.builder().client(SparkConnectClient.builder().port(port).build()).build() + + // Retry and wait for the server to start + val stop = System.nanoTime() + TimeUnit.MINUTES.toNanos(1) // ~1 min + var sleepInternalMs = TimeUnit.SECONDS.toMillis(1) // 1s with * 2 backoff + var success = false + val error = new RuntimeException(s"Failed to start the test server on port $port.") + + while (!success && System.nanoTime() < stop) { + try { + // Run a simple query to verify the server is really up and ready + val result = spark + .sql("select val from (values ('Hello'), ('World')) as t(val)") + .collect() + assert(result.length == 2) + success = true + debug("Spark Connect Server is up.") + } catch { + // ignored the error + case e: Throwable => + error.addSuppressed(e) + Thread.sleep(sleepInternalMs) + sleepInternalMs *= 2 + } + } + + // Throw error if failed + if (!success) { + debug(error) + throw error + } + } + + override def afterAll(): Unit = { + try { + if (spark != null) spark.stop() + } catch { + case e: Throwable => debug(e) + } + spark = null + super.afterAll() + } + + /** + * Drops table `tableName` after calling `f`. 
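  // A hypothetical end-to-end test built on the helpers in this file (the suite name, table name
  // and DDL are invented for illustration); `withTable` below takes care of the cleanup even when
  // an assertion fails:
  //
  //   class ClientE2EExampleSuite extends RemoteSparkSession {
  //     test("insert and count over Spark Connect") {
  //       withTable("test_table_example") {
  //         spark.sql("CREATE TABLE test_table_example (id INT) USING parquet").collect()
  //         spark.sql("INSERT INTO test_table_example VALUES (1), (2)").collect()
  //         val count = spark.sql("SELECT count(*) FROM test_table_example").collect().head.getLong(0)
  //         assert(count == 2L)
  //       }
  //     }
  //   }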
+ */ + protected def withTable(tableNames: String*)(f: => Unit): Unit = { + Utils.tryWithSafeFinally(f) { + tableNames.foreach { name => + spark.sql(s"DROP TABLE IF EXISTS $name").collect() + } + } + } +} diff --git a/connector/connect/common/pom.xml b/connector/connect/common/pom.xml new file mode 100644 index 0000000000000..4d4c93ef6a9d7 --- /dev/null +++ b/connector/connect/common/pom.xml @@ -0,0 +1,226 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../../pom.xml + + + spark-connect-common_2.12 + jar + Spark Project Connect Common + https://spark.apache.org/ + + connect-common + 31.0.1-jre + 1.0.1 + 1.47.0 + 6.0.53 + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + provided + + + com.google.guava + guava + + + + + org.scala-lang + scala-library + + + com.google.protobuf + protobuf-java + ${protobuf.version} + compile + + + io.grpc + grpc-netty + ${io.grpc.version} + + + io.grpc + grpc-protobuf + ${io.grpc.version} + + + io.grpc + grpc-services + ${io.grpc.version} + + + io.grpc + grpc-stub + ${io.grpc.version} + + + io.netty + netty-codec-http2 + ${netty.version} + provided + + + io.netty + netty-handler-proxy + ${netty.version} + provided + + + io.netty + netty-transport-native-unix-common + ${netty.version} + provided + + + org.apache.tomcat + annotations-api + ${tomcat.annotations.api.version} + provided + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + + + + + + kr.motd.maven + os-maven-plugin + 1.6.2 + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-sources + generate-sources + + add-source + + + + src/main/scala-${scala.binary.version} + + + + + add-scala-test-sources + generate-test-sources + + add-test-source + + + + src/test/gen-java + + + + + + + + + + default-protoc + + true + + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + 0.6.1 + + com.google.protobuf:protoc:${protobuf.version}:exe:${os.detected.classifier} + grpc-java + io.grpc:protoc-gen-grpc-java:${io.grpc.version}:exe:${os.detected.classifier} + src/main/protobuf + + + + + compile + compile-custom + test-compile + + + + + + + + + user-defined-protoc + + ${env.SPARK_PROTOC_EXEC_PATH} + ${env.CONNECT_PLUGIN_EXEC_PATH} + + + + + org.xolstice.maven.plugins + protobuf-maven-plugin + 0.6.1 + + ${spark.protoc.executable.path} + grpc-java + ${connect.plugin.executable.path} + src/main/protobuf + + + + + compile + compile-custom + test-compile + + + + + + + + + diff --git a/connector/connect/common/src/main/buf.gen.yaml b/connector/connect/common/src/main/buf.gen.yaml new file mode 100644 index 0000000000000..d74d08632fd94 --- /dev/null +++ b/connector/connect/common/src/main/buf.gen.yaml @@ -0,0 +1,36 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# +version: v1 +plugins: + - remote: buf.build/protocolbuffers/plugins/cpp:v3.20.0-1 + out: gen/proto/cpp + - remote: buf.build/protocolbuffers/plugins/csharp:v3.20.0-1 + out: gen/proto/csharp + - remote: buf.build/protocolbuffers/plugins/java:v3.20.0-1 + out: gen/proto/java + - remote: buf.build/grpc/plugins/ruby:v1.47.0-1 + out: gen/proto/ruby + - remote: buf.build/protocolbuffers/plugins/ruby:v21.2.0-1 + out: gen/proto/ruby + # Building the Python build and building the mypy interfaces. + - remote: buf.build/protocolbuffers/plugins/python:v3.19.3-1 + out: gen/proto/python + - remote: buf.build/grpc/plugins/python:v1.47.0-1 + out: gen/proto/python + - name: mypy + out: gen/proto/python + diff --git a/connector/connect/common/src/main/buf.work.yaml b/connector/connect/common/src/main/buf.work.yaml new file mode 100644 index 0000000000000..a02dead420cdf --- /dev/null +++ b/connector/connect/common/src/main/buf.work.yaml @@ -0,0 +1,19 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +version: v1 +directories: + - protobuf diff --git a/connector/connect/common/src/main/protobuf/buf.yaml b/connector/connect/common/src/main/protobuf/buf.yaml new file mode 100644 index 0000000000000..496e97af3fa0b --- /dev/null +++ b/connector/connect/common/src/main/protobuf/buf.yaml @@ -0,0 +1,23 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# +version: v1 +breaking: + use: + - FILE +lint: + use: + - DEFAULT diff --git a/connector/connect/common/src/main/protobuf/spark/connect/base.proto b/connector/connect/common/src/main/protobuf/spark/connect/base.proto new file mode 100644 index 0000000000000..530edb2d8c0bb --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/base.proto @@ -0,0 +1,551 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +import "google/protobuf/any.proto"; +import "spark/connect/commands.proto"; +import "spark/connect/common.proto"; +import "spark/connect/expressions.proto"; +import "spark/connect/relations.proto"; +import "spark/connect/types.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +// A [[Plan]] is the structure that carries the runtime information for the execution from the +// client to the server. A [[Plan]] can either be of the type [[Relation]] which is a reference +// to the underlying logical plan or it can be of the [[Command]] type that is used to execute +// commands on the server. +message Plan { + oneof op_type { + Relation root = 1; + Command command = 2; + } +} + + + +// User Context is used to refer to one particular user session that is executing +// queries in the backend. +message UserContext { + string user_id = 1; + string user_name = 2; + + // To extend the existing user context message that is used to identify incoming requests, + // Spark Connect leverages the Any protobuf type that can be used to inject arbitrary other + // messages into this message. Extensions are stored as a `repeated` type to be able to + // handle multiple active extensions. + repeated google.protobuf.Any extensions = 999; +} + +// Request to perform plan analyze, optionally to explain the plan. +message AnalyzePlanRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + string session_id = 1; + + // (Required) User context + UserContext user_context = 2; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 3; + + oneof analyze { + Schema schema = 4; + Explain explain = 5; + TreeString tree_string = 6; + IsLocal is_local = 7; + IsStreaming is_streaming = 8; + InputFiles input_files = 9; + SparkVersion spark_version = 10; + DDLParse ddl_parse = 11; + SameSemantics same_semantics = 12; + SemanticHash semantic_hash = 13; + Persist persist = 14; + Unpersist unpersist = 15; + GetStorageLevel get_storage_level = 16; + } + + message Schema { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + // Explains the input plan based on a configurable mode. + message Explain { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + + // (Required) For analyzePlan rpc calls, configure the mode to explain plan in strings. + ExplainMode explain_mode = 2; + + // Plan explanation mode. 
+ enum ExplainMode { + EXPLAIN_MODE_UNSPECIFIED = 0; + + // Generates only physical plan. + EXPLAIN_MODE_SIMPLE = 1; + + // Generates parsed logical plan, analyzed logical plan, optimized logical plan and physical plan. + // Parsed Logical plan is a unresolved plan that extracted from the query. Analyzed logical plans + // transforms which translates unresolvedAttribute and unresolvedRelation into fully typed objects. + // The optimized logical plan transforms through a set of optimization rules, resulting in the + // physical plan. + EXPLAIN_MODE_EXTENDED = 2; + + // Generates code for the statement, if any and a physical plan. + EXPLAIN_MODE_CODEGEN = 3; + + // If plan node statistics are available, generates a logical plan and also the statistics. + EXPLAIN_MODE_COST = 4; + + // Generates a physical plan outline and also node details. + EXPLAIN_MODE_FORMATTED = 5; + } + } + + message TreeString { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message IsLocal { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message IsStreaming { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message InputFiles { + // (Required) The logical plan to be analyzed. + Plan plan = 1; + } + + message SparkVersion { } + + message DDLParse { + // (Required) The DDL formatted string to be parsed. + string ddl_string = 1; + } + + + // Returns `true` when the logical query plans are equal and therefore return same results. + message SameSemantics { + // (Required) The plan to be compared. + Plan target_plan = 1; + + // (Required) The other plan to be compared. + Plan other_plan = 2; + } + + message SemanticHash { + // (Required) The logical plan to get a hashCode. + Plan plan = 1; + } + + message Persist { + // (Required) The logical plan to persist. + Relation relation = 1; + + // (Optional) The storage level. + optional StorageLevel storage_level = 2; + } + + message Unpersist { + // (Required) The logical plan to unpersist. + Relation relation = 1; + + // (Optional) Whether to block until all blocks are deleted. + optional bool blocking = 2; + } + + message GetStorageLevel { + // (Required) The logical plan to get the storage level. + Relation relation = 1; + } +} + +// Response to performing analysis of the query. Contains relevant metadata to be able to +// reason about the performance. 
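// A sketch of how a client could fill in the AnalyzePlanRequest defined above, assuming the
// standard protobuf-java builders generated into org.apache.spark.connect.proto; the session id
// and plan arguments are placeholders supplied by the caller.
import org.apache.spark.connect.proto

object AnalyzeRequestSketch {
  // Asks the server for an extended explain string of `plan`.
  def extendedExplain(sessionId: String, plan: proto.Plan): proto.AnalyzePlanRequest =
    proto.AnalyzePlanRequest
      .newBuilder()
      .setSessionId(sessionId)
      .setExplain(
        proto.AnalyzePlanRequest.Explain
          .newBuilder()
          .setPlan(plan)
          .setExplainMode(proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED))
      .build()
}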
+message AnalyzePlanResponse { + string session_id = 1; + + oneof result { + Schema schema = 2; + Explain explain = 3; + TreeString tree_string = 4; + IsLocal is_local = 5; + IsStreaming is_streaming = 6; + InputFiles input_files = 7; + SparkVersion spark_version = 8; + DDLParse ddl_parse = 9; + SameSemantics same_semantics = 10; + SemanticHash semantic_hash = 11; + Persist persist = 12; + Unpersist unpersist = 13; + GetStorageLevel get_storage_level = 14; + } + + message Schema { + DataType schema = 1; + } + + message Explain { + string explain_string = 1; + } + + message TreeString { + string tree_string = 1; + } + + message IsLocal { + bool is_local = 1; + } + + message IsStreaming { + bool is_streaming = 1; + } + + message InputFiles { + // A best-effort snapshot of the files that compose this Dataset + repeated string files = 1; + } + + message SparkVersion { + string version = 1; + } + + message DDLParse { + DataType parsed = 1; + } + + message SameSemantics { + bool result = 1; + } + + message SemanticHash { + int32 result = 1; + } + + message Persist { } + + message Unpersist { } + + message GetStorageLevel { + // (Required) The StorageLevel as a result of get_storage_level request. + StorageLevel storage_level = 1; + } +} + +// A request to be executed by the service. +message ExecutePlanRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + string session_id = 1; + + // (Required) User context + // + // user_context.user_id and session+id both identify a unique remote spark session on the + // server side. + UserContext user_context = 2; + + // (Required) The logical plan to be executed / analyzed. + Plan plan = 3; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 4; +} + +// The response of a query, can be one or more for each request. Responses belonging to the +// same input query, carry the same `session_id`. +message ExecutePlanResponse { + string session_id = 1; + + // Union type for the different response messages. + oneof response_type { + ArrowBatch arrow_batch = 2; + + // Special case for executing SQL commands. + SqlCommandResult sql_command_result = 5; + + // Support arbitrary result objects. + google.protobuf.Any extension = 999; + } + + // Metrics for the query execution. Typically, this field is only present in the last + // batch of results and then represent the overall state of the query execution. + Metrics metrics = 4; + + // The metrics observed during the execution of the query plan. + repeated ObservedMetrics observed_metrics = 6; + + // (Optional) The Spark schema. This field is available when `collect` is called. + DataType schema = 7; + + // A SQL command returns an opaque Relation that can be directly used as input for the next + // call. + message SqlCommandResult { + Relation relation = 1; + } + + // Batch results of metrics. 
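// A sketch of how a client could drain the server-streamed ExecutePlanResponse messages,
// assuming the blocking stub that grpc-java generates for SparkConnectService; the stub and
// request are placeholders supplied by the caller.
import org.apache.spark.connect.proto
import org.apache.spark.connect.proto.SparkConnectServiceGrpc.SparkConnectServiceBlockingStub

object ExecutePlanSketch {
  // Sums row_count over every ArrowBatch response; responses carrying only metrics are skipped.
  def totalRows(stub: SparkConnectServiceBlockingStub, request: proto.ExecutePlanRequest): Long = {
    val responses = stub.executePlan(request) // server-streaming call: one or more responses
    var rows = 0L
    while (responses.hasNext) {
      val response = responses.next()
      if (response.hasArrowBatch) rows += response.getArrowBatch.getRowCount
    }
    rows
  }
}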
+ message ArrowBatch { + int64 row_count = 1; + bytes data = 2; + } + + message Metrics { + + repeated MetricObject metrics = 1; + + message MetricObject { + string name = 1; + int64 plan_id = 2; + int64 parent = 3; + map<string, MetricValue> execution_metrics = 4; + } + + message MetricValue { + string name = 1; + int64 value = 2; + string metric_type = 3; + } + } + + message ObservedMetrics { + string name = 1; + repeated Expression.Literal values = 2; + } +} + +// The key-value pair for the config request and response. +message KeyValue { + // (Required) The key. + string key = 1; + // (Optional) The value. + optional string value = 2; +} + +// Request to update or fetch the configurations. +message ConfigRequest { + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + string session_id = 1; + + // (Required) User context + UserContext user_context = 2; + + // (Required) The operation for the config. + Operation operation = 3; + + // Provides optional information about the client sending the request. This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 4; + + message Operation { + oneof op_type { + Set set = 1; + Get get = 2; + GetWithDefault get_with_default = 3; + GetOption get_option = 4; + GetAll get_all = 5; + Unset unset = 6; + IsModifiable is_modifiable = 7; + } + } + + message Set { + // (Required) The config key-value pairs to set. + repeated KeyValue pairs = 1; + } + + message Get { + // (Required) The config keys to get. + repeated string keys = 1; + } + + message GetWithDefault { + // (Required) The config key-value pairs to get. The value will be used as the default value. + repeated KeyValue pairs = 1; + } + + message GetOption { + // (Required) The config keys to get optionally. + repeated string keys = 1; + } + + message GetAll { + // (Optional) The prefix of the config key to get. + optional string prefix = 1; + } + + message Unset { + // (Required) The config keys to unset. + repeated string keys = 1; + } + + message IsModifiable { + // (Required) The config keys to check whether they are modifiable. + repeated string keys = 1; + } +} + +// Response to the config request. +message ConfigResponse { + string session_id = 1; + + // (Optional) The result key-value pairs. + // + // Available when the operation is 'Get', 'GetWithDefault', 'GetOption', 'GetAll'. + // Also available for the operation 'IsModifiable' with boolean string "true" and "false". + repeated KeyValue pairs = 2; + + // (Optional) + // + // Warning messages for deprecated or unsupported configurations. + repeated string warnings = 3; +} + +// Request to transfer client-local artifacts. +message AddArtifactsRequest { + + // (Required) + // + // The session_id specifies a spark session for a user id (which is specified + // by user_context.user_id). The session_id is set by the client to be able to + // collate streaming responses from different queries within the dedicated session. + string session_id = 1; + + // User context + UserContext user_context = 2; + + // Provides optional information about the client sending the request.
This field + // can be used for language or version specific information and is only intended for + // logging purposes and will not be interpreted by the server. + optional string client_type = 6; + + // A chunk of an Artifact. + message ArtifactChunk { + // Data chunk. + bytes data = 1; + // CRC to allow server to verify integrity of the chunk. + int64 crc = 2; + } + + // An artifact that is contained in a single `ArtifactChunk`. + // Generally, this message represents tiny artifacts such as REPL-generated class files. + message SingleChunkArtifact { + // The name of the artifact is expected in the form of a "Relative Path" that is made up of a + // sequence of directories and the final file element. + // Examples of "Relative Path"s: "jars/test.jar", "classes/xyz.class", "abc.xyz", "a/b/X.jar". + // The server is expected to maintain the hierarchy of files as defined by their name. (i.e + // The relative path of the file on the server's filesystem will be the same as the name of + // the provided artifact) + string name = 1; + // A single data chunk. + ArtifactChunk data = 2; + } + + // A number of `SingleChunkArtifact` batched into a single RPC. + message Batch { + repeated SingleChunkArtifact artifacts = 1; + } + + // Signals the beginning/start of a chunked artifact. + // A large artifact is transferred through a payload of `BeginChunkedArtifact` followed by a + // sequence of `ArtifactChunk`s. + message BeginChunkedArtifact { + // Name of the artifact undergoing chunking. Follows the same conventions as the `name` in + // the `Artifact` message. + string name = 1; + // Total size of the artifact in bytes. + int64 total_bytes = 2; + // Number of chunks the artifact is split into. + // This includes the `initial_chunk`. + int64 num_chunks = 3; + // The first/initial chunk. + ArtifactChunk initial_chunk = 4; + } + + // The payload is either a batch of artifacts or a partial chunk of a large artifact. + oneof payload { + Batch batch = 3; + // The metadata and the initial chunk of a large artifact chunked into multiple requests. + // The server side is notified about the total size of the large artifact as well as the + // number of chunks to expect. + BeginChunkedArtifact begin_chunk = 4; + // A chunk of an artifact excluding metadata. This can be any chunk of a large artifact + // excluding the first chunk (which is included in `BeginChunkedArtifact`). + ArtifactChunk chunk = 5; + } +} + +// Response to adding an artifact. Contains relevant metadata to verify successful transfer of +// artifact(s). +message AddArtifactsResponse { + // Metadata of an artifact. + message ArtifactSummary { + string name = 1; + // Whether the CRC (Cyclic Redundancy Check) is successful on server verification. + // The server discards any artifact that fails the CRC. + // If false, the client may choose to resend the artifact specified by `name`. + bool is_crc_successful = 2; + } + + // The list of artifact(s) seen by the server. + repeated ArtifactSummary artifacts = 1; +} + +// Main interface for the SparkConnect service. +service SparkConnectService { + + // Executes a request that contains the query and returns a stream of [[Response]]. + // + // It is guaranteed that there is at least one ARROW batch returned even if the result set is empty. + rpc ExecutePlan(ExecutePlanRequest) returns (stream ExecutePlanResponse) {} + + // Analyzes a query and returns a [[AnalyzeResponse]] containing metadata about the query. 
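The remaining RPCs (AnalyzePlan, Config, AddArtifacts) follow below. For the streaming ExecutePlan call above, a client consumes the responses roughly as sketched here; the stub class name SparkConnectServiceGrpc and its blocking-stub/iterator shape come from the standard grpc-java codegen for this service, and localhost:15002 is assumed as the endpoint, so treat the snippet as illustrative rather than as the official client.

    import io.grpc.ManagedChannelBuilder
    import org.apache.spark.connect.proto
    import org.apache.spark.connect.proto.ExecutePlanResponse.ResponseTypeCase

    // Collect the Arrow IPC payloads of a query; channel shutdown and error handling omitted.
    def collectArrowBatches(request: proto.ExecutePlanRequest): Seq[Array[Byte]] = {
      val channel = ManagedChannelBuilder.forAddress("localhost", 15002).usePlaintext().build()
      val stub = proto.SparkConnectServiceGrpc.newBlockingStub(channel)
      // ExecutePlan is server-streaming, so the blocking stub hands back a Java iterator.
      val responses = stub.executePlan(request)
      val batches = scala.collection.mutable.ArrayBuffer.empty[Array[Byte]]
      while (responses.hasNext) {
        val r = responses.next()
        if (r.getResponseTypeCase == ResponseTypeCase.ARROW_BATCH) {
          batches += r.getArrowBatch.getData.toByteArray
        }
      }
      batches.toSeq
    }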
+ rpc AnalyzePlan(AnalyzePlanRequest) returns (AnalyzePlanResponse) {} + + // Update or fetch the configurations and returns a [[ConfigResponse]] containing the result. + rpc Config(ConfigRequest) returns (ConfigResponse) {} + + // Add artifacts to the session and returns a [[AddArtifactsResponse]] containing metadata about + // the added artifacts. + rpc AddArtifacts(stream AddArtifactsRequest) returns (AddArtifactsResponse) {} +} + diff --git a/connector/connect/common/src/main/protobuf/spark/connect/catalog.proto b/connector/connect/common/src/main/protobuf/spark/connect/catalog.proto new file mode 100644 index 0000000000000..b49be901526ba --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/catalog.proto @@ -0,0 +1,228 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +import "spark/connect/types.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +// Catalog messages are marked as unstable. 
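Before the catalog messages that follow, here is a sketch of the unary Config round trip defined above: the request wraps one Operation (a Set in this case), and the server answers with a ConfigResponse whose pairs and warnings fields report the outcome. Builder names follow the standard protobuf-java codegen; sessionId and userCtx are assumed to come from the client's session state.

    import org.apache.spark.connect.proto

    // Build a ConfigRequest that sets a single conf entry.
    def setShufflePartitions(sessionId: String, userCtx: proto.UserContext): proto.ConfigRequest =
      proto.ConfigRequest
        .newBuilder()
        .setSessionId(sessionId)
        .setUserContext(userCtx)
        .setOperation(
          proto.ConfigRequest.Operation
            .newBuilder()
            .setSet(
              proto.ConfigRequest.Set
                .newBuilder()
                .addPairs(
                  proto.KeyValue
                    .newBuilder()
                    .setKey("spark.sql.shuffle.partitions")
                    .setValue("8"))))
        .build()

    // val resp = stub.config(setShufflePartitions(sessionId, userCtx))
    // resp.getWarningsList reports deprecated or unsupported keys, if any.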
+message Catalog { + oneof cat_type { + CurrentDatabase current_database = 1; + SetCurrentDatabase set_current_database = 2; + ListDatabases list_databases = 3; + ListTables list_tables = 4; + ListFunctions list_functions = 5; + ListColumns list_columns = 6; + GetDatabase get_database = 7; + GetTable get_table = 8; + GetFunction get_function = 9; + DatabaseExists database_exists = 10; + TableExists table_exists = 11; + FunctionExists function_exists = 12; + CreateExternalTable create_external_table = 13; + CreateTable create_table = 14; + DropTempView drop_temp_view = 15; + DropGlobalTempView drop_global_temp_view = 16; + RecoverPartitions recover_partitions = 17; + IsCached is_cached = 18; + CacheTable cache_table = 19; + UncacheTable uncache_table = 20; + ClearCache clear_cache = 21; + RefreshTable refresh_table = 22; + RefreshByPath refresh_by_path = 23; + CurrentCatalog current_catalog = 24; + SetCurrentCatalog set_current_catalog = 25; + ListCatalogs list_catalogs = 26; + } +} + +// See `spark.catalog.currentDatabase` +message CurrentDatabase { } + +// See `spark.catalog.setCurrentDatabase` +message SetCurrentDatabase { + // (Required) + string db_name = 1; +} + +// See `spark.catalog.listDatabases` +message ListDatabases { } + +// See `spark.catalog.listTables` +message ListTables { + // (Optional) + optional string db_name = 1; +} + +// See `spark.catalog.listFunctions` +message ListFunctions { + // (Optional) + optional string db_name = 1; +} + +// See `spark.catalog.listColumns` +message ListColumns { + // (Required) + string table_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.getDatabase` +message GetDatabase { + // (Required) + string db_name = 1; +} + +// See `spark.catalog.getTable` +message GetTable { + // (Required) + string table_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.getFunction` +message GetFunction { + // (Required) + string function_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.databaseExists` +message DatabaseExists { + // (Required) + string db_name = 1; +} + +// See `spark.catalog.tableExists` +message TableExists { + // (Required) + string table_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.functionExists` +message FunctionExists { + // (Required) + string function_name = 1; + // (Optional) + optional string db_name = 2; +} + +// See `spark.catalog.createExternalTable` +message CreateExternalTable { + // (Required) + string table_name = 1; + // (Optional) + optional string path = 2; + // (Optional) + optional string source = 3; + // (Optional) + optional DataType schema = 4; + // Options could be empty for valid data source format. + // The map key is case insensitive. + map options = 5; +} + +// See `spark.catalog.createTable` +message CreateTable { + // (Required) + string table_name = 1; + // (Optional) + optional string path = 2; + // (Optional) + optional string source = 3; + // (Optional) + optional string description = 4; + // (Optional) + optional DataType schema = 5; + // Options could be empty for valid data source format. + // The map key is case insensitive. 
+ map options = 6; +} + +// See `spark.catalog.dropTempView` +message DropTempView { + // (Required) + string view_name = 1; +} + +// See `spark.catalog.dropGlobalTempView` +message DropGlobalTempView { + // (Required) + string view_name = 1; +} + +// See `spark.catalog.recoverPartitions` +message RecoverPartitions { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.isCached` +message IsCached { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.cacheTable` +message CacheTable { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.uncacheTable` +message UncacheTable { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.clearCache` +message ClearCache { } + +// See `spark.catalog.refreshTable` +message RefreshTable { + // (Required) + string table_name = 1; +} + +// See `spark.catalog.refreshByPath` +message RefreshByPath { + // (Required) + string path = 1; +} + +// See `spark.catalog.currentCatalog` +message CurrentCatalog { } + +// See `spark.catalog.setCurrentCatalog` +message SetCurrentCatalog { + // (Required) + string catalog_name = 1; +} + +// See `spark.catalog.listCatalogs` +message ListCatalogs { } diff --git a/connector/connect/common/src/main/protobuf/spark/connect/commands.proto b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto new file mode 100644 index 0000000000000..5af6ef5bbad04 --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/commands.proto @@ -0,0 +1,176 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +import "google/protobuf/any.proto"; +import "spark/connect/expressions.proto"; +import "spark/connect/relations.proto"; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +// A [[Command]] is an operation that is executed by the server that does not directly consume or +// produce a relational result. +message Command { + oneof command_type { + CommonInlineUserDefinedFunction register_function = 1; + WriteOperation write_operation = 2; + CreateDataFrameViewCommand create_dataframe_view = 3; + WriteOperationV2 write_operation_v2 = 4; + SqlCommand sql_command = 5; + + // This field is used to mark extensions to the protocol. When plugins generate arbitrary + // Commands they can add them here. During the planning the correct resolution is done. + google.protobuf.Any extension = 999; + + } +} + +// A SQL Command is used to trigger the eager evaluation of SQL commands in Spark. +// +// When the SQL provide as part of the message is a command it will be immediately evaluated +// and the result will be collected and returned as part of a LocalRelation. 
If the result is +// not a command, the operation will simply return a SQL Relation. This allows the client to be +// almost oblivious to the server-side behavior. +message SqlCommand { + // (Required) SQL Query. + string sql = 1; + + // (Optional) A map of parameter names to literal expressions. + map args = 2; +} + +// A command that can create DataFrame global temp view or local temp view. +message CreateDataFrameViewCommand { + // (Required) The relation that this view will be built on. + Relation input = 1; + + // (Required) View name. + string name = 2; + + // (Required) Whether this is global temp view or local temp view. + bool is_global = 3; + + // (Required) + // + // If true, and if the view already exists, updates it; if false, and if the view + // already exists, throws exception. + bool replace = 4; +} + +// As writes are not directly handled during analysis and planning, they are modeled as commands. +message WriteOperation { + // (Required) The output of the `input` relation will be persisted according to the options. + Relation input = 1; + + // (Optional) Format value according to the Spark documentation. Examples are: text, parquet, delta. + optional string source = 2; + + // (Optional) + // + // The destination of the write operation can be either a path or a table. + // If the destination is neither a path nor a table, such as jdbc and noop, + // the `save_type` should not be set. + oneof save_type { + string path = 3; + SaveTable table = 4; + } + + // (Required) the save mode. + SaveMode mode = 5; + + // (Optional) List of columns to sort the output by. + repeated string sort_column_names = 6; + + // (Optional) List of columns for partitioning. + repeated string partitioning_columns = 7; + + // (Optional) Bucketing specification. Bucketing must set the number of buckets and the columns + // to bucket by. + BucketBy bucket_by = 8; + + // (Optional) A list of configuration options. + map options = 9; + + message SaveTable { + // (Required) The table name. + string table_name = 1; + // (Required) The method to be called to write to the table. + TableSaveMethod save_method = 2; + + enum TableSaveMethod { + TABLE_SAVE_METHOD_UNSPECIFIED = 0; + TABLE_SAVE_METHOD_SAVE_AS_TABLE = 1; + TABLE_SAVE_METHOD_INSERT_INTO = 2; + } + } + + message BucketBy { + repeated string bucket_column_names = 1; + int32 num_buckets = 2; + } + + enum SaveMode { + SAVE_MODE_UNSPECIFIED = 0; + SAVE_MODE_APPEND = 1; + SAVE_MODE_OVERWRITE = 2; + SAVE_MODE_ERROR_IF_EXISTS = 3; + SAVE_MODE_IGNORE = 4; + } +} + +// As writes are not directly handled during analysis and planning, they are modeled as commands. +message WriteOperationV2 { + // (Required) The output of the `input` relation will be persisted according to the options. + Relation input = 1; + + // (Required) The destination of the write operation must be either a path or a table. + string table_name = 2; + + // (Optional) A provider for the underlying output data source. Spark's default catalog supports + // "parquet", "json", etc. + optional string provider = 3; + + // (Optional) List of columns for partitioning for output table created by `create`, + // `createOrReplace`, or `replace` + repeated Expression partitioning_columns = 4; + + // (Optional) A list of configuration options. + map options = 5; + + // (Optional) A list of table properties. + map table_properties = 6; + + // (Required) Write mode. 
+ Mode mode = 7; + + enum Mode { + MODE_UNSPECIFIED = 0; + MODE_CREATE = 1; + MODE_OVERWRITE = 2; + MODE_OVERWRITE_PARTITIONS = 3; + MODE_APPEND = 4; + MODE_REPLACE = 5; + MODE_CREATE_OR_REPLACE = 6; + } + + // (Optional) A condition for overwrite saving mode + Expression overwrite_condition = 8; +} diff --git a/connector/connect/common/src/main/protobuf/spark/connect/common.proto b/connector/connect/common/src/main/protobuf/spark/connect/common.proto new file mode 100644 index 0000000000000..342588ea38411 --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/common.proto @@ -0,0 +1,37 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +// StorageLevel for persisting Datasets/Tables. +message StorageLevel { + // (Required) Whether the cache should use disk or not. + bool use_disk = 1; + // (Required) Whether the cache should use memory or not. + bool use_memory = 2; + // (Required) Whether the cache should use off-heap or not. + bool use_off_heap = 3; + // (Required) Whether the cached data is deserialized or not. + bool deserialized = 4; + // (Required) The number of replicas. + int32 replication = 5; +} diff --git a/connector/connect/common/src/main/protobuf/spark/connect/example_plugins.proto b/connector/connect/common/src/main/protobuf/spark/connect/example_plugins.proto new file mode 100644 index 0000000000000..03208c7a4392f --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/example_plugins.proto @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = 'proto3'; + +import "spark/connect/relations.proto"; +import "spark/connect/expressions.proto"; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +message ExamplePluginRelation { + Relation input = 1; + string custom_field = 2; + +} + +message ExamplePluginExpression { + Expression child = 1; + string custom_field = 2; +} + +message ExamplePluginCommand { + string custom_field = 1; +} \ No newline at end of file diff --git a/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto b/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto new file mode 100644 index 0000000000000..af67f10e05f04 --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/expressions.proto @@ -0,0 +1,358 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +import "google/protobuf/any.proto"; +import "spark/connect/types.proto"; + +package spark.connect; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +// Expression used to refer to fields, functions and similar. This can be used everywhere +// expressions in SQL appear. +message Expression { + + oneof expr_type { + Literal literal = 1; + UnresolvedAttribute unresolved_attribute = 2; + UnresolvedFunction unresolved_function = 3; + ExpressionString expression_string = 4; + UnresolvedStar unresolved_star = 5; + Alias alias = 6; + Cast cast = 7; + UnresolvedRegex unresolved_regex = 8; + SortOrder sort_order = 9; + LambdaFunction lambda_function = 10; + Window window = 11; + UnresolvedExtractValue unresolved_extract_value = 12; + UpdateFields update_fields = 13; + UnresolvedNamedLambdaVariable unresolved_named_lambda_variable = 14; + CommonInlineUserDefinedFunction common_inline_user_defined_function = 15; + + // This field is used to mark extensions to the protocol. When plugins generate arbitrary + // relations they can add them here. During the planning the correct resolution is done. + google.protobuf.Any extension = 999; + } + + + // Expression for the OVER clause or WINDOW clause. + message Window { + + // (Required) The window function. + Expression window_function = 1; + + // (Optional) The way that input rows are partitioned. + repeated Expression partition_spec = 2; + + // (Optional) Ordering of rows in a partition. + repeated SortOrder order_spec = 3; + + // (Optional) Window frame in a partition. + // + // If not set, it will be treated as 'UnspecifiedFrame'. + WindowFrame frame_spec = 4; + + // The window frame + message WindowFrame { + + // (Required) The type of the frame. + FrameType frame_type = 1; + + // (Required) The lower bound of the frame. 
+ FrameBoundary lower = 2; + + // (Required) The upper bound of the frame. + FrameBoundary upper = 3; + + enum FrameType { + FRAME_TYPE_UNDEFINED = 0; + + // RowFrame treats rows in a partition individually. + FRAME_TYPE_ROW = 1; + + // RangeFrame treats rows in a partition as groups of peers. + // All rows having the same 'ORDER BY' ordering are considered as peers. + FRAME_TYPE_RANGE = 2; + } + + message FrameBoundary { + oneof boundary { + // CURRENT ROW boundary + bool current_row = 1; + + // UNBOUNDED boundary. + // For lower bound, it will be converted to 'UnboundedPreceding'. + // for upper bound, it will be converted to 'UnboundedFollowing'. + bool unbounded = 2; + + // This is an expression for future proofing. We are expecting literals on the server side. + Expression value = 3; + } + } + } + } + + // SortOrder is used to specify the data ordering, it is normally used in Sort and Window. + // It is an unevaluable expression and cannot be evaluated, so can not be used in Projection. + message SortOrder { + // (Required) The expression to be sorted. + Expression child = 1; + + // (Required) The sort direction, should be ASCENDING or DESCENDING. + SortDirection direction = 2; + + // (Required) How to deal with NULLs, should be NULLS_FIRST or NULLS_LAST. + NullOrdering null_ordering = 3; + + enum SortDirection { + SORT_DIRECTION_UNSPECIFIED = 0; + SORT_DIRECTION_ASCENDING = 1; + SORT_DIRECTION_DESCENDING = 2; + } + + enum NullOrdering { + SORT_NULLS_UNSPECIFIED = 0; + SORT_NULLS_FIRST = 1; + SORT_NULLS_LAST = 2; + } + } + + message Cast { + // (Required) the expression to be casted. + Expression expr = 1; + + // (Required) the data type that the expr to be casted to. + oneof cast_to_type { + DataType type = 2; + // If this is set, Server will use Catalyst parser to parse this string to DataType. + string type_str = 3; + } + } + + message Literal { + oneof literal_type { + DataType null = 1; + bytes binary = 2; + bool boolean = 3; + + int32 byte = 4; + int32 short = 5; + int32 integer = 6; + int64 long = 7; + float float = 10; + double double = 11; + Decimal decimal = 12; + + string string = 13; + + // Date in units of days since the UNIX epoch. + int32 date = 16; + // Timestamp in units of microseconds since the UNIX epoch. + int64 timestamp = 17; + // Timestamp in units of microseconds since the UNIX epoch (without timezone information). + int64 timestamp_ntz = 18; + + CalendarInterval calendar_interval = 19; + int32 year_month_interval = 20; + int64 day_time_interval = 21; + Array array = 22; + } + + message Decimal { + // the string representation. + string value = 1; + // The maximum number of digits allowed in the value. + // the maximum precision is 38. + optional int32 precision = 2; + // declared scale of decimal literal + optional int32 scale = 3; + } + + message CalendarInterval { + int32 months = 1; + int32 days = 2; + int64 microseconds = 3; + } + + message Array { + DataType element_type = 1; + repeated Literal elements = 2; + } + } + + // An unresolved attribute that is not explicitly bound to a specific column, but the column + // is resolved during analysis by name. + message UnresolvedAttribute { + // (Required) An identifier that will be parsed by Catalyst parser. This should follow the + // Spark SQL identifier syntax. + string unparsed_identifier = 1; + + // (Optional) The id of corresponding connect plan. 
+ optional int64 plan_id = 2; + } + + // An unresolved function is not explicitly bound to one explicit function, but the function + // is resolved during analysis following Sparks name resolution rules. + message UnresolvedFunction { + // (Required) name (or unparsed name for user defined function) for the unresolved function. + string function_name = 1; + + // (Optional) Function arguments. Empty arguments are allowed. + repeated Expression arguments = 2; + + // (Required) Indicate if this function should be applied on distinct values. + bool is_distinct = 3; + + // (Required) Indicate if this is a user defined function. + // + // When it is not a user defined function, Connect will use the function name directly. + // When it is a user defined function, Connect will parse the function name first. + bool is_user_defined_function = 4; + } + + // Expression as string. + message ExpressionString { + // (Required) A SQL expression that will be parsed by Catalyst parser. + string expression = 1; + } + + // UnresolvedStar is used to expand all the fields of a relation or struct. + message UnresolvedStar { + + // (Optional) The target of the expansion. + // + // If set, it should end with '.*' and will be parsed by 'parseAttributeName' + // in the server side. + optional string unparsed_target = 1; + } + + // Represents all of the input attributes to a given relational operator, for example in + // "SELECT `(id)?+.+` FROM ...". + message UnresolvedRegex { + // (Required) The column name used to extract column with regex. + string col_name = 1; + + // (Optional) The id of corresponding connect plan. + optional int64 plan_id = 2; + } + + // Extracts a value or values from an Expression + message UnresolvedExtractValue { + // (Required) The expression to extract value from, can be + // Map, Array, Struct or array of Structs. + Expression child = 1; + + // (Required) The expression to describe the extraction, can be + // key of Map, index of Array, field name of Struct. + Expression extraction = 2; + } + + // Add, replace or drop a field of `StructType` expression by name. + message UpdateFields { + // (Required) The struct expression. + Expression struct_expression = 1; + + // (Required) The field name. + string field_name = 2; + + // (Optional) The expression to add or replace. + // + // When not set, it means this field will be dropped. + Expression value_expression = 3; + } + + message Alias { + // (Required) The expression that alias will be added on. + Expression expr = 1; + + // (Required) a list of name parts for the alias. + // + // Scalar columns only has one name that presents. + repeated string name = 2; + + // (Optional) Alias metadata expressed as a JSON map. + optional string metadata = 3; + } + + message LambdaFunction { + // (Required) The lambda function. + // + // The function body should use 'UnresolvedAttribute' as arguments, the sever side will + // replace 'UnresolvedAttribute' with 'UnresolvedNamedLambdaVariable'. + Expression function = 1; + + // (Required) Function variables. Must contains 1 ~ 3 variables. + repeated Expression.UnresolvedNamedLambdaVariable arguments = 2; + } + + message UnresolvedNamedLambdaVariable { + + // (Required) a list of name parts for the variable. Must not be empty. + repeated string name_parts = 1; + } +} + +message CommonInlineUserDefinedFunction { + // (Required) Name of the user-defined function. + string function_name = 1; + // (Optional) Indicate if the user-defined function is deterministic. 
+ bool deterministic = 2; + // (Optional) Function arguments. Empty arguments are allowed. + repeated Expression arguments = 3; + // (Required) Indicate the function type of the user-defined function. + oneof function { + PythonUDF python_udf = 4; + ScalarScalaUDF scalar_scala_udf = 5; + JavaUDF java_udf = 6; + } +} + +message PythonUDF { + // (Required) Output type of the Python UDF + DataType output_type = 1; + // (Required) EvalType of the Python UDF + int32 eval_type = 2; + // (Required) The encoded commands of the Python UDF + bytes command = 3; + // (Required) Python version being used in the client. + string python_ver = 4; +} + +message ScalarScalaUDF { + // (Required) Serialized JVM object containing UDF definition, input encoders and output encoder + bytes payload = 1; + // (Optional) Input type(s) of the UDF + repeated DataType inputTypes = 2; + // (Required) Output type of the UDF + DataType outputType = 3; + // (Required) True if the UDF can return null value + bool nullable = 4; +} + +message JavaUDF { + // (Required) Fully qualified name of Java class + string class_name = 1; + + // (Optional) Output type of the Java UDF + optional DataType output_type = 2; + + // (Required) Indicate if the Java user-defined function is an aggregate function + bool aggregate = 3; +} diff --git a/connector/connect/common/src/main/protobuf/spark/connect/relations.proto b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto new file mode 100644 index 0000000000000..68ce84f2cbed4 --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/relations.proto @@ -0,0 +1,852 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = 'proto3'; + +package spark.connect; + +import "google/protobuf/any.proto"; +import "spark/connect/expressions.proto"; +import "spark/connect/types.proto"; +import "spark/connect/catalog.proto"; + +option java_multiple_files = true; +option java_package = "org.apache.spark.connect.proto"; + +// The main [[Relation]] type. Fundamentally, a relation is a typed container +// that has exactly one explicit relation type set. +// +// When adding new relation types, they have to be registered here. 
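Before the Relation container that follows, a short sketch of the UDF envelope defined above: a CommonInlineUserDefinedFunction carries the name, determinism flag and exactly one concrete implementation, here a JavaUDF whose class the server is expected to find on its own classpath. The class name com.example.MyUpper is purely hypothetical, and the builder names assume the standard protobuf-java codegen.

    import org.apache.spark.connect.proto

    // A Java UDF registration payload; the server resolves the class on its classpath.
    val javaUpper: proto.CommonInlineUserDefinedFunction =
      proto.CommonInlineUserDefinedFunction
        .newBuilder()
        .setFunctionName("my_upper")
        .setDeterministic(true)
        .setJavaUdf(
          proto.JavaUDF
            .newBuilder()
            .setClassName("com.example.MyUpper") // hypothetical class implementing the UDF
            .setAggregate(false))
        .build()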
+message Relation { + RelationCommon common = 1; + oneof rel_type { + Read read = 2; + Project project = 3; + Filter filter = 4; + Join join = 5; + SetOperation set_op = 6; + Sort sort = 7; + Limit limit = 8; + Aggregate aggregate = 9; + SQL sql = 10; + LocalRelation local_relation = 11; + Sample sample = 12; + Offset offset = 13; + Deduplicate deduplicate = 14; + Range range = 15; + SubqueryAlias subquery_alias = 16; + Repartition repartition = 17; + ToDF to_df = 18; + WithColumnsRenamed with_columns_renamed = 19; + ShowString show_string = 20; + Drop drop = 21; + Tail tail = 22; + WithColumns with_columns = 23; + Hint hint = 24; + Unpivot unpivot = 25; + ToSchema to_schema = 26; + RepartitionByExpression repartition_by_expression = 27; + MapPartitions map_partitions = 28; + CollectMetrics collect_metrics = 29; + Parse parse = 30; + GroupMap group_map = 31; + CoGroupMap co_group_map = 32; + + // NA functions + NAFill fill_na = 90; + NADrop drop_na = 91; + NAReplace replace = 92; + + // stat functions + StatSummary summary = 100; + StatCrosstab crosstab = 101; + StatDescribe describe = 102; + StatCov cov = 103; + StatCorr corr = 104; + StatApproxQuantile approx_quantile = 105; + StatFreqItems freq_items = 106; + StatSampleBy sample_by = 107; + + // Catalog API (experimental / unstable) + Catalog catalog = 200; + + // This field is used to mark extensions to the protocol. When plugins generate arbitrary + // relations they can add them here. During the planning the correct resolution is done. + google.protobuf.Any extension = 998; + Unknown unknown = 999; + } +} + +// Used for testing purposes only. +message Unknown {} + +// Common metadata of all relations. +message RelationCommon { + // (Required) Shared relation metadata. + string source_info = 1; + + // (Optional) A per-client globally unique id for a given connect plan. + optional int64 plan_id = 2; +} + +// Relation that uses a SQL query to generate the output. +message SQL { + // (Required) The SQL query. + string query = 1; + + // (Optional) A map of parameter names to literal expressions. + map<string, Expression.Literal> args = 2; +} + +// Relation that reads from a file / table or other data source. Does not have additional +// inputs. +message Read { + oneof read_type { + NamedTable named_table = 1; + DataSource data_source = 2; + } + + message NamedTable { + // (Required) Unparsed identifier for the table. + string unparsed_identifier = 1; + + // Options for the named table. The map key is case insensitive. + map<string, string> options = 2; + } + + message DataSource { + // (Optional) Supported formats include: parquet, orc, text, json, csv, avro. + // + // If not set, the value from SQL conf 'spark.sql.sources.default' will be used. + optional string format = 1; + + // (Optional) If not set, Spark will infer the schema. + // + // This schema string should be either DDL-formatted or JSON-formatted. + optional string schema = 2; + + // Options for the data source. The context of this map varies based on the + // data source format. These options can be empty for a valid data source format. + // The map key is case insensitive. + map<string, string> options = 3; + + // (Optional) A list of paths for file-system backed data sources. + repeated string paths = 4; + + // (Optional) Condition in the where clause for each partition. + // + // This is only supported by the JDBC data source. + repeated string predicates = 5; + } +} + +// Projection of a bag of expressions for a given input relation. +// +// The input relation must be specified.
+// The projected expression can be an arbitrary expression. +message Project { + // (Optional) Input relation is optional for Project. + // + // For example, `SELECT ABS(-1)` is valid plan without an input plan. + Relation input = 1; + + // (Required) A Project requires at least one expression. + repeated Expression expressions = 3; +} + +// Relation that applies a boolean expression `condition` on each row of `input` to produce +// the output result. +message Filter { + // (Required) Input relation for a Filter. + Relation input = 1; + + // (Required) A Filter must have a condition expression. + Expression condition = 2; +} + +// Relation of type [[Join]]. +// +// `left` and `right` must be present. +message Join { + // (Required) Left input relation for a Join. + Relation left = 1; + + // (Required) Right input relation for a Join. + Relation right = 2; + + // (Optional) The join condition. Could be unset when `using_columns` is utilized. + // + // This field does not co-exist with using_columns. + Expression join_condition = 3; + + // (Required) The join type. + JoinType join_type = 4; + + // Optional. using_columns provides a list of columns that should present on both sides of + // the join inputs that this Join will join on. For example A JOIN B USING col_name is + // equivalent to A JOIN B on A.col_name = B.col_name. + // + // This field does not co-exist with join_condition. + repeated string using_columns = 5; + + enum JoinType { + JOIN_TYPE_UNSPECIFIED = 0; + JOIN_TYPE_INNER = 1; + JOIN_TYPE_FULL_OUTER = 2; + JOIN_TYPE_LEFT_OUTER = 3; + JOIN_TYPE_RIGHT_OUTER = 4; + JOIN_TYPE_LEFT_ANTI = 5; + JOIN_TYPE_LEFT_SEMI = 6; + JOIN_TYPE_CROSS = 7; + } +} + +// Relation of type [[SetOperation]] +message SetOperation { + // (Required) Left input relation for a Set operation. + Relation left_input = 1; + + // (Required) Right input relation for a Set operation. + Relation right_input = 2; + + // (Required) The Set operation type. + SetOpType set_op_type = 3; + + // (Optional) If to remove duplicate rows. + // + // True to preserve all results. + // False to remove duplicate rows. + optional bool is_all = 4; + + // (Optional) If to perform the Set operation based on name resolution. + // + // Only UNION supports this option. + optional bool by_name = 5; + + // (Optional) If to perform the Set operation and allow missing columns. + // + // Only UNION supports this option. + optional bool allow_missing_columns = 6; + + enum SetOpType { + SET_OP_TYPE_UNSPECIFIED = 0; + SET_OP_TYPE_INTERSECT = 1; + SET_OP_TYPE_UNION = 2; + SET_OP_TYPE_EXCEPT = 3; + } +} + +// Relation of type [[Limit]] that is used to `limit` rows from the input relation. +message Limit { + // (Required) Input relation for a Limit. + Relation input = 1; + + // (Required) the limit. + int32 limit = 2; +} + +// Relation of type [[Offset]] that is used to read rows staring from the `offset` on +// the input relation. +message Offset { + // (Required) Input relation for an Offset. + Relation input = 1; + + // (Required) the limit. + int32 offset = 2; +} + +// Relation of type [[Tail]] that is used to fetch `limit` rows from the last of the input relation. +message Tail { + // (Required) Input relation for an Tail. + Relation input = 1; + + // (Required) the limit. + int32 limit = 2; +} + +// Relation of type [[Aggregate]]. +message Aggregate { + // (Required) Input relation for a RelationalGroupedDataset. + Relation input = 1; + + // (Required) How the RelationalGroupedDataset was built. 
+ GroupType group_type = 2; + + // (Required) Expressions for grouping keys + repeated Expression grouping_expressions = 3; + + // (Required) List of values that will be translated to columns in the output DataFrame. + repeated Expression aggregate_expressions = 4; + + // (Optional) Pivots a column of the current `DataFrame` and performs the specified aggregation. + Pivot pivot = 5; + + enum GroupType { + GROUP_TYPE_UNSPECIFIED = 0; + GROUP_TYPE_GROUPBY = 1; + GROUP_TYPE_ROLLUP = 2; + GROUP_TYPE_CUBE = 3; + GROUP_TYPE_PIVOT = 4; + } + + message Pivot { + // (Required) The column to pivot + Expression col = 1; + + // (Optional) List of values that will be translated to columns in the output DataFrame. + // + // Note that if it is empty, the server side will immediately trigger a job to collect + // the distinct values of the column. + repeated Expression.Literal values = 2; + } +} + +// Relation of type [[Sort]]. +message Sort { + // (Required) Input relation for a Sort. + Relation input = 1; + + // (Required) The ordering expressions + repeated Expression.SortOrder order = 2; + + // (Optional) if this is a global sort. + optional bool is_global = 3; +} + + +// Drop specified columns. +message Drop { + // (Required) The input relation. + Relation input = 1; + + // (Optional) columns to drop. + repeated Expression columns = 2; + + // (Optional) names of columns to drop. + repeated string column_names = 3; +} + + +// Relation of type [[Deduplicate]] which have duplicate rows removed, could consider either only +// the subset of columns or all the columns. +message Deduplicate { + // (Required) Input relation for a Deduplicate. + Relation input = 1; + + // (Optional) Deduplicate based on a list of column names. + // + // This field does not co-use with `all_columns_as_keys`. + repeated string column_names = 2; + + // (Optional) Deduplicate based on all the columns of the input relation. + // + // This field does not co-use with `column_names`. + optional bool all_columns_as_keys = 3; +} + +// A relation that does not need to be qualified by name. +message LocalRelation { + // (Optional) Local collection data serialized into Arrow IPC streaming format which contains + // the schema of the data. + optional bytes data = 1; + + // (Optional) The schema of local data. + // It should be either a DDL-formatted type string or a JSON string. + // + // The server side will update the column names and data types according to this schema. + // If the 'data' is not provided, then this schema will be required. + optional string schema = 2; +} + +// Relation of type [[Sample]] that samples a fraction of the dataset. +message Sample { + // (Required) Input relation for a Sample. + Relation input = 1; + + // (Required) lower bound. + double lower_bound = 2; + + // (Required) upper bound. + double upper_bound = 3; + + // (Optional) Whether to sample with replacement. + optional bool with_replacement = 4; + + // (Optional) The random seed. + optional int64 seed = 5; + + // (Required) Explicitly sort the underlying plan to make the ordering deterministic or cache it. + // This flag is true when invoking `dataframe.randomSplit` to randomly splits DataFrame with the + // provided weights. Otherwise, it is false. + bool deterministic_order = 6; +} + +// Relation of type [[Range]] that generates a sequence of integers. +message Range { + // (Optional) Default value = 0 + optional int64 start = 1; + + // (Required) + int64 end = 2; + + // (Required) + int64 step = 3; + + // Optional. 
Default value is assigned by 1) SQL conf "spark.sql.leafNodeDefaultParallelism" if + // it is set, or 2) spark default parallelism. + optional int32 num_partitions = 4; +} + +// Relation alias. +message SubqueryAlias { + // (Required) The input relation of SubqueryAlias. + Relation input = 1; + + // (Required) The alias. + string alias = 2; + + // (Optional) Qualifier of the alias. + repeated string qualifier = 3; +} + +// Relation repartition. +message Repartition { + // (Required) The input relation of Repartition. + Relation input = 1; + + // (Required) Must be positive. + int32 num_partitions = 2; + + // (Optional) Default value is false. + optional bool shuffle = 3; +} + +// Compose the string representing rows for output. +// It will invoke 'Dataset.showString' to compute the results. +message ShowString { + // (Required) The input relation. + Relation input = 1; + + // (Required) Number of rows to show. + int32 num_rows = 2; + + // (Required) If set to more than 0, truncates strings to + // `truncate` characters and all cells will be aligned right. + int32 truncate = 3; + + // (Required) If set to true, prints output rows vertically (one line per column value). + bool vertical = 4; +} + +// Computes specified statistics for numeric and string columns. +// It will invoke 'Dataset.summary' (same as 'StatFunctions.summary') +// to compute the results. +message StatSummary { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Statistics from to be computed. + // + // Available statistics are: + // count + // mean + // stddev + // min + // max + // arbitrary approximate percentiles specified as a percentage (e.g. 75%) + // count_distinct + // approx_count_distinct + // + // If no statistics are given, this function computes 'count', 'mean', 'stddev', 'min', + // 'approximate quartiles' (percentiles at 25%, 50%, and 75%), and 'max'. + repeated string statistics = 2; +} + +// Computes basic statistics for numeric and string columns, including count, mean, stddev, min, +// and max. If no columns are given, this function computes statistics for all numerical or +// string columns. +message StatDescribe { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Columns to compute statistics on. + repeated string cols = 2; +} + +// Computes a pair-wise frequency table of the given columns. Also known as a contingency table. +// It will invoke 'Dataset.stat.crosstab' (same as 'StatFunctions.crossTabulate') +// to compute the results. +message StatCrosstab { + // (Required) The input relation. + Relation input = 1; + + // (Required) The name of the first column. + // + // Distinct items will make the first item of each row. + string col1 = 2; + + // (Required) The name of the second column. + // + // Distinct items will make the column names of the DataFrame. + string col2 = 3; +} + +// Calculate the sample covariance of two numerical columns of a DataFrame. +// It will invoke 'Dataset.stat.cov' (same as 'StatFunctions.calculateCov') to compute the results. +message StatCov { + // (Required) The input relation. + Relation input = 1; + + // (Required) The name of the first column. + string col1 = 2; + + // (Required) The name of the second column. + string col2 = 3; +} + +// Calculates the correlation of two columns of a DataFrame. Currently only supports the Pearson +// Correlation Coefficient. It will invoke 'Dataset.stat.corr' (same as +// 'StatFunctions.pearsonCorrelation') to compute the results. 
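The correlation, quantile and sampling messages continue below. To show how these stat relations compose, the sketch here wraps a StatSummary around an existing child relation, which is roughly the plan fragment behind df.summary("count", "min", "max"); the child Relation is assumed to be built elsewhere, and builder names follow the standard protobuf-java codegen.

    import org.apache.spark.connect.proto

    // Wrap a child relation in a StatSummary, requesting three statistics.
    def summaryRelation(child: proto.Relation): proto.Relation =
      proto.Relation
        .newBuilder()
        .setSummary(
          proto.StatSummary
            .newBuilder()
            .setInput(child)
            .addStatistics("count")
            .addStatistics("min")
            .addStatistics("max"))
        .build()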
+message StatCorr { + // (Required) The input relation. + Relation input = 1; + + // (Required) The name of the first column. + string col1 = 2; + + // (Required) The name of the second column. + string col2 = 3; + + // (Optional) Default value is 'pearson'. + // + // Currently only supports the Pearson Correlation Coefficient. + optional string method = 4; +} + +// Calculates the approximate quantiles of numerical columns of a DataFrame. +// It will invoke 'Dataset.stat.approxQuantile' (same as 'StatFunctions.approxQuantile') +// to compute the results. +message StatApproxQuantile { + // (Required) The input relation. + Relation input = 1; + + // (Required) The names of the numerical columns. + repeated string cols = 2; + + // (Required) A list of quantile probabilities. + // + // Each number must belong to [0, 1]. + // For example 0 is the minimum, 0.5 is the median, 1 is the maximum. + repeated double probabilities = 3; + + // (Required) The relative target precision to achieve (greater than or equal to 0). + // + // If set to zero, the exact quantiles are computed, which could be very expensive. + // Note that values greater than 1 are accepted but give the same result as 1. + double relative_error = 4; +} + +// Finding frequent items for columns, possibly with false positives. +// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems') +// to compute the results. +message StatFreqItems { + // (Required) The input relation. + Relation input = 1; + + // (Required) The names of the columns to search frequent items in. + repeated string cols = 2; + + // (Optional) The minimum frequency for an item to be considered `frequent`. + // Should be greater than 1e-4. + optional double support = 3; +} + + +// Returns a stratified sample without replacement based on the fraction +// given on each stratum. +// It will invoke 'Dataset.stat.freqItems' (same as 'StatFunctions.freqItems') +// to compute the results. +message StatSampleBy { + // (Required) The input relation. + Relation input = 1; + + // (Required) The column that defines strata. + Expression col = 2; + + // (Required) Sampling fraction for each stratum. + // + // If a stratum is not specified, we treat its fraction as zero. + repeated Fraction fractions = 3; + + // (Optional) The random seed. + optional int64 seed = 5; + + message Fraction { + // (Required) The stratum. + Expression.Literal stratum = 1; + + // (Required) The fraction value. Must be in [0, 1]. + double fraction = 2; + } +} + + +// Replaces null values. +// It will invoke 'Dataset.na.fill' (same as 'DataFrameNaFunctions.fill') to compute the results. +// Following 3 parameter combinations are supported: +// 1, 'values' only contains 1 item, 'cols' is empty: +// replaces null values in all type-compatible columns. +// 2, 'values' only contains 1 item, 'cols' is not empty: +// replaces null values in specified columns. +// 3, 'values' contains more than 1 items, then 'cols' is required to have the same length: +// replaces each specified column with corresponding value. +message NAFill { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Optional list of column names to consider. + repeated string cols = 2; + + // (Required) Values to replace null values with. + // + // Should contain at least 1 item. + // Only 4 data types are supported now: bool, long, double, string + repeated Expression.Literal values = 3; +} + + +// Drop rows containing null values. 
+// It will invoke 'Dataset.na.drop' (same as 'DataFrameNaFunctions.drop') to compute the results. +message NADrop { + // (Required) The input relation. + Relation input = 1; + + // (Optional) Optional list of column names to consider. + // + // When it is empty, all the columns in the input relation will be considered. + repeated string cols = 2; + + // (Optional) The minimum number of non-null and non-NaN values required to keep. + // + // When not set, it is equivalent to the number of considered columns, which means + // a row will be kept only if all columns are non-null. + // + // 'how' options ('all', 'any') can be easily converted to this field: + // - 'all' -> set 'min_non_nulls' 1; + // - 'any' -> keep 'min_non_nulls' unset; + optional int32 min_non_nulls = 3; +} + + +// Replaces old values with the corresponding values. +// It will invoke 'Dataset.na.replace' (same as 'DataFrameNaFunctions.replace') +// to compute the results. +message NAReplace { + // (Required) The input relation. + Relation input = 1; + + // (Optional) List of column names to consider. + // + // When it is empty, all the type-compatible columns in the input relation will be considered. + repeated string cols = 2; + + // (Optional) The value replacement mapping. + repeated Replacement replacements = 3; + + message Replacement { + // (Required) The old value. + // + // Only 4 data types are supported now: null, bool, double, string. + Expression.Literal old_value = 1; + + // (Required) The new value. + // + // Should be of the same data type with the old value. + Expression.Literal new_value = 2; + } +} + + +// Rename columns on the input relation by the same length of names. +message ToDF { + // (Required) The input relation of RenameColumnsBySameLengthNames. + Relation input = 1; + + // (Required) + // + // The number of columns of the input relation must be equal to the length + // of this field. If this is not true, an exception will be returned. + repeated string column_names = 2; +} + + +// Rename columns on the input relation by a map with name to name mapping. +message WithColumnsRenamed { + // (Required) The input relation. + Relation input = 1; + + + // (Required) + // + // Renaming column names of input relation from A to B where A is the map key + // and B is the map value. This is a no-op if schema doesn't contain any A. It + // does not require that all input relation column names to present as keys. + // duplicated B are not allowed. + map rename_columns_map = 2; +} + +// Adding columns or replacing the existing columns that have the same names. +message WithColumns { + // (Required) The input relation. + Relation input = 1; + + // (Required) + // + // Given a column name, apply the corresponding expression on the column. If column + // name exists in the input relation, then replace the column. If the column name + // does not exist in the input relation, then adds it as a new column. + // + // Only one name part is expected from each Expression.Alias. + // + // An exception is thrown when duplicated names are present in the mapping. + repeated Expression.Alias aliases = 2; +} + +// Specify a hint over a relation. Hint should have a name and optional parameters. +message Hint { + // (Required) The input relation. + Relation input = 1; + + // (Required) Hint name. + // + // Supported Join hints include BROADCAST, MERGE, SHUFFLE_HASH, SHUFFLE_REPLICATE_NL. + // + // Supported partitioning hints include COALESCE, REPARTITION, REPARTITION_BY_RANGE. 
+ string name = 2; + + // (Optional) Hint parameters. + repeated Expression parameters = 3; +} + +// Unpivot a DataFrame from wide format to long format, optionally leaving identifier columns set. +message Unpivot { + // (Required) The input relation. + Relation input = 1; + + // (Required) Id columns. + repeated Expression ids = 2; + + // (Optional) Value columns to unpivot. + optional Values values = 3; + + // (Required) Name of the variable column. + string variable_column_name = 4; + + // (Required) Name of the value column. + string value_column_name = 5; + + message Values { + repeated Expression values = 1; + } +} + +message ToSchema { + // (Required) The input relation. + Relation input = 1; + + // (Required) The user provided schema. + // + // The Sever side will update the dataframe with this schema. + DataType schema = 2; +} + +message RepartitionByExpression { + // (Required) The input relation. + Relation input = 1; + + // (Required) The partitioning expressions. + repeated Expression partition_exprs = 2; + + // (Optional) number of partitions, must be positive. + optional int32 num_partitions = 3; +} + +message MapPartitions { + // (Required) Input relation for a mapPartitions-equivalent API: mapInPandas, mapInArrow. + Relation input = 1; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 2; +} + +message GroupMap { + // (Required) Input relation for Group Map API: apply, applyInPandas. + Relation input = 1; + + // (Required) Expressions for grouping keys. + repeated Expression grouping_expressions = 2; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 3; +} + +message CoGroupMap { + // (Required) One input relation for CoGroup Map API - applyInPandas. + Relation input = 1; + + // Expressions for grouping keys of the first input relation. + repeated Expression input_grouping_expressions = 2; + + // (Required) The other input relation. + Relation other = 3; + + // Expressions for grouping keys of the other input relation. + repeated Expression other_grouping_expressions = 4; + + // (Required) Input user-defined function. + CommonInlineUserDefinedFunction func = 5; +} + +// Collect arbitrary (named) metrics from a dataset. +message CollectMetrics { + // (Required) The input relation. + Relation input = 1; + + // (Required) Name of the metrics. + string name = 2; + + // (Required) The metric sequence. + repeated Expression metrics = 3; +} + +message Parse { + // (Required) Input relation to Parse. The input is expected to have single text column. + Relation input = 1; + // (Required) The expected format of the text. + ParseFormat format = 2; + + // (Optional) DataType representing the schema. If not set, Spark will infer the schema. + optional DataType schema = 3; + + // Options for the csv/json parser. The map key is case insensitive. + map options = 4; + enum ParseFormat { + PARSE_FORMAT_UNSPECIFIED = 0; + PARSE_FORMAT_CSV = 1; + PARSE_FORMAT_JSON = 2; + } +} diff --git a/connector/connect/common/src/main/protobuf/spark/connect/types.proto b/connector/connect/common/src/main/protobuf/spark/connect/types.proto new file mode 100644 index 0000000000000..68833b5d220b3 --- /dev/null +++ b/connector/connect/common/src/main/protobuf/spark/connect/types.proto @@ -0,0 +1,194 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
diff --git a/connector/connect/common/src/main/protobuf/spark/connect/types.proto b/connector/connect/common/src/main/protobuf/spark/connect/types.proto
new file mode 100644
index 0000000000000..68833b5d220b3
--- /dev/null
+++ b/connector/connect/common/src/main/protobuf/spark/connect/types.proto
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = 'proto3';
+
+package spark.connect;
+
+option java_multiple_files = true;
+option java_package = "org.apache.spark.connect.proto";
+
+// This message describes the logical [[DataType]] of something. It does not carry the value
+// itself but only describes it.
+message DataType {
+  oneof kind {
+    NULL null = 1;
+
+    Binary binary = 2;
+
+    Boolean boolean = 3;
+
+    // Numeric types
+    Byte byte = 4;
+    Short short = 5;
+    Integer integer = 6;
+    Long long = 7;
+
+    Float float = 8;
+    Double double = 9;
+    Decimal decimal = 10;
+
+    // String types
+    String string = 11;
+    Char char = 12;
+    VarChar var_char = 13;
+
+    // Datetime types
+    Date date = 14;
+    Timestamp timestamp = 15;
+    TimestampNTZ timestamp_ntz = 16;
+
+    // Interval types
+    CalendarInterval calendar_interval = 17;
+    YearMonthInterval year_month_interval = 18;
+    DayTimeInterval day_time_interval = 19;
+
+    // Complex types
+    Array array = 20;
+    Struct struct = 21;
+    Map map = 22;
+
+    // UserDefinedType
+    UDT udt = 23;
+
+    // UnparsedDataType
+    Unparsed unparsed = 24;
+  }
+
+  message Boolean {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Byte {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Short {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Integer {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Long {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Float {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Double {
+    uint32 type_variation_reference = 1;
+  }
+
+  message String {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Binary {
+    uint32 type_variation_reference = 1;
+  }
+
+  message NULL {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Timestamp {
+    uint32 type_variation_reference = 1;
+  }
+
+  message Date {
+    uint32 type_variation_reference = 1;
+  }
+
+  message TimestampNTZ {
+    uint32 type_variation_reference = 1;
+  }
+
+  message CalendarInterval {
+    uint32 type_variation_reference = 1;
+  }
+
+  message YearMonthInterval {
+    optional int32 start_field = 1;
+    optional int32 end_field = 2;
+    uint32 type_variation_reference = 3;
+  }
+
+  message DayTimeInterval {
+    optional int32 start_field = 1;
+    optional int32 end_field = 2;
+    uint32 type_variation_reference = 3;
+  }
+
+  // Start compound types.
+ message Char { + int32 length = 1; + uint32 type_variation_reference = 2; + } + + message VarChar { + int32 length = 1; + uint32 type_variation_reference = 2; + } + + message Decimal { + optional int32 scale = 1; + optional int32 precision = 2; + uint32 type_variation_reference = 3; + } + + message StructField { + string name = 1; + DataType data_type = 2; + bool nullable = 3; + optional string metadata = 4; + } + + message Struct { + repeated StructField fields = 1; + uint32 type_variation_reference = 2; + } + + message Array { + DataType element_type = 1; + bool contains_null = 2; + uint32 type_variation_reference = 3; + } + + message Map { + DataType key_type = 1; + DataType value_type = 2; + bool value_contains_null = 3; + uint32 type_variation_reference = 4; + } + + message UDT { + string type = 1; + optional string jvm_class = 2; + optional string python_class = 3; + optional string serialized_python_class = 4; + DataType sql_type = 5; + } + + message Unparsed { + // (Required) The unparsed data type string + string data_type_string = 1; + } +} diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala new file mode 100644 index 0000000000000..28ddbe844d445 --- /dev/null +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/DataTypeProtoConverter.scala @@ -0,0 +1,363 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.common + +import scala.collection.convert.ImplicitConversions._ + +import org.apache.spark.connect.proto +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + +/** + * Helper class for conversions between [[DataType]] and [[proto.DataType]]. 
+ */ +object DataTypeProtoConverter { + def toCatalystType(t: proto.DataType): DataType = { + t.getKindCase match { + case proto.DataType.KindCase.NULL => NullType + + case proto.DataType.KindCase.BINARY => BinaryType + + case proto.DataType.KindCase.BOOLEAN => BooleanType + + case proto.DataType.KindCase.BYTE => ByteType + case proto.DataType.KindCase.SHORT => ShortType + case proto.DataType.KindCase.INTEGER => IntegerType + case proto.DataType.KindCase.LONG => LongType + + case proto.DataType.KindCase.FLOAT => FloatType + case proto.DataType.KindCase.DOUBLE => DoubleType + case proto.DataType.KindCase.DECIMAL => toCatalystDecimalType(t.getDecimal) + + case proto.DataType.KindCase.STRING => StringType + case proto.DataType.KindCase.CHAR => CharType(t.getChar.getLength) + case proto.DataType.KindCase.VAR_CHAR => VarcharType(t.getVarChar.getLength) + + case proto.DataType.KindCase.DATE => DateType + case proto.DataType.KindCase.TIMESTAMP => TimestampType + case proto.DataType.KindCase.TIMESTAMP_NTZ => TimestampNTZType + + case proto.DataType.KindCase.CALENDAR_INTERVAL => CalendarIntervalType + case proto.DataType.KindCase.YEAR_MONTH_INTERVAL => + toCatalystYearMonthIntervalType(t.getYearMonthInterval) + case proto.DataType.KindCase.DAY_TIME_INTERVAL => + toCatalystDayTimeIntervalType(t.getDayTimeInterval) + + case proto.DataType.KindCase.ARRAY => toCatalystArrayType(t.getArray) + case proto.DataType.KindCase.STRUCT => toCatalystStructType(t.getStruct) + case proto.DataType.KindCase.MAP => toCatalystMapType(t.getMap) + + case proto.DataType.KindCase.UDT => toCatalystUDT(t.getUdt) + + case _ => + throw InvalidPlanInput(s"Does not support convert ${t.getKindCase} to catalyst types.") + } + } + + private def toCatalystDecimalType(t: proto.DataType.Decimal): DecimalType = { + (t.hasPrecision, t.hasScale) match { + case (true, true) => DecimalType(t.getPrecision, t.getScale) + case (true, false) => new DecimalType(t.getPrecision) + case _ => new DecimalType() + } + } + + private def toCatalystYearMonthIntervalType(t: proto.DataType.YearMonthInterval) = { + (t.hasStartField, t.hasEndField) match { + case (true, true) => YearMonthIntervalType(t.getStartField.toByte, t.getEndField.toByte) + case (true, false) => YearMonthIntervalType(t.getStartField.toByte) + case _ => YearMonthIntervalType() + } + } + + private def toCatalystDayTimeIntervalType(t: proto.DataType.DayTimeInterval) = { + (t.hasStartField, t.hasEndField) match { + case (true, true) => DayTimeIntervalType(t.getStartField.toByte, t.getEndField.toByte) + case (true, false) => DayTimeIntervalType(t.getStartField.toByte) + case _ => DayTimeIntervalType() + } + } + + private def toCatalystArrayType(t: proto.DataType.Array): ArrayType = { + ArrayType(toCatalystType(t.getElementType), t.getContainsNull) + } + + private def toCatalystStructType(t: proto.DataType.Struct): StructType = { + val fields = t.getFieldsList.toSeq.map { protoField => + val metadata = if (protoField.hasMetadata) { + Metadata.fromJson(protoField.getMetadata) + } else { + Metadata.empty + } + StructField( + name = protoField.getName, + dataType = toCatalystType(protoField.getDataType), + nullable = protoField.getNullable, + metadata = metadata) + } + StructType.apply(fields) + } + + private def toCatalystMapType(t: proto.DataType.Map): MapType = { + MapType(toCatalystType(t.getKeyType), toCatalystType(t.getValueType), t.getValueContainsNull) + } + + private def toCatalystUDT(t: proto.DataType.UDT): UserDefinedType[_] = { + if (t.getType != "udt") { + throw 
InvalidPlanInput( + s"""UserDefinedType requires the 'type' field to be 'udt', but got '${t.getType}'.""") + } + + if (t.hasJvmClass) { + Utils + .classForName[UserDefinedType[_]](t.getJvmClass) + .getConstructor() + .newInstance() + } else { + if (!t.hasPythonClass || !t.hasSerializedPythonClass || !t.hasSqlType) { + throw InvalidPlanInput( + "PythonUserDefinedType requires all the three fields: " + + "python_class, serialized_python_class and sql_type.") + } + + new PythonUserDefinedType( + sqlType = toCatalystType(t.getSqlType), + pyUDT = t.getPythonClass, + serializedPyClass = t.getSerializedPythonClass) + } + } + + def toConnectProtoType(t: DataType): proto.DataType = { + t match { + case NullType => + proto.DataType + .newBuilder() + .setNull(proto.DataType.NULL.getDefaultInstance) + .build() + + case BooleanType => + proto.DataType + .newBuilder() + .setBoolean(proto.DataType.Boolean.getDefaultInstance) + .build() + + case BinaryType => + proto.DataType + .newBuilder() + .setBinary(proto.DataType.Binary.getDefaultInstance) + .build() + + case ByteType => + proto.DataType + .newBuilder() + .setByte(proto.DataType.Byte.getDefaultInstance) + .build() + + case ShortType => + proto.DataType + .newBuilder() + .setShort(proto.DataType.Short.getDefaultInstance) + .build() + + case IntegerType => + proto.DataType + .newBuilder() + .setInteger(proto.DataType.Integer.getDefaultInstance) + .build() + + case LongType => + proto.DataType + .newBuilder() + .setLong(proto.DataType.Long.getDefaultInstance) + .build() + + case FloatType => + proto.DataType + .newBuilder() + .setFloat(proto.DataType.Float.getDefaultInstance) + .build() + + case DoubleType => + proto.DataType + .newBuilder() + .setDouble(proto.DataType.Double.getDefaultInstance) + .build() + + case DecimalType.Fixed(precision, scale) => + proto.DataType + .newBuilder() + .setDecimal( + proto.DataType.Decimal.newBuilder().setPrecision(precision).setScale(scale).build()) + .build() + + case StringType => + proto.DataType + .newBuilder() + .setString(proto.DataType.String.getDefaultInstance) + .build() + + case CharType(length) => + proto.DataType + .newBuilder() + .setChar(proto.DataType.Char.newBuilder().setLength(length).build()) + .build() + + case VarcharType(length) => + proto.DataType + .newBuilder() + .setVarChar(proto.DataType.VarChar.newBuilder().setLength(length).build()) + .build() + + case DateType => + proto.DataType + .newBuilder() + .setDate(proto.DataType.Date.getDefaultInstance) + .build() + + case TimestampType => + proto.DataType + .newBuilder() + .setTimestamp(proto.DataType.Timestamp.getDefaultInstance) + .build() + + case TimestampNTZType => + proto.DataType + .newBuilder() + .setTimestampNtz(proto.DataType.TimestampNTZ.getDefaultInstance) + .build() + + case CalendarIntervalType => + proto.DataType + .newBuilder() + .setCalendarInterval(proto.DataType.CalendarInterval.getDefaultInstance) + .build() + + case YearMonthIntervalType(startField, endField) => + proto.DataType + .newBuilder() + .setYearMonthInterval( + proto.DataType.YearMonthInterval + .newBuilder() + .setStartField(startField) + .setEndField(endField) + .build()) + .build() + + case DayTimeIntervalType(startField, endField) => + proto.DataType + .newBuilder() + .setDayTimeInterval( + proto.DataType.DayTimeInterval + .newBuilder() + .setStartField(startField) + .setEndField(endField) + .build()) + .build() + + case ArrayType(elementType: DataType, containsNull: Boolean) => + proto.DataType + .newBuilder() + .setArray( + proto.DataType.Array + 
.newBuilder() + .setElementType(toConnectProtoType(elementType)) + .setContainsNull(containsNull) + .build()) + .build() + + case StructType(fields: Array[StructField]) => + val protoFields = fields.toSeq.map { + case StructField( + name: String, + dataType: DataType, + nullable: Boolean, + metadata: Metadata) => + if (metadata.equals(Metadata.empty)) { + proto.DataType.StructField + .newBuilder() + .setName(name) + .setDataType(toConnectProtoType(dataType)) + .setNullable(nullable) + .build() + } else { + proto.DataType.StructField + .newBuilder() + .setName(name) + .setDataType(toConnectProtoType(dataType)) + .setNullable(nullable) + .setMetadata(metadata.json) + .build() + } + } + proto.DataType + .newBuilder() + .setStruct( + proto.DataType.Struct + .newBuilder() + .addAllFields(protoFields) + .build()) + .build() + + case MapType(keyType: DataType, valueType: DataType, valueContainsNull: Boolean) => + proto.DataType + .newBuilder() + .setMap( + proto.DataType.Map + .newBuilder() + .setKeyType(toConnectProtoType(keyType)) + .setValueType(toConnectProtoType(valueType)) + .setValueContainsNull(valueContainsNull) + .build()) + .build() + + case pyudt: PythonUserDefinedType => + // Python UDT + proto.DataType + .newBuilder() + .setUdt( + proto.DataType.UDT + .newBuilder() + .setType("udt") + .setPythonClass(pyudt.pyUDT) + .setSqlType(toConnectProtoType(pyudt.sqlType)) + .setSerializedPythonClass(pyudt.serializedPyClass) + .build()) + .build() + + case udt: UserDefinedType[_] => + // Scala/Java UDT + val builder = proto.DataType.UDT.newBuilder() + builder + .setType("udt") + .setJvmClass(udt.getClass.getName) + .setSqlType(toConnectProtoType(udt.sqlType)) + + if (udt.pyUDT != null) { + builder.setPythonClass(udt.pyUDT) + } + + proto.DataType + .newBuilder() + .setUdt(builder.build()) + .build() + + case _ => + throw InvalidPlanInput(s"Does not support convert ${t.typeName} to connect proto types.") + } + } +} diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/InvalidPlanInput.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/InvalidPlanInput.scala new file mode 100644 index 0000000000000..0caa4122f098c --- /dev/null +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/InvalidPlanInput.scala @@ -0,0 +1,25 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.common + +/** + * Error thrown when a connect plan is not valid. 
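As a quick orientation for DataTypeProtoConverter above, a round-trip sketch in Scala (placed here for reference only; it assumes the connect common module and catalyst are on the classpath):

import org.apache.spark.sql.connect.common.DataTypeProtoConverter
import org.apache.spark.sql.types._

// Convert a Catalyst schema to the proto DataType defined in types.proto and back.
val schema = new StructType()
  .add("id", LongType, nullable = false)
  .add("tags", ArrayType(StringType, containsNull = true))
val asProto = DataTypeProtoConverter.toConnectProtoType(schema)
assert(DataTypeProtoConverter.toCatalystType(asProto) == schema)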
+ */ +final case class InvalidPlanInput( + private val message: String = "", + private val cause: Throwable = None.orNull) + extends Exception(message, cause) diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala new file mode 100644 index 0000000000000..ceef9b21244f4 --- /dev/null +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/LiteralValueProtoConverter.scala @@ -0,0 +1,145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.common + +import java.lang.{Boolean => JBoolean, Byte => JByte, Character => JChar, Double => JDouble, Float => JFloat, Integer => JInteger, Long => JLong, Short => JShort} +import java.math.{BigDecimal => JBigDecimal} +import java.sql.{Date, Timestamp} +import java.time._ + +import com.google.protobuf.ByteString + +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} +import org.apache.spark.sql.connect.common.DataTypeProtoConverter._ +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.CalendarInterval + +object LiteralValueProtoConverter { + + private lazy val nullType = + proto.DataType.newBuilder().setNull(proto.DataType.NULL.getDefaultInstance).build() + + /** + * Transforms literal value to the `proto.Expression.Literal.Builder`. 
+ * + * @return + * proto.Expression.Literal.Builder + */ + @scala.annotation.tailrec + def toLiteralProtoBuilder(literal: Any): proto.Expression.Literal.Builder = { + val builder = proto.Expression.Literal.newBuilder() + + def decimalBuilder(precision: Int, scale: Int, value: String) = { + builder.getDecimalBuilder.setPrecision(precision).setScale(scale).setValue(value) + } + + def calendarIntervalBuilder(months: Int, days: Int, microseconds: Long) = { + builder.getCalendarIntervalBuilder + .setMonths(months) + .setDays(days) + .setMicroseconds(microseconds) + } + + def arrayBuilder(array: Array[_]) = { + val ab = builder.getArrayBuilder + .setElementType(toConnectProtoType(toDataType(array.getClass.getComponentType))) + array.foreach(x => ab.addElements(toLiteralProto(x))) + ab + } + + literal match { + case v: Boolean => builder.setBoolean(v) + case v: Byte => builder.setByte(v) + case v: Short => builder.setShort(v) + case v: Int => builder.setInteger(v) + case v: Long => builder.setLong(v) + case v: Float => builder.setFloat(v) + case v: Double => builder.setDouble(v) + case v: BigDecimal => + builder.setDecimal(decimalBuilder(v.precision, v.scale, v.toString)) + case v: JBigDecimal => + builder.setDecimal(decimalBuilder(v.precision, v.scale, v.toString)) + case v: String => builder.setString(v) + case v: Char => builder.setString(v.toString) + case v: Array[Char] => builder.setString(String.valueOf(v)) + case v: Array[Byte] => builder.setBinary(ByteString.copyFrom(v)) + case v: collection.mutable.WrappedArray[_] => toLiteralProtoBuilder(v.array) + case v: LocalDate => builder.setDate(v.toEpochDay.toInt) + case v: Decimal => + builder.setDecimal(decimalBuilder(Math.max(v.precision, v.scale), v.scale, v.toString)) + case v: Instant => builder.setTimestamp(DateTimeUtils.instantToMicros(v)) + case v: Timestamp => builder.setTimestamp(DateTimeUtils.fromJavaTimestamp(v)) + case v: LocalDateTime => builder.setTimestampNtz(DateTimeUtils.localDateTimeToMicros(v)) + case v: Date => builder.setDate(DateTimeUtils.fromJavaDate(v)) + case v: Duration => builder.setDayTimeInterval(IntervalUtils.durationToMicros(v)) + case v: Period => builder.setYearMonthInterval(IntervalUtils.periodToMonths(v)) + case v: Array[_] => builder.setArray(arrayBuilder(v)) + case v: CalendarInterval => + builder.setCalendarInterval(calendarIntervalBuilder(v.months, v.days, v.microseconds)) + case null => builder.setNull(nullType) + case _ => throw new UnsupportedOperationException(s"literal $literal not supported (yet).") + } + } + + /** + * Transforms literal value to the `proto.Expression.Literal`. 
+   *
+   * @return
+   *   proto.Expression.Literal
+   */
+  def toLiteralProto(literal: Any): proto.Expression.Literal =
+    toLiteralProtoBuilder(literal).build()
+
+  private def toDataType(clz: Class[_]): DataType = clz match {
+    // primitive types
+    case JShort.TYPE => ShortType
+    case JInteger.TYPE => IntegerType
+    case JLong.TYPE => LongType
+    case JDouble.TYPE => DoubleType
+    case JByte.TYPE => ByteType
+    case JFloat.TYPE => FloatType
+    case JBoolean.TYPE => BooleanType
+    case JChar.TYPE => StringType
+
+    // java classes
+    case _ if clz == classOf[LocalDate] || clz == classOf[Date] => DateType
+    case _ if clz == classOf[Instant] || clz == classOf[Timestamp] => TimestampType
+    case _ if clz == classOf[LocalDateTime] => TimestampNTZType
+    case _ if clz == classOf[Duration] => DayTimeIntervalType.DEFAULT
+    case _ if clz == classOf[Period] => YearMonthIntervalType.DEFAULT
+    case _ if clz == classOf[JBigDecimal] => DecimalType.SYSTEM_DEFAULT
+    case _ if clz == classOf[Array[Byte]] => BinaryType
+    case _ if clz == classOf[Array[Char]] => StringType
+    case _ if clz == classOf[JShort] => ShortType
+    case _ if clz == classOf[JInteger] => IntegerType
+    case _ if clz == classOf[JLong] => LongType
+    case _ if clz == classOf[JDouble] => DoubleType
+    case _ if clz == classOf[JByte] => ByteType
+    case _ if clz == classOf[JFloat] => FloatType
+    case _ if clz == classOf[JBoolean] => BooleanType
+
+    // other scala classes
+    case _ if clz == classOf[String] => StringType
+    case _ if clz == classOf[BigInt] || clz == classOf[BigDecimal] => DecimalType.SYSTEM_DEFAULT
+    case _ if clz == classOf[CalendarInterval] => CalendarIntervalType
+    case _ if clz.isArray => ArrayType(toDataType(clz.getComponentType))
+    case _ =>
+      throw new UnsupportedOperationException(s"Unsupported component type $clz in arrays.")
+  }
+}
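A small usage sketch for LiteralValueProtoConverter above (the getter names on the generated Literal message follow standard protobuf-java codegen and are an assumption here, since expressions.proto is not part of this hunk):

import org.apache.spark.sql.connect.common.LiteralValueProtoConverter

// Scala values become Expression.Literal messages; arrays also carry their element type.
val dec = LiteralValueProtoConverter.toLiteralProto(BigDecimal("12.34"))
assert(dec.getDecimal.getValue == "12.34")
val arr = LiteralValueProtoConverter.toLiteralProto(Array(1, 2, 3))
assert(arr.getArray.getElementsCount == 3)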
diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/StorageLevelProtoConverter.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/StorageLevelProtoConverter.scala
new file mode 100644
index 0000000000000..7bf273843b5c3
--- /dev/null
+++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/StorageLevelProtoConverter.scala
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.spark.sql.connect.common
+
+import org.apache.spark.connect.proto
+import org.apache.spark.storage.StorageLevel
+
+/**
+ * Helper class for conversions between [[StorageLevel]] and [[proto.StorageLevel]].
+ */
+object StorageLevelProtoConverter {
+  def toStorageLevel(sl: proto.StorageLevel): StorageLevel = {
+    StorageLevel(
+      useDisk = sl.getUseDisk,
+      useMemory = sl.getUseMemory,
+      useOffHeap = sl.getUseOffHeap,
+      deserialized = sl.getDeserialized,
+      replication = sl.getReplication)
+  }
+
+  def toConnectProtoType(sl: StorageLevel): proto.StorageLevel = {
+    proto.StorageLevel
+      .newBuilder()
+      .setUseDisk(sl.useDisk)
+      .setUseMemory(sl.useMemory)
+      .setUseOffHeap(sl.useOffHeap)
+      .setDeserialized(sl.deserialized)
+      .setReplication(sl.replication)
+      .build()
+  }
+}
diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/UdfPacket.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/UdfPacket.scala
new file mode 100644
index 0000000000000..6829b8d1b21a6
--- /dev/null
+++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/UdfPacket.scala
@@ -0,0 +1,70 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *    http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.spark.sql.connect.common
+
+import com.google.protobuf.ByteString
+import java.io.{InputStream, ObjectInputStream, ObjectOutputStream, OutputStream}
+
+import org.apache.spark.sql.catalyst.encoders.AgnosticEncoder
+
+/**
+ * A wrapper class around the UDF and its Input/Output [[AgnosticEncoder]](s).
+ *
+ * This class is shared between the client and the server to allow for serialization and
+ * deserialization of the JVM object.
+ * + * @param function + * The UDF + * @param inputEncoders + * A list of [[AgnosticEncoder]](s) for all input arguments of the UDF + * @param outputEncoder + * An [[AgnosticEncoder]] for the output of the UDF + */ +@SerialVersionUID(8866761834651399125L) +case class UdfPacket( + function: AnyRef, + inputEncoders: Seq[AgnosticEncoder[_]], + outputEncoder: AgnosticEncoder[_]) + extends Serializable { + + def writeTo(out: OutputStream): Unit = { + val oos = new ObjectOutputStream(out) + oos.writeObject(this) + oos.flush() + } + + def toByteString: ByteString = { + val out = ByteString.newOutput() + writeTo(out) + out.toByteString + } +} + +object UdfPacket { + def apply(in: InputStream): UdfPacket = { + val ois = new ObjectInputStream(in) + ois.readObject().asInstanceOf[UdfPacket] + } + + def apply(bytes: ByteString): UdfPacket = { + val in = bytes.newInput() + try UdfPacket(in) + finally { + in.close() + } + } +} diff --git a/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala new file mode 100644 index 0000000000000..3f594d79b627b --- /dev/null +++ b/connector/connect/common/src/main/scala/org/apache/spark/sql/connect/common/config/ConnectCommon.scala @@ -0,0 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
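For completeness, a serialization round trip for UdfPacket as a rough sketch (the choice of AgnosticEncoders.StringEncoder is an assumption; any concrete AgnosticEncoder would do):

import org.apache.spark.sql.catalyst.encoders.AgnosticEncoders.StringEncoder
import org.apache.spark.sql.connect.common.UdfPacket

// Pack a serializable function together with its encoders, ship it as bytes,
// and read it back the way the server side would.
val toUpper: String => String = (s: String) => s.toUpperCase
val packet = UdfPacket(toUpper, Seq(StringEncoder), StringEncoder)
val restored = UdfPacket(packet.toByteString)
assert(restored.inputEncoders.length == 1)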
+ */ +package org.apache.spark.sql.connect.common.config + +private[connect] object ConnectCommon { + val CONNECT_GRPC_BINDING_PORT: Int = 15002 + val CONNECT_GRPC_MAX_MESSAGE_SIZE: Int = 128 * 1024 * 1024; +} diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/alias_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/alias_string.explain new file mode 100644 index 0000000000000..4e31a67c18f80 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/alias_string.explain @@ -0,0 +1,2 @@ +SubqueryAlias fooz ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/alias_symbol.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/alias_symbol.explain new file mode 100644 index 0000000000000..552164a15c6d7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/alias_symbol.explain @@ -0,0 +1,2 @@ +SubqueryAlias bob ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/apply.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/apply.explain new file mode 100644 index 0000000000000..a01142070a531 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/apply.explain @@ -0,0 +1,2 @@ +Project [a#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/as_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/as_string.explain new file mode 100644 index 0000000000000..f192daed1a3f9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/as_string.explain @@ -0,0 +1,2 @@ +SubqueryAlias foo ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/as_symbol.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/as_symbol.explain new file mode 100644 index 0000000000000..08bb0d1a86abf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/as_symbol.explain @@ -0,0 +1,2 @@ +SubqueryAlias bar ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/coalesce.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/coalesce.explain new file mode 100644 index 0000000000000..5d300bdd16250 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/coalesce.explain @@ -0,0 +1,2 @@ +Repartition 5, false ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/col.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/col.explain new file mode 100644 index 0000000000000..6219ddc79c1de --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/col.explain @@ -0,0 +1,2 @@ +Project [id#0L, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/colRegex.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/colRegex.explain new file mode 100644 index 0000000000000..c0a9b3df30b26 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/colRegex.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_add.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_add.explain new file mode 100644 index 0000000000000..a00233be9c556 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_add.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) + b#0) AS (a + b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_alias.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_alias.explain new file mode 100644 index 0000000000000..aa9a6af225846 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_alias.explain @@ -0,0 +1,2 @@ +Project [a#0 AS b#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_and.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_and.explain new file mode 100644 index 0000000000000..c65419786287e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_and.explain @@ -0,0 +1,2 @@ +Project [((a#0 > 10) AND (b#0 < 0.5)) AS ((a > 10) AND (b < 0.5))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_apply.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_apply.explain new file mode 100644 index 0000000000000..06025418e24f6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_apply.explain @@ -0,0 +1,2 @@ +Project [f#0[super_duper_key] AS f[super_duper_key]#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_as_multi.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_as_multi.explain new file mode 100644 index 0000000000000..097223afda75d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_as_multi.explain @@ -0,0 +1,3 @@ +Project [v1#0L, v2#0, v3#0] ++- Generate inline(map_values(f#0)), false, [v1#0L, v2#0, v3#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_as_with_metadata.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_as_with_metadata.explain new file mode 100644 index 0000000000000..9b0e538bf1628 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_as_with_metadata.explain @@ -0,0 +1,2 @@ +Project [e#0 AS e_mod#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc.explain new file mode 100644 index 0000000000000..1223297b2d438 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc.explain @@ -0,0 
+1,2 @@ +Sort [a#0 ASC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc_nulls_first.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc_nulls_first.explain new file mode 100644 index 0000000000000..1223297b2d438 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc_nulls_first.explain @@ -0,0 +1,2 @@ +Sort [a#0 ASC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc_nulls_last.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc_nulls_last.explain new file mode 100644 index 0000000000000..62d108d46d527 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_asc_nulls_last.explain @@ -0,0 +1,2 @@ +Sort [a#0 ASC NULLS LAST], true ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_between.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_between.explain new file mode 100644 index 0000000000000..140fefe250f89 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_between.explain @@ -0,0 +1,2 @@ +Project [((a#0 >= 10) AND (a#0 <= 20)) AS ((a >= 10) AND (a <= 20))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseAND.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseAND.explain new file mode 100644 index 0000000000000..497e839477e28 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseAND.explain @@ -0,0 +1,2 @@ +Project [(a#0 & 255) AS (a & 255)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseOR.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseOR.explain new file mode 100644 index 0000000000000..ceb4f23b43905 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseOR.explain @@ -0,0 +1,2 @@ +Project [(a#0 | 7) AS (a | 7)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseXOR.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseXOR.explain new file mode 100644 index 0000000000000..efb99f2993fb6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_bitwiseXOR.explain @@ -0,0 +1,2 @@ +Project [(a#0 ^ 78) AS (a ^ 78)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_cast.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_cast.explain new file mode 100644 index 0000000000000..88451be9338a8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_cast.explain @@ -0,0 +1,2 @@ +Project [cast(a#0 as bigint) AS a#0L] ++- 
LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_contains.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_contains.explain new file mode 100644 index 0000000000000..15b561a68f1a0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_contains.explain @@ -0,0 +1,2 @@ +Project [Contains(g#0, baz) AS contains(g, baz)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc.explain new file mode 100644 index 0000000000000..89f4080296ac1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc.explain @@ -0,0 +1,2 @@ +Sort [b#0 DESC NULLS LAST], true ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc_nulls_first.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc_nulls_first.explain new file mode 100644 index 0000000000000..bb12e4aabc946 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc_nulls_first.explain @@ -0,0 +1,2 @@ +Sort [b#0 DESC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc_nulls_last.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc_nulls_last.explain new file mode 100644 index 0000000000000..89f4080296ac1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_desc_nulls_last.explain @@ -0,0 +1,2 @@ +Sort [b#0 DESC NULLS LAST], true ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_divide.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_divide.explain new file mode 100644 index 0000000000000..8e8e4de67a3f6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_divide.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) / cast(b#0 as double)) AS (a / b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_dropFields.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_dropFields.explain new file mode 100644 index 0000000000000..3216a4b916084 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_dropFields.explain @@ -0,0 +1,2 @@ +Project [update_fields(d#0, dropfield(a), dropfield(c)) AS update_fields(d, dropfield(), dropfield())#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_endsWith.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_endsWith.explain new file mode 100644 index 0000000000000..b9144451a2763 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_endsWith.explain @@ -0,0 +1,2 @@ +Project [EndsWith(g#0, 
suffix_) AS endswith(g, suffix_)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_eqNullSafe.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_eqNullSafe.explain new file mode 100644 index 0000000000000..8b43c43c8de32 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_eqNullSafe.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) <=> b#0) AS (a <=> b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_equals.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_equals.explain new file mode 100644 index 0000000000000..84abc512662cf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_equals.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) = b#0) AS (a = b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_geq.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_geq.explain new file mode 100644 index 0000000000000..e7a922f21a763 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_geq.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) >= b#0) AS (a >= b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_getField.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_getField.explain new file mode 100644 index 0000000000000..602251709f980 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_getField.explain @@ -0,0 +1,2 @@ +Project [d#0.b AS d.b#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_getItem.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_getItem.explain new file mode 100644 index 0000000000000..31d522711c580 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_getItem.explain @@ -0,0 +1,2 @@ +Project [e#0[3] AS e[3]#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_gt.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_gt.explain new file mode 100644 index 0000000000000..3f3f44080423c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_gt.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) > b#0) AS (a > b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_ilike.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_ilike.explain new file mode 100644 index 0000000000000..1fb1a4718a04b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_ilike.explain @@ -0,0 +1,2 @@ +Project [g#0 LIKE %fOb% AS g LIKE %fOb%#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNaN.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNaN.explain new file mode 100644 index 0000000000000..a93e063e4e136 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNaN.explain @@ -0,0 +1,2 @@ +Project [isnan(b#0) AS isnan(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNotNull.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNotNull.explain new file mode 100644 index 0000000000000..bae67b1787150 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNotNull.explain @@ -0,0 +1,2 @@ +Project [isnotnull(g#0) AS (g IS NOT NULL)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNull.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNull.explain new file mode 100644 index 0000000000000..085d77bc5e89f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isNull.explain @@ -0,0 +1,2 @@ +Project [isnull(g#0) AS (g IS NULL)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_isin.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isin.explain new file mode 100644 index 0000000000000..f3e33acbaa8f6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_isin.explain @@ -0,0 +1,2 @@ +Project [g#0 IN (hello,world,foo) AS (g IN (hello, world, foo))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_leq.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_leq.explain new file mode 100644 index 0000000000000..dc23b7fabaf9c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_leq.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) <= b#0) AS (a <= b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_like.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_like.explain new file mode 100644 index 0000000000000..ade16bb349aaa --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_like.explain @@ -0,0 +1,2 @@ +Project [g#0 LIKE %bob% AS g LIKE %bob%#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_lt.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_lt.explain new file mode 100644 index 0000000000000..62c664e3b25c2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_lt.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) < b#0) AS (a < b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_modulo.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/column_modulo.explain new file mode 100644 index 0000000000000..b2fa105afc1fc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_modulo.explain @@ -0,0 +1,2 @@ +Project [(a#0 % 10) AS (a % 10)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_multiply.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_multiply.explain new file mode 100644 index 0000000000000..14a8a180ffa06 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_multiply.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) * b#0) AS (a * b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_not.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_not.explain new file mode 100644 index 0000000000000..3bf350c7964b4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_not.explain @@ -0,0 +1,2 @@ +Project [NOT true AS (NOT true)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_not_equals.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_not_equals.explain new file mode 100644 index 0000000000000..249c00c568e75 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_not_equals.explain @@ -0,0 +1,2 @@ +Project [NOT (cast(a#0 as double) = b#0) AS (NOT (a = b))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_or.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_or.explain new file mode 100644 index 0000000000000..1447d506d3333 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_or.explain @@ -0,0 +1,2 @@ +Project [((a#0 > 10) OR (b#0 < 0.5)) AS ((a > 10) OR (b < 0.5))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_rlike.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_rlike.explain new file mode 100644 index 0000000000000..89a351f1ec7b1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_rlike.explain @@ -0,0 +1,2 @@ +Project [g#0 LIKE ^[0-9]*$ AS g LIKE ^[0-9]*$#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_star.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_star.explain new file mode 100644 index 0000000000000..d2bcd89c109ac --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_star.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_star_with_target.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/column_star_with_target.explain new file mode 100644 index 0000000000000..0ae702c2c6bd2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_star_with_target.explain @@ -0,0 +1,2 @@ +Project [d#0.id AS id#0L, d#0.a AS a#0, d#0.b AS b#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_startsWith.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_startsWith.explain new file mode 100644 index 0000000000000..0e4d63ca6001c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_startsWith.explain @@ -0,0 +1,2 @@ +Project [StartsWith(g#0, prefix_) AS startswith(g, prefix_)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_substr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_substr.explain new file mode 100644 index 0000000000000..b9c8b8646c960 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_substr.explain @@ -0,0 +1,2 @@ +Project [substr(g#0, 8, 3) AS substr(g, 8, 3)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_subtract.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_subtract.explain new file mode 100644 index 0000000000000..9b1eb4866d1b1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_subtract.explain @@ -0,0 +1,2 @@ +Project [(cast(a#0 as double) - b#0) AS (a - b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_unary_minus.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_unary_minus.explain new file mode 100644 index 0000000000000..b2b7bcfb85563 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_unary_minus.explain @@ -0,0 +1,2 @@ +Project [-1 AS negative(1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_when_otherwise.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_when_otherwise.explain new file mode 100644 index 0000000000000..62858aad0df57 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_when_otherwise.explain @@ -0,0 +1,2 @@ +Project [CASE WHEN (a#0 < 10) THEN low WHEN (a#0 < 20) THEN medium ELSE high END AS CASE WHEN (a < 10) THEN low WHEN (a < 20) THEN medium ELSE high END#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/column_withField.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/column_withField.explain new file mode 100644 index 0000000000000..575fe0b0fd751 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/column_withField.explain @@ -0,0 +1,2 @@ +Project [update_fields(d#0, WithField(x, xq)) AS update_fields(d, WithField(xq))#0] ++- 
LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/crossJoin.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/crossJoin.explain new file mode 100644 index 0000000000000..612f8337ac9b2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/crossJoin.explain @@ -0,0 +1,3 @@ +'Join Cross +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/crosstab.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/crosstab.explain new file mode 100644 index 0000000000000..a30cd136e8db8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/crosstab.explain @@ -0,0 +1,5 @@ +Project [a_b#0] ++- Project [a_b#0] + +- Aggregate [a_b#0], [a_b#0, pivotfirst(__pivot_col#0, count(1) AS count#0L, 0, 0) AS __pivot_count(1) AS count AS `count(1) AS count`#0] + +- Aggregate [CASE WHEN isnull(a#0) THEN null ELSE cast(a#0 as string) END, CASE WHEN isnull(b#0) THEN null ELSE regexp_replace(cast(b#0 as string), `, , 1) END], [CASE WHEN isnull(a#0) THEN null ELSE cast(a#0 as string) END AS a_b#0, CASE WHEN isnull(b#0) THEN null ELSE regexp_replace(cast(b#0 as string), `, , 1) END AS __pivot_col#0, count(1) AS count(1) AS count#0L] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/csv_from_dataset.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/csv_from_dataset.explain new file mode 100644 index 0000000000000..9fbaa9fcede81 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/csv_from_dataset.explain @@ -0,0 +1 @@ +LogicalRDD [c1#0, c2#0], false diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/cube_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/cube_column.explain new file mode 100644 index 0000000000000..1721162f4783f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/cube_column.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, b#0, spark_grouping_id#0L], [a#0, b#0, count(1) AS count#0L] ++- Expand [[id#0L, a#0, b#0, a#0, b#0, 0], [id#0L, a#0, b#0, a#0, null, 1], [id#0L, a#0, b#0, null, b#0, 2], [id#0L, a#0, b#0, null, null, 3]], [id#0L, a#0, b#0, a#0, b#0, spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0, b#0 AS b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/cube_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/cube_string.explain new file mode 100644 index 0000000000000..1721162f4783f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/cube_string.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, b#0, spark_grouping_id#0L], [a#0, b#0, count(1) AS count#0L] ++- Expand [[id#0L, a#0, b#0, a#0, b#0, 0], [id#0L, a#0, b#0, a#0, null, 1], [id#0L, a#0, b#0, null, b#0, 2], [id#0L, a#0, b#0, null, null, 3]], [id#0L, a#0, b#0, a#0, b#0, spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0, b#0 AS b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/describe.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/describe.explain new file mode 100644 index 0000000000000..f205f7ef7a140 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/describe.explain @@ -0,0 +1,6 @@ +Project [summary#0, element_at(id#0, summary#0, None, false) AS id#0, element_at(b#0, summary#0, None, false) AS b#0] ++- Project [id#0, b#0, summary#0] + +- Generate explode([count,mean,stddev,min,max]), false, [summary#0] + +- Aggregate [map(cast(count as string), cast(count(id#0L) as string), cast(mean as string), cast(avg(id#0L) as string), cast(stddev as string), cast(stddev_samp(cast(id#0L as double)) as string), cast(min as string), cast(min(id#0L) as string), cast(max as string), cast(max(id#0L) as string)) AS id#0, map(cast(count as string), cast(count(b#0) as string), cast(mean as string), cast(avg(b#0) as string), cast(stddev as string), cast(stddev_samp(b#0) as string), cast(min as string), cast(min(b#0) as string), cast(max as string), cast(max(b#0) as string)) AS b#0] + +- Project [id#0L, b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/distinct.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/distinct.explain new file mode 100644 index 0000000000000..e809829dfa59e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/distinct.explain @@ -0,0 +1,2 @@ +Deduplicate [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/drop.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/drop.explain new file mode 100644 index 0000000000000..85a15dfab8da4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/drop.explain @@ -0,0 +1,2 @@ +Filter atleastnnonnulls(5, id#0L, a#0) ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates.explain new file mode 100644 index 0000000000000..e809829dfa59e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates.explain @@ -0,0 +1,2 @@ +Deduplicate [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_names_array.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_names_array.explain new file mode 100644 index 0000000000000..6a85a347caf50 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_names_array.explain @@ -0,0 +1,2 @@ +Deduplicate [a#0, id#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_names_seq.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_names_seq.explain new file mode 100644 index 0000000000000..5af0ec857d264 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_names_seq.explain @@ -0,0 +1,2 @@ +Deduplicate [a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_varargs.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_varargs.explain new file mode 100644 index 0000000000000..0a9079864d5da --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/dropDuplicates_varargs.explain @@ -0,0 +1,2 @@ +Deduplicate [a#0, b#0, id#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/drop_multiple_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_multiple_column.explain new file mode 100644 index 0000000000000..a01142070a531 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_multiple_column.explain @@ -0,0 +1,2 @@ +Project [a#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/drop_multiple_strings.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_multiple_strings.explain new file mode 100644 index 0000000000000..bdd178772c8d9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_multiple_strings.explain @@ -0,0 +1,2 @@ +Project ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/drop_single_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_single_column.explain new file mode 100644 index 0000000000000..c0a9b3df30b26 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_single_column.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/drop_single_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_single_string.explain new file mode 100644 index 0000000000000..6219ddc79c1de --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/drop_single_string.explain @@ -0,0 +1,2 @@ +Project [id#0L, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/except.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/except.explain new file mode 100644 index 0000000000000..d9e8cc93af5d2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/except.explain @@ -0,0 +1,3 @@ +'Except false +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/exceptAll.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/exceptAll.explain new file mode 100644 index 0000000000000..5519e85b1fe03 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/exceptAll.explain @@ -0,0 +1,3 @@ +'Except All true +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/expression_extension.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/expression_extension.explain new file mode 100644 index 0000000000000..7426332004a81 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/expression_extension.explain @@ -0,0 +1,2 @@ +Project [id#0L AS abc#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/fill.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/fill.explain new file mode 100644 index 0000000000000..12d9bff0e8a61 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/fill.explain @@ -0,0 +1,2 @@ +Project [coalesce(id#0L, cast(8 as bigint)) AS id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/filter.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/filter.explain new file mode 100644 index 0000000000000..442db6ee85acc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/filter.explain @@ -0,0 +1,2 @@ +Filter (id#0L = 10) ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/filter_expr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/filter_expr.explain new file mode 100644 index 0000000000000..831e7c6cf5705 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/filter_expr.explain @@ -0,0 +1,2 @@ +Filter (EXP(cast(a#0 as double)) < cast(10.0 as double)) ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/freqItems.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/freqItems.explain new file mode 100644 index 0000000000000..31ef46e24242d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/freqItems.explain @@ -0,0 +1,2 @@ +Aggregate [collect_frequent_items(id#0L, 10, 0, 0) AS id_freqItems#0, collect_frequent_items(a#0, 10, 0, 0) AS a_freqItems#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_abs.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_abs.explain new file mode 100644 index 0000000000000..78093ca5448b7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_abs.explain @@ -0,0 +1,2 @@ +Project [abs(a#0) AS abs(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_acos.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_acos.explain new file mode 100644 index 0000000000000..e14a10911a4a2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_acos.explain @@ -0,0 +1,2 @@ +Project [ACOS(b#0) AS ACOS(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_acosh.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_acosh.explain new file mode 100644 index 0000000000000..735c181fd10d3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_acosh.explain @@ -0,0 +1,2 @@ +Project [ACOSH(b#0) AS ACOSH(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_add_months.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_add_months.explain new file mode 100644 index 0000000000000..b50a63afcde2a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_add_months.explain @@ -0,0 +1,2 @@ +Project [add_months(d#0, 2) AS add_months(d, 2)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_aggregate.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aggregate.explain new file mode 100644 index 0000000000000..31fe84066f8c7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_aggregate.explain @@ -0,0 +1,2 @@ +Project [aggregate(e#0, 0, lambdafunction((lambda x#0 + lambda y#0), lambda x#0, lambda y#0, false), lambdafunction(lambda x#0, lambda x#0, false)) AS aggregate(e, 0, lambdafunction((namedlambdavariable() + namedlambdavariable()), namedlambdavariable(), namedlambdavariable()), lambdafunction(namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_approx_count_distinct.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_approx_count_distinct.explain new file mode 100644 index 0000000000000..2b002841dfc04 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_approx_count_distinct.explain @@ -0,0 +1,2 @@ +Aggregate [approx_count_distinct(a#0, 0.05, 0, 0) AS approx_count_distinct(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_approx_count_distinct_rsd.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_approx_count_distinct_rsd.explain new file mode 100644 index 0000000000000..454b8a0ecc244 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_approx_count_distinct_rsd.explain @@ -0,0 +1,2 @@ +Aggregate [approx_count_distinct(a#0, 0.1, 0, 0) AS approx_count_distinct(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array.explain new file mode 100644 index 0000000000000..63726ee039bbe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array.explain @@ -0,0 +1,2 @@ +Project [array(a#0, a#0) AS array(a, a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_append.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_append.explain new file mode 100644 index 0000000000000..ca2804ebb603c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_append.explain @@ -0,0 +1,2 @@ +Project [array_append(e#0, 1) AS array_append(e, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_compact.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_compact.explain new file mode 100644 index 0000000000000..a78195c4ae295 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_compact.explain @@ -0,0 +1,2 @@ +Project [filter(e#0, lambdafunction(isnotnull(lambda arg#0), lambda arg#0, false)) AS array_compact(e)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_contains.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_contains.explain new file mode 100644 index 0000000000000..ecfd647863b3a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_contains.explain @@ -0,0 +1,2 @@ +Project [array_contains(e#0, 1) AS array_contains(e, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_distinct.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_distinct.explain new file mode 100644 index 0000000000000..efe98a93b01fa --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_distinct.explain @@ -0,0 +1,2 @@ +Project [array_distinct(e#0) AS array_distinct(e)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_except.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_except.explain new file mode 100644 index 0000000000000..5b667f60cb503 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_except.explain @@ -0,0 +1,2 @@ +Project [array_except(e#0, array(1, 2, 4)) AS array_except(e, array(1, 2, 4))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_insert.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_insert.explain new file mode 100644 index 0000000000000..edcd790596bd2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_insert.explain @@ -0,0 +1,2 @@ +Project [array_insert(e#0, 0, 1) AS array_insert(e, 0, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_intersect.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_intersect.explain new file mode 100644 index 0000000000000..db862ee9697a9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_intersect.explain @@ -0,0 +1,2 @@ +Project [array_intersect(e#0, array(10, 4)) AS array_intersect(e, array(10, 4))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_join.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_join.explain new file mode 100644 index 
0000000000000..993bb6b8207f0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_join.explain @@ -0,0 +1,2 @@ +Project [array_join(cast(e#0 as array), ;, None) AS array_join(e, ;)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_join_with_null_replacement.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_join_with_null_replacement.explain new file mode 100644 index 0000000000000..0a93be004169e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_join_with_null_replacement.explain @@ -0,0 +1,2 @@ +Project [array_join(cast(e#0 as array), ;, Some(null)) AS array_join(e, ;, null)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_max.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_max.explain new file mode 100644 index 0000000000000..76a12cb50c53f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_max.explain @@ -0,0 +1,2 @@ +Project [array_max(e#0) AS array_max(e)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_min.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_min.explain new file mode 100644 index 0000000000000..e11dfe2e471d0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_min.explain @@ -0,0 +1,2 @@ +Project [array_min(e#0) AS array_min(e)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_position.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_position.explain new file mode 100644 index 0000000000000..cd3ca8313c19e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_position.explain @@ -0,0 +1,2 @@ +Project [array_position(e#0, 10) AS array_position(e, 10)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_remove.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_remove.explain new file mode 100644 index 0000000000000..c9aea402dc7e5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_remove.explain @@ -0,0 +1,2 @@ +Project [array_remove(e#0, 314) AS array_remove(e, 314)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_repeat.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_repeat.explain new file mode 100644 index 0000000000000..f4417df82305a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_repeat.explain @@ -0,0 +1,2 @@ +Project [array_repeat(a#0, 10) AS array_repeat(a, 10)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_sort.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_sort.explain new file mode 100644 index 0000000000000..a8bb75836a462 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_sort.explain @@ -0,0 +1,2 @@ +Project [array_sort(e#0, lambdafunction(if ((isnull(lambda left#0) AND isnull(lambda right#0))) 0 else if (isnull(lambda left#0)) 1 else if (isnull(lambda right#0)) -1 else if ((lambda left#0 < lambda right#0)) -1 else if ((lambda left#0 > lambda right#0)) 1 else 0, lambda left#0, lambda right#0, false), false) AS array_sort(e, lambdafunction((IF(((namedlambdavariable() IS NULL) AND (namedlambdavariable() IS NULL)), 0, (IF((namedlambdavariable() IS NULL), 1, (IF((namedlambdavariable() IS NULL), -1, (IF((namedlambdavariable() < namedlambdavariable()), -1, (IF((namedlambdavariable() > namedlambdavariable()), 1, 0)))))))))), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_sort_with_comparator.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_sort_with_comparator.explain new file mode 100644 index 0000000000000..cd86bcc5ffdf5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_sort_with_comparator.explain @@ -0,0 +1,2 @@ +Project [array_sort(e#0, lambdafunction((lambda x#0 - lambda y#0), lambda x#0, lambda y#0, false), false) AS array_sort(e, lambdafunction((namedlambdavariable() - namedlambdavariable()), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_union.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_union.explain new file mode 100644 index 0000000000000..31e07099c3fde --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_array_union.explain @@ -0,0 +1,2 @@ +Project [array_union(e#0, array(1, 2, 3)) AS array_union(e, array(1, 2, 3))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_arrays_overlap.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_arrays_overlap.explain new file mode 100644 index 0000000000000..0316f35ff9fcf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_arrays_overlap.explain @@ -0,0 +1,2 @@ +Project [arrays_overlap(e#0, array(1, 2)) AS arrays_overlap(e, array(1, 2))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_arrays_zip.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_arrays_zip.explain new file mode 100644 index 0000000000000..0dc3f43b074dc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_arrays_zip.explain @@ -0,0 +1,2 @@ +Project [arrays_zip(e#0, sequence(cast(1 as bigint), cast(20 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles)), e, 1) AS arrays_zip(e, 
sequence(1, 20, 1))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc.explain new file mode 100644 index 0000000000000..8052f75b66506 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc.explain @@ -0,0 +1,2 @@ +Project [a#0 ASC NULLS FIRST AS a ASC NULLS FIRST#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc_nulls_first.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc_nulls_first.explain new file mode 100644 index 0000000000000..8052f75b66506 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc_nulls_first.explain @@ -0,0 +1,2 @@ +Project [a#0 ASC NULLS FIRST AS a ASC NULLS FIRST#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc_nulls_last.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc_nulls_last.explain new file mode 100644 index 0000000000000..3dcafb3bc9ad4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asc_nulls_last.explain @@ -0,0 +1,2 @@ +Project [a#0 ASC NULLS LAST AS a ASC NULLS LAST#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_ascii.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ascii.explain new file mode 100644 index 0000000000000..4440c59d8d135 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ascii.explain @@ -0,0 +1,2 @@ +Project [ascii(g#0) AS ascii(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_asin.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asin.explain new file mode 100644 index 0000000000000..d71385d4912be --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asin.explain @@ -0,0 +1,2 @@ +Project [ASIN(b#0) AS ASIN(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_asinh.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asinh.explain new file mode 100644 index 0000000000000..c341d6c7f35c5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_asinh.explain @@ -0,0 +1,2 @@ +Project [ASINH(b#0) AS ASINH(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_assert_true_with_message.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_assert_true_with_message.explain new file mode 100644 index 0000000000000..dfd0468941b0d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_assert_true_with_message.explain @@ -0,0 +1,2 @@ +Project 
[if ((id#0L > cast(0 as bigint))) null else raise_error(id negative!, NullType) AS assert_true((id > 0), id negative!)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_atan.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_atan.explain new file mode 100644 index 0000000000000..4be28fb223696 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_atan.explain @@ -0,0 +1,2 @@ +Project [ATAN(b#0) AS ATAN(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_atan2.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_atan2.explain new file mode 100644 index 0000000000000..ebc8f138e7bd0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_atan2.explain @@ -0,0 +1,2 @@ +Project [ATAN2(cast(a#0 as double), b#0) AS ATAN2(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_atanh.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_atanh.explain new file mode 100644 index 0000000000000..68082ca2ec6f9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_atanh.explain @@ -0,0 +1,2 @@ +Project [ATANH(b#0) AS ATANH(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_avg.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_avg.explain new file mode 100644 index 0000000000000..f2849464dff84 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_avg.explain @@ -0,0 +1,2 @@ +Aggregate [avg(a#0) AS avg(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_base64.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_base64.explain new file mode 100644 index 0000000000000..bc3c6e4bb2bcf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_base64.explain @@ -0,0 +1,2 @@ +Project [base64(cast(g#0 as binary)) AS base64(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bin.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bin.explain new file mode 100644 index 0000000000000..00fe43204c9c3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bin.explain @@ -0,0 +1,2 @@ +Project [bin(cast(b#0 as bigint)) AS bin(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bit_length.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bit_length.explain new file mode 100644 index 0000000000000..2953bbefa9ef0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bit_length.explain @@ -0,0 +1,2 @@ +Project 
[bit_length(g#0) AS bit_length(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitwise_not.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitwise_not.explain new file mode 100644 index 0000000000000..022a2d5095cba --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bitwise_not.explain @@ -0,0 +1,2 @@ +Project [~a#0 AS ~a#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bround.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bround.explain new file mode 100644 index 0000000000000..8bc86462fa24c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bround.explain @@ -0,0 +1,2 @@ +Project [round(b#0, 2) AS round(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain new file mode 100644 index 0000000000000..8ab0c9493aba6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_bucket.explain @@ -0,0 +1,2 @@ +Project [bucket(3, a#0) AS bucket(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_ceil.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ceil.explain new file mode 100644 index 0000000000000..9cf776a8dbaa7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ceil.explain @@ -0,0 +1,2 @@ +Project [CEIL(b#0) AS CEIL(b)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_ceil_scale.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ceil_scale.explain new file mode 100644 index 0000000000000..cdf8d356e47dd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ceil_scale.explain @@ -0,0 +1,2 @@ +Project [ceil(cast(b#0 as decimal(30,15)), 2) AS ceil(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_coalesce.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_coalesce.explain new file mode 100644 index 0000000000000..61f494f695555 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_coalesce.explain @@ -0,0 +1,2 @@ +Project [coalesce(a#0, 3) AS coalesce(a, 3)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_col.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_col.explain new file mode 100644 index 0000000000000..4519922d4cbaf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_col.explain @@ -0,0 +1,2 @@ +Project [id#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] 
diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_collect_list.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_collect_list.explain new file mode 100644 index 0000000000000..102f736c62ef6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_collect_list.explain @@ -0,0 +1,2 @@ +Aggregate [collect_list(a#0, 0, 0) AS collect_list(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_collect_set.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_collect_set.explain new file mode 100644 index 0000000000000..18246a74ccc98 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_collect_set.explain @@ -0,0 +1,2 @@ +Aggregate [collect_set(a#0, 0, 0) AS collect_set(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_concat.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_concat.explain new file mode 100644 index 0000000000000..4d765e5a9c3e6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_concat.explain @@ -0,0 +1,2 @@ +Project [concat(cast(e#0 as array), cast(array(1, 2) as array), sequence(cast(33 as bigint), cast(40 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles))) AS concat(e, array(1, 2), sequence(33, 40, 1))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_concat_ws.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_concat_ws.explain new file mode 100644 index 0000000000000..d7c7e3af456dd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_concat_ws.explain @@ -0,0 +1,2 @@ +Project [concat_ws(-, cast(b#0 as string), world, cast(id#0L as string)) AS concat_ws(-, b, world, id)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_conv.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_conv.explain new file mode 100644 index 0000000000000..5b01bea3bea88 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_conv.explain @@ -0,0 +1,2 @@ +Project [conv(cast(b#0 as string), 10, 16, false) AS conv(b, 10, 16)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_corr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_corr.explain new file mode 100644 index 0000000000000..1f7f4507e5e8f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_corr.explain @@ -0,0 +1,2 @@ +Aggregate [corr(cast(a#0 as double), b#0) AS corr(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_cos.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cos.explain new file mode 100644 index 
0000000000000..7eaea8731923a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cos.explain @@ -0,0 +1,2 @@ +Project [COS(b#0) AS COS(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_cosh.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cosh.explain new file mode 100644 index 0000000000000..55c72d81fac74 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cosh.explain @@ -0,0 +1,2 @@ +Project [COSH(b#0) AS COSH(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_cot.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cot.explain new file mode 100644 index 0000000000000..c4c720620e5eb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cot.explain @@ -0,0 +1,2 @@ +Project [COT(b#0) AS COT(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_count.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count.explain new file mode 100644 index 0000000000000..af2e5c27d5d23 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count.explain @@ -0,0 +1,2 @@ +Aggregate [count(a#0) AS count(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_countDistinct.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_countDistinct.explain new file mode 100644 index 0000000000000..f74490f7b1fcf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_countDistinct.explain @@ -0,0 +1,2 @@ +Aggregate [count(distinct a#0, g#0) AS count(DISTINCT a, g)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_typed.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_typed.explain new file mode 100644 index 0000000000000..200513a11810d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_count_typed.explain @@ -0,0 +1,2 @@ +Aggregate [count(a#0) AS count(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_covar_pop.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_covar_pop.explain new file mode 100644 index 0000000000000..eb090cbbb1445 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_covar_pop.explain @@ -0,0 +1,2 @@ +Aggregate [covar_pop(cast(a#0 as double), b#0) AS covar_pop(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_covar_samp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_covar_samp.explain new file mode 100644 index 0000000000000..24dc636cfaac9 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_covar_samp.explain @@ -0,0 +1,2 @@ +Aggregate [covar_samp(cast(a#0 as double), b#0) AS covar_samp(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_crc32.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_crc32.explain new file mode 100644 index 0000000000000..abd5c1b135b62 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_crc32.explain @@ -0,0 +1,2 @@ +Project [crc32(cast(g#0 as binary)) AS crc32(g)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_csc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_csc.explain new file mode 100644 index 0000000000000..db0380a8d7e0f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_csc.explain @@ -0,0 +1,2 @@ +Project [CSC(b#0) AS CSC(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_cume_dist.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cume_dist.explain new file mode 100644 index 0000000000000..4f15f83bb9fb4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_cume_dist.explain @@ -0,0 +1,5 @@ +Project [cume_dist() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [a#0, id#0L, cume_dist() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0, cume_dist() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [cume_dist() windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS cume_dist() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [a#0, id#0L] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_current_date.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_current_date.explain new file mode 100644 index 0000000000000..5305b346c4f2d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_current_date.explain @@ -0,0 +1,2 @@ +Project [current_date(Some(America/Los_Angeles)) AS current_date()#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_current_timestamp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_current_timestamp.explain new file mode 100644 index 0000000000000..51631e9719b65 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_current_timestamp.explain @@ -0,0 +1,2 @@ +Project [current_timestamp() AS current_timestamp()#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_add.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_add.explain new file mode 100644 index 0000000000000..66325085b9c14 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_add.explain @@ -0,0 +1,2 @@ +Project [date_add(d#0, 2) AS date_add(d, 2)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_format.explain new file mode 100644 index 0000000000000..5248ab9bc6c62 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_format.explain @@ -0,0 +1,2 @@ +Project [date_format(cast(d#0 as timestamp), yyyy-MM-dd, Some(America/Los_Angeles)) AS date_format(d, yyyy-MM-dd)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_sub.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_sub.explain new file mode 100644 index 0000000000000..40f3140c5b121 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_sub.explain @@ -0,0 +1,2 @@ +Project [date_sub(d#0, 2) AS date_sub(d, 2)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_trunc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_trunc.explain new file mode 100644 index 0000000000000..b2db5111e8999 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_date_trunc.explain @@ -0,0 +1,2 @@ +Project [trunc(cast(t#0 as date), minute) AS trunc(t, minute)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_datediff.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datediff.explain new file mode 100644 index 0000000000000..fa24b087d0b15 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_datediff.explain @@ -0,0 +1,2 @@ +Project [datediff(d#0, make_date(2020, 10, 10, false)) AS datediff(d, make_date(2020, 10, 10))#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofmonth.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofmonth.explain new file mode 100644 index 0000000000000..6f1841693e34a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofmonth.explain @@ -0,0 +1,2 @@ +Project [dayofmonth(d#0) AS dayofmonth(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofweek.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofweek.explain new file mode 100644 index 0000000000000..af4e15ca013f0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofweek.explain @@ -0,0 +1,2 @@ +Project 
[dayofweek(d#0) AS dayofweek(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofyear.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofyear.explain new file mode 100644 index 0000000000000..deadbe011fca9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dayofyear.explain @@ -0,0 +1,2 @@ +Project [dayofyear(d#0) AS dayofyear(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain new file mode 100644 index 0000000000000..16ca2fe415e28 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_days.explain @@ -0,0 +1,2 @@ +Project [days(a#0) AS days(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain new file mode 100644 index 0000000000000..3b8e1eea57676 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_decode.explain @@ -0,0 +1,2 @@ +Project [decode(cast(g#0 as binary), UTF-8) AS decode(g, UTF-8)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_degrees.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_degrees.explain new file mode 100644 index 0000000000000..47caf440868ea --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_degrees.explain @@ -0,0 +1,2 @@ +Project [DEGREES(b#0) AS DEGREES(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_dense_rank.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dense_rank.explain new file mode 100644 index 0000000000000..0cce71ad1d834 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_dense_rank.explain @@ -0,0 +1,5 @@ +Project [DENSE_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [id#0L, a#0, DENSE_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0, DENSE_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [dense_rank(id#0L) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS DENSE_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [id#0L, a#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc.explain new file mode 100644 index 0000000000000..5985746892b9e --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc.explain @@ -0,0 +1,2 @@ +Project [a#0 DESC NULLS LAST AS a DESC NULLS LAST#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc_nulls_first.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc_nulls_first.explain new file mode 100644 index 0000000000000..5d170cb362849 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc_nulls_first.explain @@ -0,0 +1,2 @@ +Project [a#0 DESC NULLS FIRST AS a DESC NULLS FIRST#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc_nulls_last.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc_nulls_last.explain new file mode 100644 index 0000000000000..5985746892b9e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_desc_nulls_last.explain @@ -0,0 +1,2 @@ +Project [a#0 DESC NULLS LAST AS a DESC NULLS LAST#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_element_at.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_element_at.explain new file mode 100644 index 0000000000000..45c17a4ccd501 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_element_at.explain @@ -0,0 +1,2 @@ +Project [element_at(f#0, bob, None, false) AS element_at(f, bob)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain new file mode 100644 index 0000000000000..56da919abf4c5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_encode.explain @@ -0,0 +1,2 @@ +Project [encode(g#0, UTF-8) AS encode(g, UTF-8)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_exists.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_exists.explain new file mode 100644 index 0000000000000..1fab4ccb3a86a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_exists.explain @@ -0,0 +1,2 @@ +Project [exists(e#0, lambdafunction((lambda x#0 > 10), lambda x#0, false)) AS exists(e, lambdafunction((namedlambdavariable() > 10), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_exp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_exp.explain new file mode 100644 index 0000000000000..e7299c1f2c7f0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_exp.explain @@ -0,0 +1,2 @@ +Project [EXP(b#0) AS EXP(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_explode.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_explode.explain new file mode 100644 index 0000000000000..1f1792761f622 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_explode.explain @@ -0,0 +1,3 @@ +Project [col#0] ++- Generate explode(e#0), false, [col#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_explode_outer.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_explode_outer.explain new file mode 100644 index 0000000000000..3ee29e734dc64 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_explode_outer.explain @@ -0,0 +1,3 @@ +Project [col#0] ++- Generate explode(e#0), true, [col#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_expm1.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_expm1.explain new file mode 100644 index 0000000000000..aa111ee5f4c85 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_expm1.explain @@ -0,0 +1,2 @@ +Project [EXPM1(b#0) AS EXPM1(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_expr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_expr.explain new file mode 100644 index 0000000000000..c317af232f6e6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_expr.explain @@ -0,0 +1,2 @@ +Project [(a#0 + 1) AS (a + 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_factorial.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_factorial.explain new file mode 100644 index 0000000000000..e6b5c7ee90d60 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_factorial.explain @@ -0,0 +1,2 @@ +Project [factorial((a#0 % 10)) AS factorial((a % 10))#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_filter.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_filter.explain new file mode 100644 index 0000000000000..a92b212666c05 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_filter.explain @@ -0,0 +1,2 @@ +Project [filter(e#0, lambdafunction((lambda x#0 > 10), lambda x#0, false)) AS filter(e, lambdafunction((namedlambdavariable() > 10), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_filter_with_pair_input.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_filter_with_pair_input.explain new file mode 100644 index 0000000000000..63ab17bd1e55e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_filter_with_pair_input.explain @@ -0,0 +1,2 @@ +Project [filter(e#0, lambdafunction(((lambda x#0 > 10) AND (lambda 
y#0 > 2)), lambda x#0, lambda y#0, false)) AS filter(e, lambdafunction(((namedlambdavariable() > 10) AND (namedlambdavariable() > 2)), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_first.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_first.explain new file mode 100644 index 0000000000000..0675353b70692 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_first.explain @@ -0,0 +1,2 @@ +Aggregate [first(a#0, true) AS first(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_flatten.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_flatten.explain new file mode 100644 index 0000000000000..ebdb5617a86a4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_flatten.explain @@ -0,0 +1,2 @@ +Project [flatten(array(cast(e#0 as array), sequence(cast(1 as bigint), cast(10 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles)))) AS flatten(array(e, sequence(1, 10, 1)))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_floor.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_floor.explain new file mode 100644 index 0000000000000..67caaf174524e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_floor.explain @@ -0,0 +1,2 @@ +Project [FLOOR(b#0) AS FLOOR(b)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_floor_scale.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_floor_scale.explain new file mode 100644 index 0000000000000..c788eae3ab7cb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_floor_scale.explain @@ -0,0 +1,2 @@ +Project [floor(cast(b#0 as decimal(30,15)), 2) AS floor(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_forall.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_forall.explain new file mode 100644 index 0000000000000..e69389808a457 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_forall.explain @@ -0,0 +1,2 @@ +Project [forall(e#0, lambdafunction((lambda x#0 > 10), lambda x#0, false)) AS forall(e, lambdafunction((namedlambdavariable() > 10), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_format_number.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_format_number.explain new file mode 100644 index 0000000000000..45815bf3610e8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_format_number.explain @@ -0,0 +1,2 @@ +Project [format_number(b#0, 1) AS format_number(b, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff 
--git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain new file mode 100644 index 0000000000000..89e03c8188232 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_csv.explain @@ -0,0 +1,2 @@ +Project [from_csv(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), (mode,FAILFAST), g#0, Some(America/Los_Angeles), None) AS from_csv(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain new file mode 100644 index 0000000000000..1219f11d4696e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_json.explain @@ -0,0 +1,2 @@ +Project [from_json(StructField(id,LongType,true), StructField(a,IntegerType,true), StructField(b,DoubleType,true), g#0, Some(America/Los_Angeles)) AS from_json(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_unixtime.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_unixtime.explain new file mode 100644 index 0000000000000..a1c43b4342fe3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_unixtime.explain @@ -0,0 +1,2 @@ +Project [from_unixtime(1, yyyy-MM-dd HH:mm:ss, Some(America/Los_Angeles)) AS from_unixtime(1, yyyy-MM-dd HH:mm:ss)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_utc_timestamp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_utc_timestamp.explain new file mode 100644 index 0000000000000..37c62a1497839 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_from_utc_timestamp.explain @@ -0,0 +1,2 @@ +Project [from_utc_timestamp(t#0, -08:00) AS from_utc_timestamp(t, -08:00)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_get.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_get.explain new file mode 100644 index 0000000000000..5f3ef82b996a8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_get.explain @@ -0,0 +1,2 @@ +Project [e#0[2] AS get(e, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_get_json_object.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_get_json_object.explain new file mode 100644 index 0000000000000..cfc3e05cd0a21 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_get_json_object.explain @@ -0,0 +1,2 @@ +Project [get_json_object(g#0, $.device_type) AS get_json_object(g, $.device_type)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_greatest.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_greatest.explain new file mode 100644 index 0000000000000..6347277cc590f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_greatest.explain @@ -0,0 +1,2 @@ +Project [greatest(a#0, d#0.a) AS greatest(a, d.a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hash.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hash.explain new file mode 100644 index 0000000000000..b823139c599f2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hash.explain @@ -0,0 +1,2 @@ +Project [hash(b#0, id#0L, 42) AS hash(b, id)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hex.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hex.explain new file mode 100644 index 0000000000000..0c01682b26ec2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hex.explain @@ -0,0 +1,2 @@ +Project [hex(cast(a#0 as bigint)) AS hex(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hour.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hour.explain new file mode 100644 index 0000000000000..64ee2c8358fa6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hour.explain @@ -0,0 +1,2 @@ +Project [hour(t#0, Some(America/Los_Angeles)) AS hour(t)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain new file mode 100644 index 0000000000000..a019836233d2f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hours.explain @@ -0,0 +1,2 @@ +Project [hours(a#0) AS hours(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_hypot.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hypot.explain new file mode 100644 index 0000000000000..524aa4388cf2b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_hypot.explain @@ -0,0 +1,2 @@ +Project [HYPOT(cast(a#0 as double), b#0) AS HYPOT(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_initcap.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_initcap.explain new file mode 100644 index 0000000000000..b4e98561d904a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_initcap.explain @@ -0,0 +1,2 @@ +Project [initcap(g#0) AS initcap(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_inline.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_inline.explain new file mode 100644 index 0000000000000..8b4c66ac60722 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_inline.explain @@ -0,0 +1,3 @@ +Project [id#0L, a#0, b#0] ++- Generate inline(map_values(f#0)), false, [id#0L, a#0, b#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_inline_outer.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_inline_outer.explain new file mode 100644 index 0000000000000..a94c28d0f2b2e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_inline_outer.explain @@ -0,0 +1,3 @@ +Project [id#0L, a#0, b#0] ++- Generate inline(map_values(f#0)), true, [id#0L, a#0, b#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_input_file_name.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_input_file_name.explain new file mode 100644 index 0000000000000..7f411a857af05 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_input_file_name.explain @@ -0,0 +1,2 @@ +Project [input_file_name() AS input_file_name()#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_isnan.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_isnan.explain new file mode 100644 index 0000000000000..a93e063e4e136 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_isnan.explain @@ -0,0 +1,2 @@ +Project [isnan(b#0) AS isnan(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_isnull.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_isnull.explain new file mode 100644 index 0000000000000..a69a7d0280744 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_isnull.explain @@ -0,0 +1,2 @@ +Project [isnull(a#0) AS (a IS NULL)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_json_tuple.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_json_tuple.explain new file mode 100644 index 0000000000000..5530a36a60bde --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_json_tuple.explain @@ -0,0 +1,3 @@ +Project [c0#0, c1#0, c2#0] ++- Generate json_tuple(g#0, a, b, id), false, [c0#0, c1#0, c2#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_kurtosis.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_kurtosis.explain new file mode 100644 index 0000000000000..9af9e55d58e2b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_kurtosis.explain @@ 
-0,0 +1,2 @@ +Aggregate [kurtosis(cast(a#0 as double)) AS kurtosis(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lag.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lag.explain new file mode 100644 index 0000000000000..6d9d4e706ecb7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lag.explain @@ -0,0 +1,5 @@ +Project [lag(g, 1, NULL) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN -1 FOLLOWING AND -1 FOLLOWING)#0] ++- Project [g#0, a#0, id#0L, lag(g, 1, NULL) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN -1 FOLLOWING AND -1 FOLLOWING)#0, lag(g, 1, NULL) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN -1 FOLLOWING AND -1 FOLLOWING)#0] + +- Window [lag(g#0, -1, null) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, -1, -1)) AS lag(g, 1, NULL) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN -1 FOLLOWING AND -1 FOLLOWING)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [g#0, a#0, id#0L] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_last.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_last.explain new file mode 100644 index 0000000000000..a7ae558d5c9c7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_last.explain @@ -0,0 +1,2 @@ +Aggregate [last(a#0, false) AS last(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_last_day.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_last_day.explain new file mode 100644 index 0000000000000..e6af285d1b871 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_last_day.explain @@ -0,0 +1,2 @@ +Project [last_day(cast(t#0 as date)) AS last_day(t)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lead.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lead.explain new file mode 100644 index 0000000000000..6c8ce180b79b5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lead.explain @@ -0,0 +1,5 @@ +Project [lead(g, 2, dv) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 2 FOLLOWING)#0] ++- Project [g#0, a#0, id#0L, lead(g, 2, dv) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 2 FOLLOWING)#0, lead(g, 2, dv) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 2 FOLLOWING)#0] + +- Window [lead(g#0, 2, dv) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, 2, 2)) AS lead(g, 2, dv) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 2 FOLLOWING)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [g#0, a#0, id#0L] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_least.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_least.explain new file mode 100644 index 0000000000000..afc3f6ca52c37 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_least.explain @@ -0,0 +1,2 @@ +Project [least(a#0, d#0.a) AS least(a, d.a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_length.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_length.explain new file mode 100644 index 0000000000000..16c6d438c6bd1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_length.explain @@ -0,0 +1,2 @@ +Project [length(g#0) AS length(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein.explain new file mode 100644 index 0000000000000..d7cc18d5129f8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_levenshtein.explain @@ -0,0 +1,2 @@ +Project [levenshtein(g#0, bob) AS levenshtein(g, bob)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain new file mode 100644 index 0000000000000..7f093f9df13ac --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lit.explain @@ -0,0 +1,2 @@ +Project [id#0L, id#0L, true AS true#0, 68 AS 68#0, 9872 AS 9872#0, -8726532 AS -8726532#0, 7834609328726532 AS 7834609328726532#0L, 2.718281828459045 AS 2.718281828459045#0, -0.8 AS -0.8#0, 89.97620 AS 89.97620#0, 89889.7667231 AS 89889.7667231#0, connect! AS connect!#0, T AS T#0, ABCDEFGHIJ AS ABCDEFGHIJ#0, 0x78797A7B7C7D7E7F808182838485868788898A8B8C8D8E AS X'78797A7B7C7D7E7F808182838485868788898A8B8C8D8E'#0, 0x0806 AS X'0806'#0, null AS NULL#0, 2020-10-10 AS DATE '2020-10-10'#0, 8.997620 AS 8.997620#0, 2023-02-23 04:31:59.808 AS TIMESTAMP '2023-02-23 04:31:59.808'#0, 1969-12-31 16:00:12.345 AS TIMESTAMP '1969-12-31 16:00:12.345'#0, 2023-02-23 20:36:00 AS TIMESTAMP_NTZ '2023-02-23 20:36:00'#0, 2023-02-23 AS DATE '2023-02-23'#0, INTERVAL '0 00:03:20' DAY TO SECOND AS INTERVAL '0 00:03:20' DAY TO SECOND#0, ... 
2 more fields] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lit_array.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lit_array.explain new file mode 100644 index 0000000000000..74d512b6910c8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lit_array.explain @@ -0,0 +1,2 @@ +Project [[] AS ARRAY()#0, [[1],[2],[3]] AS ARRAY(ARRAY(1), ARRAY(2), ARRAY(3))#0, [[[1]],[[2]],[[3]]] AS ARRAY(ARRAY(ARRAY(1)), ARRAY(ARRAY(2)), ARRAY(ARRAY(3)))#0, [true,false] AS ARRAY(true, false)#0, 0x434445 AS X'434445'#0, [9872,9873,9874] AS ARRAY(9872S, 9873S, 9874S)#0, [-8726532,8726532,-8726533] AS ARRAY(-8726532, 8726532, -8726533)#0, [7834609328726531,7834609328726532,7834609328726533] AS ARRAY(7834609328726531L, 7834609328726532L, 7834609328726533L)#0, [2.718281828459045,1.0,2.0] AS ARRAY(2.718281828459045D, 1.0D, 2.0D)#0, [-0.8,-0.7,-0.9] AS ARRAY(CAST('-0.8' AS FLOAT), CAST('-0.7' AS FLOAT), CAST('-0.9' AS FLOAT))#0, [89.97620,89.97621] AS ARRAY(89.97620BD, 89.97621BD)#0, [89889.7667231,89889.7667231] AS ARRAY(89889.7667231BD, 89889.7667231BD)#0, [connect!,disconnect!] AS ARRAY('connect!', 'disconnect!')#0, TF AS TF#0, [ABCDEFGHIJ,BCDEFGHIJK] AS ARRAY('ABCDEFGHIJ', 'BCDEFGHIJK')#0, [18545,18546] AS ARRAY(DATE '2020-10-10', DATE '2020-10-11')#0, [1677155519808000,1677155519809000] AS ARRAY(TIMESTAMP '2023-02-23 04:31:59.808', TIMESTAMP '2023-02-23 04:31:59.809')#0, [12345000,23456000] AS ARRAY(TIMESTAMP '1969-12-31 16:00:12.345', TIMESTAMP '1969-12-31 16:00:23.456')#0, [1677184560000000,1677188160000000] AS ARRAY(TIMESTAMP_NTZ '2023-02-23 20:36:00', TIMESTAMP_NTZ '2023-02-23 21:36:00')#0, [19411,19417] AS ARRAY(DATE '2023-02-23', DATE '2023-03-01')#0, [100000000,200000000] AS ARRAY(INTERVAL '0 00:01:40' DAY TO SECOND, INTERVAL '0 00:03:20' DAY TO SECOND)#0, [0,0] AS ARRAY(INTERVAL '0-0' YEAR TO MONTH, INTERVAL '0-0' YEAR TO MONTH)#0, [2 months 20 days 0.0001 seconds,2 months 21 days 0.0002 seconds] AS ARRAY(INTERVAL '2 months 20 days 0.0001 seconds', INTERVAL '2 months 21 days 0.0002 seconds')#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_localtimestamp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_localtimestamp.explain new file mode 100644 index 0000000000000..84b96ef8d9a1c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_localtimestamp.explain @@ -0,0 +1,2 @@ +Project [localtimestamp(Some(America/Los_Angeles)) AS localtimestamp()#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_locate.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_locate.explain new file mode 100644 index 0000000000000..350e8638995e1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_locate.explain @@ -0,0 +1,2 @@ +Project [locate(jar, g#0, 1) AS locate(jar, g, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_locate_with_pos.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_locate_with_pos.explain new file mode 100644 index 
0000000000000..e8de0961aff4d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_locate_with_pos.explain @@ -0,0 +1,2 @@ +Project [locate(jar, g#0, 10) AS locate(jar, g, 10)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_log.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log.explain new file mode 100644 index 0000000000000..d3c3743b1ef40 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log.explain @@ -0,0 +1,2 @@ +Project [LOG(E(), b#0) AS LOG(E(), b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_log10.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log10.explain new file mode 100644 index 0000000000000..ed6ce6bf132b6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log10.explain @@ -0,0 +1,2 @@ +Project [LOG10(b#0) AS LOG10(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_log1p.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log1p.explain new file mode 100644 index 0000000000000..dc98083c1dc8a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log1p.explain @@ -0,0 +1,2 @@ +Project [LOG1P(cast(a#0 as double)) AS LOG1P(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_log2.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log2.explain new file mode 100644 index 0000000000000..dbf43897ba538 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log2.explain @@ -0,0 +1,2 @@ +Project [LOG2(cast(a#0 as double)) AS LOG2(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_log_with_base.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log_with_base.explain new file mode 100644 index 0000000000000..662845c915988 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_log_with_base.explain @@ -0,0 +1,2 @@ +Project [LOG(2.0, b#0) AS LOG(2.0, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lower.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lower.explain new file mode 100644 index 0000000000000..d905689c35dd4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lower.explain @@ -0,0 +1,2 @@ +Project [lower(g#0) AS lower(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad.explain new file mode 100644 index 0000000000000..a8af1ecd1330a --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad.explain @@ -0,0 +1,2 @@ +Project [lpad(g#0, 10, -) AS lpad(g, 10, -)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain new file mode 100644 index 0000000000000..4efc5a3709b6f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_lpad_binary.explain @@ -0,0 +1,2 @@ +Project [staticinvoke(class org.apache.spark.unsafe.types.ByteArray, BinaryType, lpad, bytes#0, 5, 0x0C0A0F0E, BinaryType, IntegerType, BinaryType, true, false, true) AS lpad(bytes, 5, X'0C0A0F0E')#0] ++- LocalRelation , [id#0L, bytes#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_ltrim.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ltrim.explain new file mode 100644 index 0000000000000..754adc1e9a8cf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ltrim.explain @@ -0,0 +1,2 @@ +Project [ltrim(g#0, None) AS ltrim(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_ltrim_with_pattern.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ltrim_with_pattern.explain new file mode 100644 index 0000000000000..68c20c2b8660c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ltrim_with_pattern.explain @@ -0,0 +1,2 @@ +Project [ltrim(xxx, Some(g#0)) AS TRIM(LEADING g FROM xxx)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_make_date.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_make_date.explain new file mode 100644 index 0000000000000..7910de7f9df02 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_make_date.explain @@ -0,0 +1,2 @@ +Project [make_date(2018, 5, 14, false) AS make_date(2018, 5, 14)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map.explain new file mode 100644 index 0000000000000..67b9bdb45b5de --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map.explain @@ -0,0 +1,2 @@ +Project [map(a#0, g#0, 22, dummy) AS map(a, g, 22, dummy)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_concat.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_concat.explain new file mode 100644 index 0000000000000..fb0e86e348568 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_concat.explain @@ -0,0 +1,2 @@ +Project [map_concat(f#0, map(foo, struct(id, 12, a, 68, b, 2.718281828459045))) AS map_concat(f, map(foo, struct(12 AS id, 68 AS a, 2.718281828459045 AS b)))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, 
f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_contains_key.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_contains_key.explain new file mode 100644 index 0000000000000..a2bc19114f405 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_contains_key.explain @@ -0,0 +1,2 @@ +Project [array_contains(map_keys(f#0), xyz) AS map_contains_key(f, xyz)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_entries.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_entries.explain new file mode 100644 index 0000000000000..2d9d550396c6a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_entries.explain @@ -0,0 +1,2 @@ +Project [map_entries(f#0) AS map_entries(f)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_filter.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_filter.explain new file mode 100644 index 0000000000000..4e2502d0c988d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_filter.explain @@ -0,0 +1,2 @@ +Project [map_filter(f#0, lambdafunction(Contains(lambda x#0, baz), lambda x#0, lambda y#0, false)) AS map_filter(f, lambdafunction(contains(namedlambdavariable(), baz), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_from_arrays.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_from_arrays.explain new file mode 100644 index 0000000000000..08be7f83ce3f5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_from_arrays.explain @@ -0,0 +1,2 @@ +Project [map_from_arrays(array(1, 2), array(one, two)) AS map_from_arrays(array(1, 2), array(one, two))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_from_entries.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_from_entries.explain new file mode 100644 index 0000000000000..737900bef096d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_from_entries.explain @@ -0,0 +1,2 @@ +Project [map_from_entries(transform(e#0, lambdafunction(struct(y, lambda y#0, x, lambda x#0), lambda x#0, lambda y#0, false))) AS map_from_entries(transform(e, lambdafunction(struct(namedlambdavariable(), namedlambdavariable()), namedlambdavariable(), namedlambdavariable())))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_keys.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_keys.explain new file mode 100644 index 0000000000000..85599d159549a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_keys.explain @@ -0,0 +1,2 @@ +Project [map_keys(f#0) AS 
map_keys(f)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_values.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_values.explain new file mode 100644 index 0000000000000..0f93262af1e77 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_values.explain @@ -0,0 +1,2 @@ +Project [map_values(f#0) AS map_values(f)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_zip_with.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_zip_with.explain new file mode 100644 index 0000000000000..2c053fa655853 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_map_zip_with.explain @@ -0,0 +1,2 @@ +Project [map_zip_with(f#0, f#0, lambdafunction((lambda y#0.id + lambda z#0.id), lambda x#0, lambda y#0, lambda z#0, false)) AS map_zip_with(f, f, lambdafunction((namedlambdavariable().id + namedlambdavariable().id), namedlambdavariable(), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_max.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_max.explain new file mode 100644 index 0000000000000..06adc24970f60 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_max.explain @@ -0,0 +1,2 @@ +Aggregate [max(id#0L) AS max(id)#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_max_by.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_max_by.explain new file mode 100644 index 0000000000000..3f5c38e707a67 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_max_by.explain @@ -0,0 +1,2 @@ +Aggregate [max_by(a#0, b#0) AS max_by(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_md5.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_md5.explain new file mode 100644 index 0000000000000..7bbc84785e5e8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_md5.explain @@ -0,0 +1,2 @@ +Project [md5(cast(g#0 as binary)) AS md5(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_median.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_median.explain new file mode 100644 index 0000000000000..8e9cbe3295ed0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_median.explain @@ -0,0 +1,2 @@ +Aggregate [percentile(a#0, 0.5, 1, 0, 0, false) AS median(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_min.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_min.explain new file mode 100644 index 
0000000000000..9008ca2719c82 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_min.explain @@ -0,0 +1,2 @@ +Aggregate [min(a#0) AS min(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_min_by.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_min_by.explain new file mode 100644 index 0000000000000..f737fa2ac2ac2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_min_by.explain @@ -0,0 +1,2 @@ +Aggregate [min_by(a#0, b#0) AS min_by(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_minute.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_minute.explain new file mode 100644 index 0000000000000..b51e985267fd3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_minute.explain @@ -0,0 +1,2 @@ +Project [minute(t#0, Some(America/Los_Angeles)) AS minute(t)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_mode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_mode.explain new file mode 100644 index 0000000000000..dfa2113a2c365 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_mode.explain @@ -0,0 +1,2 @@ +Aggregate [mode(a#0, 0, 0) AS mode(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_monotonically_increasing_id.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_monotonically_increasing_id.explain new file mode 100644 index 0000000000000..22143169d0597 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_monotonically_increasing_id.explain @@ -0,0 +1,2 @@ +Project [monotonically_increasing_id() AS monotonically_increasing_id()#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_month.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_month.explain new file mode 100644 index 0000000000000..3322feba158e1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_month.explain @@ -0,0 +1,2 @@ +Project [month(d#0) AS month(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain new file mode 100644 index 0000000000000..17b991ec1aa16 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months.explain @@ -0,0 +1,2 @@ +Project [months(a#0) AS months(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_months_between.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months_between.explain new file mode 100644 
index 0000000000000..4ae8fbf842a78 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months_between.explain @@ -0,0 +1,2 @@ +Project [months_between(cast(current_date(Some(America/Los_Angeles)) as timestamp), cast(d#0 as timestamp), true, Some(America/Los_Angeles)) AS months_between(current_date(), d, true)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_months_between_with_roundoff.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months_between_with_roundoff.explain new file mode 100644 index 0000000000000..4ae8fbf842a78 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_months_between_with_roundoff.explain @@ -0,0 +1,2 @@ +Project [months_between(cast(current_date(Some(America/Los_Angeles)) as timestamp), cast(d#0 as timestamp), true, Some(America/Los_Angeles)) AS months_between(current_date(), d, true)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_nanvl.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_nanvl.explain new file mode 100644 index 0000000000000..a84a06fbeaf9f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_nanvl.explain @@ -0,0 +1,2 @@ +Project [nanvl(NaN, cast(a#0 as double)) AS nanvl(NaN, a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_negate.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_negate.explain new file mode 100644 index 0000000000000..4f047e75f06ad --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_negate.explain @@ -0,0 +1,2 @@ +Project [-a#0 AS negative(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_next_day.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_next_day.explain new file mode 100644 index 0000000000000..becd1501fa7da --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_next_day.explain @@ -0,0 +1,2 @@ +Project [next_day(d#0, Mon, false) AS next_day(d, Mon)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_nth_value.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_nth_value.explain new file mode 100644 index 0000000000000..69eb7872d528e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_nth_value.explain @@ -0,0 +1,5 @@ +Project [nth_value(g, 3) ignore nulls OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [g#0, a#0, id#0L, nth_value(g, 3) ignore nulls OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0, nth_value(g, 3) ignore nulls OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [nth_value(g#0, 3, true) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, 
specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS nth_value(g, 3) ignore nulls OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [g#0, a#0, id#0L] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_ntile.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ntile.explain new file mode 100644 index 0000000000000..349ac7bbe8b0c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_ntile.explain @@ -0,0 +1,5 @@ +Project [ntile(4) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [a#0, id#0L, ntile(4) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0, ntile(4) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [ntile(4) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS ntile(4) OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [a#0, id#0L] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_octet_length.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_octet_length.explain new file mode 100644 index 0000000000000..b557cb5654d47 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_octet_length.explain @@ -0,0 +1,2 @@ +Project [octet_length(g#0) AS octet_length(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_overlay.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_overlay.explain new file mode 100644 index 0000000000000..71af9a3c5cda5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_overlay.explain @@ -0,0 +1,2 @@ +Project [overlay(cast(b#0 as string), foo, 4, -1) AS overlay(b, foo, 4, -1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_overlay_with_len.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_overlay_with_len.explain new file mode 100644 index 0000000000000..ab1f9323ed2e1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_overlay_with_len.explain @@ -0,0 +1,2 @@ +Project [overlay(cast(b#0 as string), foo, 4, cast(3 as int)) AS overlay(b, foo, 4, 3)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_percent_rank.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_percent_rank.explain new file mode 100644 index 0000000000000..012931bd2aad8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_percent_rank.explain @@ -0,0 +1,5 @@ +Project [PERCENT_RANK() OVER (PARTITION BY a 
ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [id#0L, a#0, PERCENT_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0, PERCENT_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [percent_rank(id#0L) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS PERCENT_RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [id#0L, a#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_percentile_approx.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_percentile_approx.explain new file mode 100644 index 0000000000000..879accaa1b63a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_percentile_approx.explain @@ -0,0 +1,2 @@ +Aggregate [percentile_approx(a#0, 0.3, 20, 0, 0) AS percentile_approx(a, 0.3, 20)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_pmod.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_pmod.explain new file mode 100644 index 0000000000000..3db50c9486f5d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_pmod.explain @@ -0,0 +1,2 @@ +Project [pmod(a#0, 10) AS pmod(a, 10)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_posexplode.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_posexplode.explain new file mode 100644 index 0000000000000..39d0fd49866b9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_posexplode.explain @@ -0,0 +1,3 @@ +Project [pos#0, col#0] ++- Generate posexplode(e#0), false, [pos#0, col#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_posexplode_outer.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_posexplode_outer.explain new file mode 100644 index 0000000000000..c7023ef10b52d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_posexplode_outer.explain @@ -0,0 +1,3 @@ +Project [pos#0, col#0] ++- Generate posexplode(e#0), true, [pos#0, col#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_pow.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_pow.explain new file mode 100644 index 0000000000000..c6c6c0603e3e0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_pow.explain @@ -0,0 +1,2 @@ +Project [POWER(cast(a#0 as double), b#0) AS POWER(a, b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_product.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_product.explain new file mode 100644 index 0000000000000..e3ebed6a92b1e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_product.explain @@ -0,0 +1,2 @@ +Aggregate [product(cast(a#0 as double)) AS product(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_quarter.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_quarter.explain new file mode 100644 index 0000000000000..17840beaef65e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_quarter.explain @@ -0,0 +1,2 @@ +Project [quarter(d#0) AS quarter(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_radians.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_radians.explain new file mode 100644 index 0000000000000..012ffc6737de2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_radians.explain @@ -0,0 +1,2 @@ +Project [RADIANS(b#0) AS RADIANS(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_raise_error.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_raise_error.explain new file mode 100644 index 0000000000000..c65063a35a1ab --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_raise_error.explain @@ -0,0 +1,2 @@ +Project [raise_error(kaboom, NullType) AS raise_error(kaboom)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rand_with_seed.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rand_with_seed.explain new file mode 100644 index 0000000000000..0d12a53fb990d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rand_with_seed.explain @@ -0,0 +1,2 @@ +Project [rand(133) AS rand(133)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_randn_with_seed.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_randn_with_seed.explain new file mode 100644 index 0000000000000..b976c0e35400d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_randn_with_seed.explain @@ -0,0 +1,2 @@ +Project [randn(133) AS randn(133)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rank.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rank.explain new file mode 100644 index 0000000000000..b8d4b5ee75650 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rank.explain @@ -0,0 +1,5 @@ +Project [RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [id#0L, a#0, RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN 
UNBOUNDED PRECEDING AND CURRENT ROW)#0, RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [rank(id#0L) windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS RANK() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [id#0L, a#0] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_regexp_extract.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_regexp_extract.explain new file mode 100644 index 0000000000000..d7e08710c8f50 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_regexp_extract.explain @@ -0,0 +1,2 @@ +Project [regexp_extract(g#0, (\d+)-(\d+), 1) AS regexp_extract(g, (\d+)-(\d+), 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_regexp_replace.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_regexp_replace.explain new file mode 100644 index 0000000000000..0a10b3bd0220c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_regexp_replace.explain @@ -0,0 +1,2 @@ +Project [regexp_replace(g#0, (\d+), XXX, 1) AS regexp_replace(g, (\d+), XXX, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_reverse.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_reverse.explain new file mode 100644 index 0000000000000..c659426e030e0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_reverse.explain @@ -0,0 +1,2 @@ +Project [reverse(e#0) AS reverse(e)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rint.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rint.explain new file mode 100644 index 0000000000000..2231e53941cf7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rint.explain @@ -0,0 +1,2 @@ +Project [rint(b#0) AS rint(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_round.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_round.explain new file mode 100644 index 0000000000000..8bc86462fa24c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_round.explain @@ -0,0 +1,2 @@ +Project [round(b#0, 2) AS round(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_row_number.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_row_number.explain new file mode 100644 index 0000000000000..d0c817f889478 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_row_number.explain @@ -0,0 +1,5 @@ +Project [row_number() OVER 
(PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] ++- Project [a#0, id#0L, row_number() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0, row_number() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0] + +- Window [row_number() windowspecdefinition(a#0, id#0L ASC NULLS FIRST, specifiedwindowframe(RowFrame, unboundedpreceding$(), currentrow$())) AS row_number() OVER (PARTITION BY a ORDER BY id ASC NULLS FIRST ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0], [a#0], [id#0L ASC NULLS FIRST] + +- Project [a#0, id#0L] + +- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad.explain new file mode 100644 index 0000000000000..05f59216dddf6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad.explain @@ -0,0 +1,2 @@ +Project [rpad(g#0, 10, -) AS rpad(g, 10, -)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain new file mode 100644 index 0000000000000..10d77eef1cb65 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rpad_binary.explain @@ -0,0 +1,2 @@ +Project [staticinvoke(class org.apache.spark.unsafe.types.ByteArray, BinaryType, rpad, bytes#0, 5, 0x0B0A0B0E, BinaryType, IntegerType, BinaryType, true, false, true) AS rpad(bytes, 5, X'0B0A0B0E')#0] ++- LocalRelation , [id#0L, bytes#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rtrim.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rtrim.explain new file mode 100644 index 0000000000000..c6b0debbfe59a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rtrim.explain @@ -0,0 +1,2 @@ +Project [rtrim(g#0, None) AS rtrim(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_rtrim_with_pattern.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rtrim_with_pattern.explain new file mode 100644 index 0000000000000..ea262f52de416 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_rtrim_with_pattern.explain @@ -0,0 +1,2 @@ +Project [rtrim(yyy, Some(g#0)) AS TRIM(TRAILING g FROM yyy)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_csv.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_csv.explain new file mode 100644 index 0000000000000..ecd181a4292de --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_csv.explain @@ -0,0 +1,2 @@ +Project [schema_of_csv(1|abc, (sep,|)) AS schema_of_csv(1|abc)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain new file mode 100644 index 0000000000000..8ec799bc58084 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json.explain @@ -0,0 +1,2 @@ +Project [schema_of_json([{"col":01}]) AS schema_of_json([{"col":01}])#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain new file mode 100644 index 0000000000000..13867949177a4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_schema_of_json_with_options.explain @@ -0,0 +1,2 @@ +Project [schema_of_json([{"col":01}], (allowNumericLeadingZeros,true)) AS schema_of_json([{"col":01}])#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sec.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sec.explain new file mode 100644 index 0000000000000..f18fb62333be4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sec.explain @@ -0,0 +1,2 @@ +Project [SEC(b#0) AS SEC(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_second.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_second.explain new file mode 100644 index 0000000000000..b35e4433e9f31 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_second.explain @@ -0,0 +1,2 @@ +Project [second(t#0, Some(America/Los_Angeles)) AS second(t)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain new file mode 100644 index 0000000000000..5c88a1f7b3abd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sentences.explain @@ -0,0 +1,2 @@ +Project [sentences(g#0, , ) AS sentences(g, , )#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain new file mode 100644 index 0000000000000..7819f9b542340 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sentences_with_locale.explain @@ -0,0 +1,2 @@ +Project [sentences(g#0, en, US) AS sentences(g, en, US)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sequence.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sequence.explain new file mode 100644 index 0000000000000..2a71190c269c7 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sequence.explain @@ -0,0 +1,2 @@ +Project [sequence(cast(1 as bigint), cast(10 as bigint), Some(cast(1 as bigint)), Some(America/Los_Angeles)) AS sequence(1, 10, 1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_session_window.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_session_window.explain new file mode 100644 index 0000000000000..ab69691a8dd32 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_session_window.explain @@ -0,0 +1,4 @@ +Project [session_window#0 AS session_window#0] ++- Filter isnotnull(t#0) + +- Project [named_struct(start, precisetimestampconversion(precisetimestampconversion(t#0, TimestampType, LongType), LongType, TimestampType), end, knownnullable(precisetimestampconversion(precisetimestampconversion(cast(t#0 + cast(10 minutes as interval) as timestamp), TimestampType, LongType), LongType, TimestampType))) AS session_window#0, d#0, t#0, s#0, x#0L, wt#0] + +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sha1.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sha1.explain new file mode 100644 index 0000000000000..55077f061d720 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sha1.explain @@ -0,0 +1,2 @@ +Project [sha1(cast(g#0 as binary)) AS sha1(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sha2.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sha2.explain new file mode 100644 index 0000000000000..8ed2705cb17cb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sha2.explain @@ -0,0 +1,2 @@ +Project [sha2(cast(g#0 as binary), 512) AS sha2(g, 512)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftleft.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftleft.explain new file mode 100644 index 0000000000000..f89a8be7ceedb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftleft.explain @@ -0,0 +1,2 @@ +Project [shiftleft(cast(b#0 as int), 2) AS shiftleft(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftright.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftright.explain new file mode 100644 index 0000000000000..b436f52e912b5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftright.explain @@ -0,0 +1,2 @@ +Project [shiftright(cast(b#0 as int), 2) AS shiftright(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftrightunsigned.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftrightunsigned.explain new file mode 100644 index 
0000000000000..282ad156b3825 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_shiftrightunsigned.explain @@ -0,0 +1,2 @@ +Project [shiftrightunsigned(cast(b#0 as int), 2) AS shiftrightunsigned(b, 2)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_signum.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_signum.explain new file mode 100644 index 0000000000000..807fa3300836c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_signum.explain @@ -0,0 +1,2 @@ +Project [SIGNUM(b#0) AS SIGNUM(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sin.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sin.explain new file mode 100644 index 0000000000000..7e4f0af50cd0a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sin.explain @@ -0,0 +1,2 @@ +Project [SIN(b#0) AS SIN(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sinh.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sinh.explain new file mode 100644 index 0000000000000..7feea4573306d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sinh.explain @@ -0,0 +1,2 @@ +Project [SINH(b#0) AS SINH(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_size.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_size.explain new file mode 100644 index 0000000000000..05ae4511bf83a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_size.explain @@ -0,0 +1,2 @@ +Project [size(f#0, true) AS size(f)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_skewness.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_skewness.explain new file mode 100644 index 0000000000000..bac5abec39595 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_skewness.explain @@ -0,0 +1,2 @@ +Aggregate [skewness(cast(a#0 as double)) AS skewness(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_slice.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_slice.explain new file mode 100644 index 0000000000000..96734d3b1f44d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_slice.explain @@ -0,0 +1,2 @@ +Project [slice(e#0, 0, 5) AS slice(e, 0, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sort_array.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sort_array.explain new file mode 100644 index 0000000000000..b9ab76a6d0302 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sort_array.explain @@ -0,0 +1,2 @@ +Project [sort_array(e#0, true) AS sort_array(e, true)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_spark_partition_id.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_spark_partition_id.explain new file mode 100644 index 0000000000000..3afea21244bbf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_spark_partition_id.explain @@ -0,0 +1,2 @@ +Project [SPARK_PARTITION_ID() AS SPARK_PARTITION_ID()#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_split.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split.explain new file mode 100644 index 0000000000000..a7b642f1efa9f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split.explain @@ -0,0 +1,2 @@ +Project [split(g#0, ;, -1) AS split(g, ;, -1)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_with_limit.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_with_limit.explain new file mode 100644 index 0000000000000..a4c4d1a5e8b50 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_split_with_limit.explain @@ -0,0 +1,2 @@ +Project [split(g#0, ;, 10) AS split(g, ;, 10)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sqrt.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sqrt.explain new file mode 100644 index 0000000000000..2eaea5dd3c87d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sqrt.explain @@ -0,0 +1,2 @@ +Project [SQRT(b#0) AS SQRT(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev.explain new file mode 100644 index 0000000000000..106191e5a32ec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev.explain @@ -0,0 +1,2 @@ +Aggregate [stddev(cast(a#0 as double)) AS stddev(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev_pop.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev_pop.explain new file mode 100644 index 0000000000000..239e6e9b90fc2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev_pop.explain @@ -0,0 +1,2 @@ +Aggregate [stddev_pop(cast(a#0 as double)) AS stddev_pop(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev_samp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev_samp.explain new 
file mode 100644 index 0000000000000..2eef377ff7f19 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_stddev_samp.explain @@ -0,0 +1,2 @@ +Aggregate [stddev_samp(cast(a#0 as double)) AS stddev_samp(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_struct.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_struct.explain new file mode 100644 index 0000000000000..35720e40af22a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_struct.explain @@ -0,0 +1,2 @@ +Project [struct(a, a#0, d, d#0) AS struct(a, d)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring.explain new file mode 100644 index 0000000000000..fe07244fc9cec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring.explain @@ -0,0 +1,2 @@ +Project [substring(g#0, 4, 5) AS substring(g, 4, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_index.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_index.explain new file mode 100644 index 0000000000000..81ba6f07b8511 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_substring_index.explain @@ -0,0 +1,2 @@ +Project [substring_index(g#0, ;, 5) AS substring_index(g, ;, 5)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sum.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sum.explain new file mode 100644 index 0000000000000..cade1df0c0e08 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sum.explain @@ -0,0 +1,2 @@ +Aggregate [sum(a#0) AS sum(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_sum_distinct.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sum_distinct.explain new file mode 100644 index 0000000000000..fd97165c2b580 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_sum_distinct.explain @@ -0,0 +1,2 @@ +Aggregate [sum(distinct a#0) AS sum(DISTINCT a)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_tan.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_tan.explain new file mode 100644 index 0000000000000..9dca6e6485f49 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_tan.explain @@ -0,0 +1,2 @@ +Project [TAN(b#0) AS TAN(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_tanh.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_tanh.explain new file mode 100644 index 0000000000000..062b38fdc2933 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_tanh.explain @@ -0,0 +1,2 @@ +Project [TANH(b#0) AS TANH(b)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_seconds.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_seconds.explain new file mode 100644 index 0000000000000..e18706213b8a2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_timestamp_seconds.explain @@ -0,0 +1,2 @@ +Project [timestamp_seconds(x#0L) AS timestamp_seconds(x)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_csv.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_csv.explain new file mode 100644 index 0000000000000..245ccb1dbfff1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_csv.explain @@ -0,0 +1,2 @@ +Project [to_csv((sep,|), d#0, Some(America/Los_Angeles)) AS to_csv(d)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_date.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_date.explain new file mode 100644 index 0000000000000..77d3e0cda1f4b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_date.explain @@ -0,0 +1,2 @@ +Project [cast(s#0 as date) AS to_date(s)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain new file mode 100644 index 0000000000000..3557274e9de8d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_date_with_format.explain @@ -0,0 +1,2 @@ +Project [cast(gettimestamp(s#0, yyyy-MM-dd, TimestampType, Some(America/Los_Angeles), false) as date) AS to_date(s, yyyy-MM-dd)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_json.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_json.explain new file mode 100644 index 0000000000000..cd72b12ee19b6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_json.explain @@ -0,0 +1,2 @@ +Project [to_json((timestampFormat,dd/MM/yyyy), d#0, Some(America/Los_Angeles)) AS to_json(d)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp.explain new file mode 100644 index 0000000000000..bcb235cd13799 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp.explain @@ -0,0 +1,2 @@ +Project [cast(s#0 as timestamp) AS 
to_timestamp(s)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain new file mode 100644 index 0000000000000..54e1c0348a3a9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_timestamp_with_format.explain @@ -0,0 +1,2 @@ +Project [gettimestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, TimestampType, Some(America/Los_Angeles), false) AS to_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_utc_timestamp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_utc_timestamp.explain new file mode 100644 index 0000000000000..3420ad2fdfeea --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_to_utc_timestamp.explain @@ -0,0 +1,2 @@ +Project [to_utc_timestamp(t#0, -04:00) AS to_utc_timestamp(t, -04:00)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform.explain new file mode 100644 index 0000000000000..1eb446551f130 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform.explain @@ -0,0 +1,2 @@ +Project [transform(e#0, lambdafunction((lambda x#0 + 1), lambda x#0, false)) AS transform(e, lambdafunction((namedlambdavariable() + 1), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_keys.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_keys.explain new file mode 100644 index 0000000000000..aae92957bcd0d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_keys.explain @@ -0,0 +1,2 @@ +Project [transform_keys(f#0, lambdafunction(concat(lambda x#0, cast(lambda y#0.id as string)), lambda x#0, lambda y#0, false)) AS transform_keys(f, lambdafunction(concat(namedlambdavariable(), namedlambdavariable().id), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_values.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_values.explain new file mode 100644 index 0000000000000..3837ff0b78f02 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_values.explain @@ -0,0 +1,2 @@ +Project [transform_values(f#0, lambdafunction(update_fields(lambda y#0, WithField(key, lambda x#0)), lambda x#0, lambda y#0, false)) AS transform_values(f, lambdafunction(update_fields(namedlambdavariable(), WithField(namedlambdavariable())), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_with_index.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_with_index.explain new file mode 100644 index 0000000000000..99c7733b1f734 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_transform_with_index.explain @@ -0,0 +1,2 @@ +Project [transform(e#0, lambdafunction((lambda x#0 + lambda y#0), lambda x#0, lambda y#0, false)) AS transform(e, lambdafunction((namedlambdavariable() + namedlambdavariable()), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_translate.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_translate.explain new file mode 100644 index 0000000000000..ebd40501a499c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_translate.explain @@ -0,0 +1,2 @@ +Project [translate(g#0, foo, bar) AS translate(g, foo, bar)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_trim.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_trim.explain new file mode 100644 index 0000000000000..55ee4ce051620 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_trim.explain @@ -0,0 +1,2 @@ +Project [trim(g#0, None) AS trim(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_trim_with_pattern.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_trim_with_pattern.explain new file mode 100644 index 0000000000000..90a5607114fe7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_trim_with_pattern.explain @@ -0,0 +1,2 @@ +Project [trim(---, Some(g#0)) AS TRIM(BOTH g FROM ---)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_trunc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_trunc.explain new file mode 100644 index 0000000000000..3c5cbd11cb5cc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_trunc.explain @@ -0,0 +1,2 @@ +Project [trunc(d#0, mm) AS trunc(d, mm)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_unbase64.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unbase64.explain new file mode 100644 index 0000000000000..ec85dfa262b6c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unbase64.explain @@ -0,0 +1,2 @@ +Project [unbase64(g#0, false) AS unbase64(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_unhex.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unhex.explain new file mode 100644 index 0000000000000..776ba5a0c861a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unhex.explain @@ -0,0 +1,2 @@ +Project 
[unhex(cast(a#0 as string), false) AS unhex(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_unix_timestamp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unix_timestamp.explain new file mode 100644 index 0000000000000..764f3d82d0116 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unix_timestamp.explain @@ -0,0 +1,2 @@ +Project [unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss, Some(America/Los_Angeles), false) AS unix_timestamp(current_timestamp(), yyyy-MM-dd HH:mm:ss)#0L] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_unix_timestamp_with_format.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unix_timestamp_with_format.explain new file mode 100644 index 0000000000000..5ae0af1debe8d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_unix_timestamp_with_format.explain @@ -0,0 +1,2 @@ +Project [unix_timestamp(s#0, yyyy-MM-dd HH:mm:ss.SSSS, Some(America/Los_Angeles), false) AS unix_timestamp(s, yyyy-MM-dd HH:mm:ss.SSSS)#0L] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_upper.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_upper.explain new file mode 100644 index 0000000000000..1a3635164c05a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_upper.explain @@ -0,0 +1,2 @@ +Project [upper(g#0) AS upper(g)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_var_pop.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_var_pop.explain new file mode 100644 index 0000000000000..d20b55fd4d0e8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_var_pop.explain @@ -0,0 +1,2 @@ +Aggregate [var_pop(cast(a#0 as double)) AS var_pop(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_var_samp.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_var_samp.explain new file mode 100644 index 0000000000000..a784b37bf2bb1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_var_samp.explain @@ -0,0 +1,2 @@ +Aggregate [var_samp(cast(a#0 as double)) AS var_samp(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_variance.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_variance.explain new file mode 100644 index 0000000000000..3b8bcea178d36 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_variance.explain @@ -0,0 +1,2 @@ +Aggregate [variance(cast(a#0 as double)) AS variance(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_weekofyear.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/function_weekofyear.explain new file mode 100644 index 0000000000000..813ebe09acc04 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_weekofyear.explain @@ -0,0 +1,2 @@ +Project [weekofyear(d#0) AS weekofyear(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_window.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_window.explain new file mode 100644 index 0000000000000..6adefaa786538 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_window.explain @@ -0,0 +1,4 @@ +Project [window#0 AS window#0] ++- Project [named_struct(start, knownnullable(precisetimestampconversion(((precisetimestampconversion(t#0, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(t#0, TimestampType, LongType) - 0) % 1000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(t#0, TimestampType, LongType) - 0) % 1000000) + 1000000) ELSE ((precisetimestampconversion(t#0, TimestampType, LongType) - 0) % 1000000) END) - 0), LongType, TimestampType)), end, knownnullable(precisetimestampconversion((((precisetimestampconversion(t#0, TimestampType, LongType) - CASE WHEN (((precisetimestampconversion(t#0, TimestampType, LongType) - 0) % 1000000) < cast(0 as bigint)) THEN (((precisetimestampconversion(t#0, TimestampType, LongType) - 0) % 1000000) + 1000000) ELSE ((precisetimestampconversion(t#0, TimestampType, LongType) - 0) % 1000000) END) - 0) + 1000000), LongType, TimestampType))) AS window#0, d#0, t#0, s#0, x#0L, wt#0] + +- Filter isnotnull(t#0) + +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_window_time.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_window_time.explain new file mode 100644 index 0000000000000..469b7c16e0d52 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_window_time.explain @@ -0,0 +1,4 @@ +Project [window_time(wt)#0] ++- Project [precisetimestampconversion((precisetimestampconversion(wt#0.end, TimestampType, LongType) - 1), LongType, TimestampType) AS window_time(wt)#0, d#0, t#0, s#0, x#0L, wt#0] + +- Project [d#0, t#0, s#0, x#0L, wt#0 AS wt#0] + +- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_xxhash64.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xxhash64.explain new file mode 100644 index 0000000000000..f908cae7de1f9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_xxhash64.explain @@ -0,0 +1,2 @@ +Project [xxhash64(id#0L, a#0, d#0, g#0, 42) AS xxhash64(id, a, d, g)#0L] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_year.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_year.explain new file mode 100644 index 0000000000000..fad8e9b6b8448 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_year.explain @@ -0,0 +1,2 @@ +Project [year(d#0) AS year(d)#0] ++- LocalRelation , [d#0, t#0, s#0, x#0L, wt#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain new file mode 100644 index 0000000000000..ee2342c4b021c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_years.explain @@ -0,0 +1,2 @@ +Project [years(a#0) AS years(a)#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/function_zip_with.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/function_zip_with.explain new file mode 100644 index 0000000000000..53c9298360735 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/function_zip_with.explain @@ -0,0 +1,2 @@ +Project [zip_with(e#0, e#0, lambdafunction((lambda x#0 + lambda y#0), lambda x#0, lambda y#0, false)) AS zip_with(e, e, lambdafunction((namedlambdavariable() + namedlambdavariable()), namedlambdavariable(), namedlambdavariable()))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg.explain new file mode 100644 index 0000000000000..acb42c1408c66 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, max(a#0) AS max(a)#0, stddev(b#0) AS stddev(b)#0, stddev(b#0) AS stddev(b)#0, avg(b#0) AS avg(b)#0, avg(b#0) AS avg(b)#0, avg(b#0) AS avg(b)#0, count(1) AS count(1)#0L, count(a#0) AS count(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg_columns.explain new file mode 100644 index 0000000000000..86b919a3919f2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg_columns.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, max(a#0) AS max(a)#0, sum(b#0) AS sum(b)#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg_string.explain new file mode 100644 index 0000000000000..1c2b2f68c64c6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_agg_string.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L, b#0], [id#0L, b#0, max(a#0) AS max(a)#0, count(a#0) AS count(a)#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_avg.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_avg.explain new file mode 100644 index 0000000000000..e7c559a1bf622 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_avg.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, avg(a#0) AS avg(a)#0, avg(b#0) AS avg(b)#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_count.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_count.explain new file mode 100644 index 0000000000000..dd08ec3bd59c9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_count.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, count(1) AS count#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_max.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_max.explain new file mode 100644 index 0000000000000..8f00ba848caa4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_max.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, max(a#0) AS max(a)#0, max(b#0) AS max(b)#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_mean.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_mean.explain new file mode 100644 index 0000000000000..e7c559a1bf622 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_mean.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, avg(a#0) AS avg(a)#0, avg(b#0) AS avg(b)#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_min.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_min.explain new file mode 100644 index 0000000000000..b46adbdc263dd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_min.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, min(a#0) AS min(a)#0, min(b#0) AS min(b)#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_sum.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_sum.explain new file mode 100644 index 0000000000000..5d6b075bbe6b1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/groupby_sum.explain @@ -0,0 +1,2 @@ +Aggregate [id#0L], [id#0L, sum(a#0) AS sum(a)#0L, sum(b#0) AS sum(b)#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/grouping_and_grouping_id.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/grouping_and_grouping_id.explain new file mode 100644 index 0000000000000..3b7d6fb2b7072 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/grouping_and_grouping_id.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, b#0, spark_grouping_id#0L], [a#0, b#0, cast((shiftright(spark_grouping_id#0L, 1) & 1) as tinyint) AS grouping(a)#0, cast((shiftright(spark_grouping_id#0L, 0) & 1) as tinyint) AS grouping(b)#0, spark_grouping_id#0L AS grouping_id(a, b)#0L] ++- Expand [[id#0L, a#0, b#0, a#0, b#0, 0], [id#0L, a#0, b#0, a#0, null, 1], [id#0L, a#0, b#0, null, b#0, 2], [id#0L, a#0, b#0, null, null, 3]], [id#0L, a#0, b#0, a#0, b#0, spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0, b#0 AS b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/hint.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/hint.explain new file mode 100644 index 0000000000000..210be491c38c8 
--- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/hint.explain @@ -0,0 +1,2 @@ +Repartition 100, false ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/intersect.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/intersect.explain new file mode 100644 index 0000000000000..b779995c98915 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/intersect.explain @@ -0,0 +1,3 @@ +'Intersect false +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/intersectAll.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/intersectAll.explain new file mode 100644 index 0000000000000..537b25838f54e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/intersectAll.explain @@ -0,0 +1,3 @@ +'Intersect All true +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_condition.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_condition.explain new file mode 100644 index 0000000000000..31681777a393e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_condition.explain @@ -0,0 +1,5 @@ +'Join LeftAnti, (id#0L = id#0L) +:- SubqueryAlias l +: +- LocalRelation , [id#0L, a#0, b#0] ++- SubqueryAlias r + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_condition.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_condition.explain new file mode 100644 index 0000000000000..c8f41d0927369 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_condition.explain @@ -0,0 +1,5 @@ +'Join Inner, (a#0 = a#0) +:- SubqueryAlias l +: +- LocalRelation , [id#0L, a#0, b#0] ++- SubqueryAlias r + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_no_condition.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_no_condition.explain new file mode 100644 index 0000000000000..1414616da4d59 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_no_condition.explain @@ -0,0 +1,3 @@ +'Join Inner +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_multiple_col_array.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_multiple_col_array.explain new file mode 100644 index 0000000000000..ed29cef333a91 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_multiple_col_array.explain @@ -0,0 +1,4 @@ +'Project [id#0L, a#0, b#0, payload#0] ++- 'Join Inner, ((id#0L = id#0L) AND (a#0 = a#0)) + :- LocalRelation , [id#0L, a#0, b#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_multiple_col_seq.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_multiple_col_seq.explain new file mode 100644 index 0000000000000..ed29cef333a91 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_multiple_col_seq.explain @@ -0,0 +1,4 @@ +'Project [id#0L, a#0, b#0, payload#0] ++- 'Join Inner, ((id#0L = id#0L) AND (a#0 = a#0)) + :- LocalRelation , [id#0L, a#0, b#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_single_col.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_single_col.explain new file mode 100644 index 0000000000000..8667e407dd3a4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_inner_using_single_col.explain @@ -0,0 +1,4 @@ +'Project [id#0L, a#0, b#0, a#0, payload#0] ++- 'Join Inner, (id#0L = id#0L) + :- LocalRelation , [id#0L, a#0, b#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_multiple_col_array.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_multiple_col_array.explain new file mode 100644 index 0000000000000..6586d67708b6b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_multiple_col_array.explain @@ -0,0 +1,4 @@ +'Project [coalesce(id#0L, id#0L) AS id#0L, coalesce(a#0, a#0) AS a#0, b#0, payload#0] ++- 'Join FullOuter, ((id#0L = id#0L) AND (a#0 = a#0)) + :- LocalRelation , [id#0L, a#0, b#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_multiple_col_seq.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_multiple_col_seq.explain new file mode 100644 index 0000000000000..ed47c617337a8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_multiple_col_seq.explain @@ -0,0 +1,4 @@ +'Project [id#0L, a#0, b#0, payload#0] ++- 'Join RightOuter, ((id#0L = id#0L) AND (a#0 = a#0)) + :- LocalRelation , [id#0L, a#0, b#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_single_col.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_single_col.explain new file mode 100644 index 0000000000000..64553477f7589 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/join_using_single_col.explain @@ -0,0 +1,4 @@ +'Project [id#0L, a#0, b#0] ++- 'Join LeftSemi, (id#0L = id#0L) + :- LocalRelation , [id#0L, a#0, b#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/json_from_dataset.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/json_from_dataset.explain new file mode 100644 index 0000000000000..9fbaa9fcede81 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/json_from_dataset.explain @@ -0,0 +1 @@ +LogicalRDD [c1#0, c2#0], false diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/limit.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/limit.explain new file mode 
100644 index 0000000000000..3d445331b3527 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/limit.explain @@ -0,0 +1,3 @@ +GlobalLimit 10 ++- LocalLimit 10 + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/melt_no_values.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/melt_no_values.explain new file mode 100644 index 0000000000000..f61fc30a3a529 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/melt_no_values.explain @@ -0,0 +1,2 @@ +Expand [[id#0L, a#0, b, b#0]], [id#0L, a#0, #0, value#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/melt_values.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/melt_values.explain new file mode 100644 index 0000000000000..b5742d976dee9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/melt_values.explain @@ -0,0 +1,2 @@ +Expand [[a#0, id, id#0L]], [a#0, #0, value#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/offset.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/offset.explain new file mode 100644 index 0000000000000..f1f294242628f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/offset.explain @@ -0,0 +1,2 @@ +Offset 1000 ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/orderBy_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/orderBy_columns.explain new file mode 100644 index 0000000000000..f663d0dabe134 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/orderBy_columns.explain @@ -0,0 +1,2 @@ +Sort [id#0L ASC NULLS FIRST, b#0 ASC NULLS FIRST, a#0 ASC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/orderBy_strings.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/orderBy_strings.explain new file mode 100644 index 0000000000000..dddaffbc84501 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/orderBy_strings.explain @@ -0,0 +1,2 @@ +Sort [b#0 ASC NULLS FIRST, id#0L ASC NULLS FIRST, a#0 ASC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/pivot.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/pivot.explain new file mode 100644 index 0000000000000..b8cd844123773 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/pivot.explain @@ -0,0 +1,4 @@ +Project [id#0L, __pivot_count(b) AS `count(b)`#0[0] AS 1#0L, __pivot_count(b) AS `count(b)`#0[1] AS 2#0L, __pivot_count(b) AS `count(b)`#0[2] AS 3#0L] ++- Aggregate [id#0L], [id#0L, pivotfirst(a#0, count(b)#0L, 1, 2, 3, 0, 0) AS __pivot_count(b) AS `count(b)`#0] + +- Aggregate [id#0L, a#0], [id#0L, a#0, count(b#0) AS count(b)#0L] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/pivot_without_column_values.explain 
b/connector/connect/common/src/test/resources/query-tests/explain-results/pivot_without_column_values.explain new file mode 100644 index 0000000000000..1a50919770c9d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/pivot_without_column_values.explain @@ -0,0 +1,4 @@ +Project [id#0L] ++- Aggregate [id#0L], [id#0L, pivotfirst(a#0, count(b)#0L, 0, 0) AS __pivot_count(b) AS `count(b)`#0] + +- Aggregate [id#0L, a#0], [id#0L, a#0, count(b#0) AS count(b)#0L] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/range.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/range.explain new file mode 100644 index 0000000000000..be44c87de030c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/range.explain @@ -0,0 +1 @@ +Range (1, 10, step=1, splits=Some(2)) diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read.explain new file mode 100644 index 0000000000000..da1c0f25e548b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read.explain @@ -0,0 +1 @@ +Relation [name#0,age#0,job#0] csv diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_csv.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_csv.explain new file mode 100644 index 0000000000000..4479893592a53 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_csv.explain @@ -0,0 +1 @@ +Relation [_c0#0] csv diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc.explain new file mode 100644 index 0000000000000..c0e906176b867 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc.explain @@ -0,0 +1 @@ +Relation [A#0,B#0,C#0] JDBCRelation(TEST.TIMETYPES) [numPartitions=1] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc_with_partition.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc_with_partition.explain new file mode 100644 index 0000000000000..e3ddb781bd2b8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc_with_partition.explain @@ -0,0 +1 @@ +Relation [NAME#0,THEID#0,Dept#0] JDBCRelation(TEST.EMP) [numPartitions=3] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc_with_predicates.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc_with_predicates.explain new file mode 100644 index 0000000000000..d3eb0fc7d0ffb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_jdbc_with_predicates.explain @@ -0,0 +1 @@ +Relation [NAME#0,THEID#0] JDBCRelation(TEST.PEOPLE) [numPartitions=2] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_json.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_json.explain new file mode 100644 index 0000000000000..871c86c239b93 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_json.explain @@ -0,0 +1 @@ +Relation 
[age#0L,name#0] json diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_orc.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_orc.explain new file mode 100644 index 0000000000000..c5cff325e5e72 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_orc.explain @@ -0,0 +1 @@ +Relation [name#0,favorite_color#0,favorite_numbers#0] orc diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_parquet.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_parquet.explain new file mode 100644 index 0000000000000..f77414dc47e14 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_parquet.explain @@ -0,0 +1 @@ +Relation [name#0,favorite_color#0,favorite_numbers#0] parquet diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_path.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_path.explain new file mode 100644 index 0000000000000..bad6a06e3fdc0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_path.explain @@ -0,0 +1 @@ +Relation [name#0,age#0] csv diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_table.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_table.explain new file mode 100644 index 0000000000000..11a96567dbcad --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_table.explain @@ -0,0 +1,2 @@ +SubqueryAlias primary.tempdb.myTable ++- RelationV2[id#0L] primary.tempdb.myTable primary.tempdb.myTable diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/read_text.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/read_text.explain new file mode 100644 index 0000000000000..1002d71460357 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/read_text.explain @@ -0,0 +1 @@ +Relation [value#0] text diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/relation_extension.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/relation_extension.explain new file mode 100644 index 0000000000000..df724a7dd185a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/relation_extension.explain @@ -0,0 +1 @@ +LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/repartition.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/repartition.explain new file mode 100644 index 0000000000000..f30594bc18c81 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/repartition.explain @@ -0,0 +1,2 @@ +Repartition 24, true ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/repartitionByRange_expressions.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/repartitionByRange_expressions.explain new file mode 100644 index 0000000000000..d38e6a0075f33 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/repartitionByRange_expressions.explain @@ -0,0 +1,2 @@ +RepartitionByExpression [a#0 ASC NULLS 
FIRST, id#0L DESC NULLS FIRST] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/repartitionByRange_num_partitions_expressions.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/repartitionByRange_num_partitions_expressions.explain new file mode 100644 index 0000000000000..d9c9678ab5e07 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/repartitionByRange_num_partitions_expressions.explain @@ -0,0 +1,2 @@ +RepartitionByExpression [b#0 ASC NULLS FIRST, id#0L DESC NULLS FIRST], 33 ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/repartition_expressions.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/repartition_expressions.explain new file mode 100644 index 0000000000000..3b602b4fd71a8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/repartition_expressions.explain @@ -0,0 +1,2 @@ +RepartitionByExpression [id#0L, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/repartition_num_partitions_expressions.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/repartition_num_partitions_expressions.explain new file mode 100644 index 0000000000000..e65dc5cd7b81f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/repartition_num_partitions_expressions.explain @@ -0,0 +1,2 @@ +RepartitionByExpression [a#0, id#0L], 22 ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/replace.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/replace.explain new file mode 100644 index 0000000000000..ef3de21e881f2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/replace.explain @@ -0,0 +1,2 @@ +Project [CASE WHEN (cast(id#0L as double) = 1.0) THEN cast(8.0 as bigint) ELSE id#0L END AS id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/rollup_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/rollup_column.explain new file mode 100644 index 0000000000000..c8f0f1e2aeb25 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/rollup_column.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, b#0, spark_grouping_id#0L], [a#0, b#0, count(1) AS count#0L] ++- Expand [[id#0L, a#0, b#0, a#0, b#0, 0], [id#0L, a#0, b#0, a#0, null, 1], [id#0L, a#0, b#0, null, null, 3]], [id#0L, a#0, b#0, a#0, b#0, spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0, b#0 AS b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/rollup_string.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/rollup_string.explain new file mode 100644 index 0000000000000..c8f0f1e2aeb25 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/rollup_string.explain @@ -0,0 +1,4 @@ +Aggregate [a#0, b#0, spark_grouping_id#0L], [a#0, b#0, count(1) AS count#0L] ++- Expand [[id#0L, a#0, b#0, a#0, b#0, 0], [id#0L, a#0, b#0, a#0, null, 1], [id#0L, a#0, b#0, null, null, 3]], [id#0L, a#0, b#0, a#0, b#0, 
spark_grouping_id#0L] + +- Project [id#0L, a#0, b#0, a#0 AS a#0, b#0 AS b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/sampleBy.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sampleBy.explain new file mode 100644 index 0000000000000..64abbcf1b5365 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sampleBy.explain @@ -0,0 +1,2 @@ +Filter UDF(id#0L, rand(0)) ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain new file mode 100644 index 0000000000000..f94e0a850e403 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sample_fraction_seed.explain @@ -0,0 +1,2 @@ +Sample 0.0, 0.43, false, 9890823 ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain new file mode 100644 index 0000000000000..340c25ab6d017 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sample_withReplacement_fraction_seed.explain @@ -0,0 +1,2 @@ +Sample 0.0, 0.23, true, 898 ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/select.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/select.explain new file mode 100644 index 0000000000000..aac54ef566259 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/select.explain @@ -0,0 +1,2 @@ +Project [id#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/selectExpr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/selectExpr.explain new file mode 100644 index 0000000000000..935a26e47d327 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/selectExpr.explain @@ -0,0 +1,2 @@ +Project [(a#0 + 10) AS x#0, (id#0L % cast(10 as bigint)) AS grp#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/select_strings.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/select_strings.explain new file mode 100644 index 0000000000000..c0a9b3df30b26 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/select_strings.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/select_typed_1-arg.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/select_typed_1-arg.explain new file mode 100644 index 0000000000000..64017a5e07345 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/select_typed_1-arg.explain @@ -0,0 +1,3 @@ +Project [id#0L, a#0] ++- Generate inline(array(struct(id, id#0L, a, a#0))), false, [id#0L, a#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/sortWithinPartitions_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sortWithinPartitions_columns.explain new file mode 100644 index 0000000000000..ce1e7ae6f4c0a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sortWithinPartitions_columns.explain @@ -0,0 +1,2 @@ +Sort [id#0L ASC NULLS FIRST, b#0 ASC NULLS FIRST], false ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/sortWithinPartitions_strings.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sortWithinPartitions_strings.explain new file mode 100644 index 0000000000000..373960f809bac --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sortWithinPartitions_strings.explain @@ -0,0 +1,2 @@ +Sort [a#0 ASC NULLS FIRST, id#0L ASC NULLS FIRST], false ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/sort_columns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sort_columns.explain new file mode 100644 index 0000000000000..8cb023ce5fbad --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sort_columns.explain @@ -0,0 +1,2 @@ +Sort [id#0L ASC NULLS FIRST, b#0 ASC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/sort_strings.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/sort_strings.explain new file mode 100644 index 0000000000000..2d060dc9fb8bd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/sort_strings.explain @@ -0,0 +1,2 @@ +Sort [b#0 ASC NULLS FIRST, a#0 ASC NULLS FIRST], true ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/summary.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/summary.explain new file mode 100644 index 0000000000000..3ce8a26f13834 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/summary.explain @@ -0,0 +1,5 @@ +Project [summary#0, element_at(id#0, summary#0, None, false) AS id#0, element_at(a#0, summary#0, None, false) AS a#0, element_at(b#0, summary#0, None, false) AS b#0] ++- Project [id#0, a#0, b#0, summary#0] + +- Generate explode([mean,min]), false, [summary#0] + +- Aggregate [map(cast(mean as string), cast(avg(id#0L) as string), cast(min as string), cast(min(id#0L) as string)) AS id#0, map(cast(mean as string), cast(avg(a#0) as string), cast(min as string), cast(min(a#0) as string)) AS a#0, map(cast(mean as string), cast(avg(b#0) as string), cast(min as string), cast(min(b#0) as string)) AS b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/table.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/table.explain new file mode 100644 index 0000000000000..11a96567dbcad --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/table.explain @@ -0,0 +1,2 @@ +SubqueryAlias primary.tempdb.myTable ++- RelationV2[id#0L] primary.tempdb.myTable primary.tempdb.myTable diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/table_API_with_options.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/table_API_with_options.explain new file mode 100644 index 0000000000000..11a96567dbcad --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/table_API_with_options.explain @@ -0,0 +1,2 @@ +SubqueryAlias primary.tempdb.myTable ++- RelationV2[id#0L] primary.tempdb.myTable primary.tempdb.myTable diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/test_broadcast.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/test_broadcast.explain new file mode 100644 index 0000000000000..8c86098c265b8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/test_broadcast.explain @@ -0,0 +1,5 @@ +'Project [id#0L, a#0, b#0, a#0, payload#0] ++- 'Join Inner, (id#0L = id#0L) + :- LocalRelation , [id#0L, a#0, b#0] + +- ResolvedHint (strategy=broadcast) + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/to.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/to.explain new file mode 100644 index 0000000000000..1e113ce4ddefe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/to.explain @@ -0,0 +1,2 @@ +Project [b#0, cast(id#0L as int) AS id#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/toDF.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/toDF.explain new file mode 100644 index 0000000000000..e751403f8d43a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/toDF.explain @@ -0,0 +1,2 @@ +Project [id#0L AS x1#0L, a#0 AS x2#0, b#0 AS x3#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/toJSON.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/toJSON.explain new file mode 100644 index 0000000000000..1698c562732e8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/toJSON.explain @@ -0,0 +1,2 @@ +Project [to_json(struct(id, id#0L, a, a#0, b, b#0, d, d#0, e, e#0, f, f#0, g, g#0), Some(America/Los_Angeles)) AS to_json(struct(id, a, b, d, e, f, g))#0] ++- LocalRelation , [id#0L, a#0, b#0, d#0, e#0, f#0, g#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/union.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/union.explain new file mode 100644 index 0000000000000..4d5d1f53b8412 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/union.explain @@ -0,0 +1,3 @@ +Union false, false +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/unionAll.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/unionAll.explain new file mode 100644 index 0000000000000..4d5d1f53b8412 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/unionAll.explain @@ -0,0 +1,3 @@ +Union false, false +:- LocalRelation , [id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git 
a/connector/connect/common/src/test/resources/query-tests/explain-results/unionByName.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/unionByName.explain new file mode 100644 index 0000000000000..ed960186ad4d8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/unionByName.explain @@ -0,0 +1,5 @@ +Union false, false +:- Project [id#0L, a#0] +: +- LocalRelation , [id#0L, a#0, b#0] ++- Project [id#0L, a#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/unionByName_allowMissingColumns.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/unionByName_allowMissingColumns.explain new file mode 100644 index 0000000000000..96bd9f281c15e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/unionByName_allowMissingColumns.explain @@ -0,0 +1,5 @@ +Union false, false +:- Project [id#0L, a#0, b#0, null AS payload#0] +: +- LocalRelation , [id#0L, a#0, b#0] ++- Project [id#0L, a#0, null AS b#0, payload#0] + +- LocalRelation , [a#0, id#0L, payload#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/unpivot_no_values.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/unpivot_no_values.explain new file mode 100644 index 0000000000000..8d1749ee74c5a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/unpivot_no_values.explain @@ -0,0 +1,2 @@ +Expand [[id#0L, a, cast(a#0 as double)], [id#0L, b, b#0]], [id#0L, #0, value#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/unpivot_values.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/unpivot_values.explain new file mode 100644 index 0000000000000..f61fc30a3a529 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/unpivot_values.explain @@ -0,0 +1,2 @@ +Expand [[id#0L, a#0, b, b#0]], [id#0L, a#0, #0, value#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/where_column.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/where_column.explain new file mode 100644 index 0000000000000..bb4aa22afe02f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/where_column.explain @@ -0,0 +1,2 @@ +Filter (id#0L = 1) ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/where_expr.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/where_expr.explain new file mode 100644 index 0000000000000..cce42b721169a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/where_expr.explain @@ -0,0 +1,2 @@ +Filter ((cast(a#0 as bigint) + id#0L) < cast(1000 as bigint)) ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/window.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/window.explain new file mode 100644 index 0000000000000..6f81695792540 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/window.explain @@ -0,0 +1,8 @@ +Project [min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND 
UNBOUNDED FOLLOWING)#0L, min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, count(id) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L] ++- Project [id#0L, a#0, b#0, min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, count(id) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, min(id) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, count(id) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L] + +- Window [count(id#0L) windowspecdefinition(specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS count(id) OVER (ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L] + +- Window [min(id#0L) windowspecdefinition(a#0 ASC NULLS FIRST, specifiedwindowframe(RowFrame, 2, 3)) AS min(id) OVER (ORDER BY a ASC NULLS FIRST ROWS BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L, min(id#0L) windowspecdefinition(a#0 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, cast(2 as int), cast(3 as int))) AS min(id) OVER (ORDER BY a ASC NULLS FIRST RANGE BETWEEN 2 FOLLOWING AND 3 FOLLOWING)#0L], [a#0 ASC NULLS FIRST] + +- Window [min(id#0L) windowspecdefinition(a#0 ASC NULLS FIRST, b#0 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L, min(id#0L) windowspecdefinition(a#0 ASC NULLS FIRST, b#0 ASC NULLS FIRST, specifiedwindowframe(RangeFrame, unboundedpreceding$(), currentrow$())) AS min(id) OVER (ORDER BY a ASC NULLS FIRST, b ASC NULLS FIRST RANGE BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW)#0L], [a#0 ASC NULLS FIRST, b#0 ASC NULLS FIRST] + +- Window [min(id#0L) windowspecdefinition(a#0, b#0, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS min(id) OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L, min(id#0L) windowspecdefinition(a#0, b#0, specifiedwindowframe(RowFrame, unboundedpreceding$(), unboundedfollowing$())) AS min(id) 
OVER (PARTITION BY a, b ROWS BETWEEN UNBOUNDED PRECEDING AND UNBOUNDED FOLLOWING)#0L], [a#0, b#0] + +- Project [id#0L, a#0, b#0] + +- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_java_map.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_java_map.explain new file mode 100644 index 0000000000000..0bacc8a11d231 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_java_map.explain @@ -0,0 +1,2 @@ +Project [id#0L AS nid#0L, a#0, b#0 AS bravo#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_scala_map.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_scala_map.explain new file mode 100644 index 0000000000000..56bed6fca9a58 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_scala_map.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0 AS alpha#0, b#0 AS beta#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_single.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_single.explain new file mode 100644 index 0000000000000..f4713f5fbc2ed --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumnRenamed_single.explain @@ -0,0 +1,2 @@ +Project [id#0L AS nid#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withColumn_single.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumn_single.explain new file mode 100644 index 0000000000000..958529c1121fc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumn_single.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0, b#0, (a#0 + 100) AS z#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withColumns_java_map.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumns_java_map.explain new file mode 100644 index 0000000000000..dadb7bfbd861e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumns_java_map.explain @@ -0,0 +1,2 @@ +Project [id#0L, 123 AS a#0, b#0, id#0L AS g#0L] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withColumns_scala_map.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumns_scala_map.explain new file mode 100644 index 0000000000000..3866cbbba70a7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/explain-results/withColumns_scala_map.explain @@ -0,0 +1,2 @@ +Project [id#0L, a#0, redacted AS b#0, (a#0 + 100) AS z#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/explain-results/withMetadata.explain b/connector/connect/common/src/test/resources/query-tests/explain-results/withMetadata.explain new file mode 100644 index 0000000000000..0bd68ea4e8a1b --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/explain-results/withMetadata.explain @@ -0,0 +1,2 @@ +Project [id#0L AS id#0L, a#0, b#0] ++- LocalRelation , [id#0L, a#0, b#0] diff --git a/connector/connect/common/src/test/resources/query-tests/queries/alias_string.json b/connector/connect/common/src/test/resources/query-tests/queries/alias_string.json new file mode 100644 index 0000000000000..98ea62f986b99 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/alias_string.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "alias": "fooz" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/alias_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/alias_string.proto.bin new file mode 100644 index 0000000000000..6e8467cccde18 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/alias_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/alias_symbol.json b/connector/connect/common/src/test/resources/query-tests/queries/alias_symbol.json new file mode 100644 index 0000000000000..b469cbd0a3351 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/alias_symbol.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "alias": "bob" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/alias_symbol.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/alias_symbol.proto.bin new file mode 100644 index 0000000000000..7034d39cd8a57 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/alias_symbol.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/apply.json b/connector/connect/common/src/test/resources/query-tests/queries/apply.json new file mode 100644 index 0000000000000..e484781708ebf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/apply.json @@ -0,0 +1,21 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/apply.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/apply.proto.bin new file mode 100644 index 0000000000000..5d5efcead5e1c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/apply.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/as_string.json b/connector/connect/common/src/test/resources/query-tests/queries/as_string.json new file mode 100644 index 0000000000000..d74c9d16a7ffb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/as_string.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "subqueryAlias": { + "input": { + "common": 
{ + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "alias": "foo" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/as_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/as_string.proto.bin new file mode 100644 index 0000000000000..829d6083e094d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/as_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/as_symbol.json b/connector/connect/common/src/test/resources/query-tests/queries/as_symbol.json new file mode 100644 index 0000000000000..ca69a743175f0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/as_symbol.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "alias": "bar" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/as_symbol.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/as_symbol.proto.bin new file mode 100644 index 0000000000000..f7111a4651d92 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/as_symbol.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/coalesce.json b/connector/connect/common/src/test/resources/query-tests/queries/coalesce.json new file mode 100644 index 0000000000000..cb08412296aa8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/coalesce.json @@ -0,0 +1,17 @@ +{ + "common": { + "planId": "1" + }, + "repartition": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "numPartitions": 5, + "shuffle": false + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/coalesce.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/coalesce.proto.bin new file mode 100644 index 0000000000000..b03e7d58a2bfd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/coalesce.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/col.json b/connector/connect/common/src/test/resources/query-tests/queries/col.json new file mode 100644 index 0000000000000..f3abc8a81affb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/col.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id", + "planId": "0" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b", + "planId": "0" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/col.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/col.proto.bin new file mode 100644 index 0000000000000..15c4eabb8d505 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/col.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/colRegex.json b/connector/connect/common/src/test/resources/query-tests/queries/colRegex.json new file mode 100644 index 0000000000000..3a7508b63a987 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/colRegex.json @@ -0,0 +1,21 @@ +{ + "common": { + "planId": "2" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedRegex": { + "colName": "`a|id`", + "planId": "1" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/colRegex.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/colRegex.proto.bin new file mode 100644 index 0000000000000..ce518b35fbd9f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/colRegex.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_add.json b/connector/connect/common/src/test/resources/query-tests/queries/column_add.json new file mode 100644 index 0000000000000..cfa40fac8c6f9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_add.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "+", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin new file mode 100644 index 0000000000000..10b410b5b08b5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_add.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_alias.json b/connector/connect/common/src/test/resources/query-tests/queries/column_alias.json new file mode 100644 index 0000000000000..4fe650db9d3b5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_alias.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "alias": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "name": ["b"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_alias.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_alias.proto.bin new file mode 100644 index 0000000000000..e9b9076832877 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_alias.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/column_and.json b/connector/connect/common/src/test/resources/query-tests/queries/column_and.json new file mode 100644 index 0000000000000..d3f8cd0e73cbc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_and.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "and", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "\u003c", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "double": 0.5 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin new file mode 100644 index 0000000000000..241f1a9303b2c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_and.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_apply.json b/connector/connect/common/src/test/resources/query-tests/queries/column_apply.json new file mode 100644 index 0000000000000..b203a20a0ea6c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_apply.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedExtractValue": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, + "extraction": { + "literal": { + "string": "super_duper_key" + } + } + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_apply.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_apply.proto.bin new file mode 100644 index 0000000000000..9e56d5891f503 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_apply.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_as_multi.json b/connector/connect/common/src/test/resources/query-tests/queries/column_as_multi.json new file mode 100644 index 0000000000000..426fd1fbb7592 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_as_multi.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + 
"alias": { + "expr": { + "expressionString": { + "expression": "inline(map_values(f))" + } + }, + "name": ["v1", "v2", "v3"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_as_multi.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_as_multi.proto.bin new file mode 100644 index 0000000000000..602beafb01cbe Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_as_multi.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_as_with_metadata.json b/connector/connect/common/src/test/resources/query-tests/queries/column_as_with_metadata.json new file mode 100644 index 0000000000000..e943c01f26fbe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_as_with_metadata.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "alias": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, + "name": ["e_mod"], + "metadata": "{\"comment\":\"modified E field\"}" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_as_with_metadata.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_as_with_metadata.proto.bin new file mode 100644 index 0000000000000..2952e871f6e65 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_as_with_metadata.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_asc.json b/connector/connect/common/src/test/resources/query-tests/queries/column_asc.json new file mode 100644 index 0000000000000..31f3102f77a49 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_asc.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_asc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_asc.proto.bin new file mode 100644 index 0000000000000..ee5bda529c453 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_asc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_first.json b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_first.json new file mode 100644 index 0000000000000..31f3102f77a49 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_first.json @@ -0,0 +1,25 @@ +{ + 
"common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_first.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_first.proto.bin new file mode 100644 index 0000000000000..ee5bda529c453 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_first.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_last.json b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_last.json new file mode 100644 index 0000000000000..94326e0f6621d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_last.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_LAST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_last.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_last.proto.bin new file mode 100644 index 0000000000000..496fe40192dae Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_asc_nulls_last.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_between.json b/connector/connect/common/src/test/resources/query-tests/queries/column_between.json new file mode 100644 index 0000000000000..20927b93d8438 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_between.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "and", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003e\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "\u003c\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 20 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin new file mode 100644 index 0000000000000..d03dd02a2f36a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_between.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json new file mode 100644 index 0000000000000..bd3ac671fca33 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u0026", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 255 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin new file mode 100644 index 0000000000000..4815bc7dd1a20 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseAND.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json new file mode 100644 index 0000000000000..eaa27ffa46164 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "|", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 7 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin new file mode 100644 index 0000000000000..9cf110da4ad61 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseOR.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json new file mode 100644 index 0000000000000..c51eb3140c339 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "^", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 78 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin new file mode 100644 index 0000000000000..70c61f9620576 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_bitwiseXOR.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_cast.json b/connector/connect/common/src/test/resources/query-tests/queries/column_cast.json new file mode 100644 index 0000000000000..1a1ee5ed4d51a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_cast.json @@ -0,0 +1,28 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "type": { + "long": { + } + } + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_cast.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_cast.proto.bin new file mode 100644 index 0000000000000..60e807b4c3507 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_cast.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_contains.json b/connector/connect/common/src/test/resources/query-tests/queries/column_contains.json new file mode 100644 index 0000000000000..05d6ccf38b367 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_contains.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "contains", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "baz" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin new file mode 100644 index 0000000000000..9c796f9470c31 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_contains.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_desc.json b/connector/connect/common/src/test/resources/query-tests/queries/column_desc.json new file 
mode 100644 index 0000000000000..50efda387ec44 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_desc.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_LAST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_desc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_desc.proto.bin new file mode 100644 index 0000000000000..df2589d8231bc Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_desc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_first.json b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_first.json new file mode 100644 index 0000000000000..bed300feea2eb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_first.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_first.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_first.proto.bin new file mode 100644 index 0000000000000..b8caacc55b9e8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_first.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_last.json b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_last.json new file mode 100644 index 0000000000000..50efda387ec44 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_last.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_LAST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_last.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_last.proto.bin new file mode 100644 index 0000000000000..df2589d8231bc Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_desc_nulls_last.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_divide.json b/connector/connect/common/src/test/resources/query-tests/queries/column_divide.json new file mode 100644 index 0000000000000..8d71061b151ca --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_divide.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "/", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin new file mode 100644 index 0000000000000..49b5d8d2590dd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_divide.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_dropFields.json b/connector/connect/common/src/test/resources/query-tests/queries/column_dropFields.json new file mode 100644 index 0000000000000..92639eeedc67d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_dropFields.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "updateFields": { + "structExpression": { + "updateFields": { + "structExpression": { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, + "fieldName": "a" + } + }, + "fieldName": "c" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_dropFields.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_dropFields.proto.bin new file mode 100644 index 0000000000000..edafc8b1f1f51 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_dropFields.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_endsWith.json b/connector/connect/common/src/test/resources/query-tests/queries/column_endsWith.json new file mode 100644 index 0000000000000..f4171c2792fbd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_endsWith.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "endswith", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "suffix_" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin new file mode 100644 index 0000000000000..03f41a339f00c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_endsWith.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json b/connector/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json new file mode 100644 index 0000000000000..eea1da49bc59e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u003c\u003d\u003e", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin new file mode 100644 index 0000000000000..22de941ad44b0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_eqNullSafe.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_equals.json b/connector/connect/common/src/test/resources/query-tests/queries/column_equals.json new file mode 100644 index 0000000000000..7397f4fb46acd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_equals.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin new file mode 100644 index 0000000000000..e226de59ddcd4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_equals.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/column_geq.json b/connector/connect/common/src/test/resources/query-tests/queries/column_geq.json new file mode 100644 index 0000000000000..9f24bc251739f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_geq.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u003e\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin new file mode 100644 index 0000000000000..1c4af866109ab Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_geq.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_getField.json b/connector/connect/common/src/test/resources/query-tests/queries/column_getField.json new file mode 100644 index 0000000000000..21d5bb6f23d89 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_getField.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedExtractValue": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, + "extraction": { + "literal": { + "string": "b" + } + } + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_getField.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_getField.proto.bin new file mode 100644 index 0000000000000..c76b69bf5fa4a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_getField.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_getItem.json b/connector/connect/common/src/test/resources/query-tests/queries/column_getItem.json new file mode 100644 index 0000000000000..e3bfd3d6e842a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_getItem.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedExtractValue": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, + "extraction": { + "literal": { + "integer": 3 + } + } + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/column_getItem.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_getItem.proto.bin new file mode 100644 index 0000000000000..9120801100fea Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_getItem.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_gt.json b/connector/connect/common/src/test/resources/query-tests/queries/column_gt.json new file mode 100644 index 0000000000000..4bb8fb41f249d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_gt.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin new file mode 100644 index 0000000000000..44ca37fbb4048 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_gt.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_ilike.json b/connector/connect/common/src/test/resources/query-tests/queries/column_ilike.json new file mode 100644 index 0000000000000..47c1b63abe319 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_ilike.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "like", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "%fOb%" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin new file mode 100644 index 0000000000000..285400db7daf5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_ilike.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isNaN.json b/connector/connect/common/src/test/resources/query-tests/queries/column_isNaN.json new file mode 100644 index 0000000000000..f594918ed930a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_isNaN.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "isNaN", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin new file mode 100644 index 0000000000000..1030abda5b8c2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_isNaN.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json b/connector/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json new file mode 100644 index 0000000000000..f34d3f4eac552 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_isNotNull.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "isNotNull", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin new file mode 100644 index 0000000000000..e8cccdf024934 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_isNotNull.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isNull.json b/connector/connect/common/src/test/resources/query-tests/queries/column_isNull.json new file mode 100644 index 0000000000000..74e990622a3a7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_isNull.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "isNull", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin new file mode 100644 index 0000000000000..8fc24a9e21b38 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_isNull.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isin.json b/connector/connect/common/src/test/resources/query-tests/queries/column_isin.json new file mode 100644 index 0000000000000..d8811a4e780b5 --- /dev/null 
+++ b/connector/connect/common/src/test/resources/query-tests/queries/column_isin.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "in", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "hello" + } + }, { + "literal": { + "string": "world" + } + }, { + "literal": { + "string": "foo" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin new file mode 100644 index 0000000000000..365e07f35bb48 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_isin.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_leq.json b/connector/connect/common/src/test/resources/query-tests/queries/column_leq.json new file mode 100644 index 0000000000000..cda8694c0439e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_leq.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u003c\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin new file mode 100644 index 0000000000000..e8463292e4040 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_leq.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_like.json b/connector/connect/common/src/test/resources/query-tests/queries/column_like.json new file mode 100644 index 0000000000000..1390451af55ab --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_like.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "like", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "%bob%" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin new file mode 100644 index 0000000000000..07382ec1643cb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_like.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_lt.json b/connector/connect/common/src/test/resources/query-tests/queries/column_lt.json new file mode 100644 index 0000000000000..c927e75de181b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_lt.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "\u003c", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin new file mode 100644 index 0000000000000..f4c3a110b126b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_lt.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_modulo.json b/connector/connect/common/src/test/resources/query-tests/queries/column_modulo.json new file mode 100644 index 0000000000000..0c5a78eea2dff --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_modulo.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "%", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin new file mode 100644 index 0000000000000..55bfeba04ed66 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_modulo.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_multiply.json b/connector/connect/common/src/test/resources/query-tests/queries/column_multiply.json new file mode 100644 index 0000000000000..8c17581c67d1c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_multiply.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } 
+ }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "*", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin new file mode 100644 index 0000000000000..8fd1b3941d1f7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_multiply.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_not.json b/connector/connect/common/src/test/resources/query-tests/queries/column_not.json new file mode 100644 index 0000000000000..2f873196ba1d0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_not.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "!", + "arguments": [{ + "literal": { + "boolean": true + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin new file mode 100644 index 0000000000000..19609b6ee85a5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_not.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_not_equals.json b/connector/connect/common/src/test/resources/query-tests/queries/column_not_equals.json new file mode 100644 index 0000000000000..589d57a18b94b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_not_equals.json @@ -0,0 +1,34 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "!", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin new file mode 100644 index 0000000000000..cdf0b4290e61e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_not_equals.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_or.json b/connector/connect/common/src/test/resources/query-tests/queries/column_or.json new file mode 100644 index 0000000000000..ae1424f763feb --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/column_or.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "or", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "\u003c", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "double": 0.5 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin new file mode 100644 index 0000000000000..69f219e938a4e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_or.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_rlike.json b/connector/connect/common/src/test/resources/query-tests/queries/column_rlike.json new file mode 100644 index 0000000000000..e53403db41cd0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_rlike.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "like", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "^[0-9]*$" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin new file mode 100644 index 0000000000000..7dd56baf04213 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_rlike.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_star.json b/connector/connect/common/src/test/resources/query-tests/queries/column_star.json new file mode 100644 index 0000000000000..ef88067a7a4c1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_star.json @@ -0,0 +1,19 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedStar": { + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_star.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_star.proto.bin 
new file mode 100644 index 0000000000000..3ca04d082d766 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_star.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_star_with_target.json b/connector/connect/common/src/test/resources/query-tests/queries/column_star_with_target.json new file mode 100644 index 0000000000000..e1159b5673dba --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_star_with_target.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedStar": { + "unparsedTarget": "d.*" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_star_with_target.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_star_with_target.proto.bin new file mode 100644 index 0000000000000..af744b7d6b47c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_star_with_target.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_startsWith.json b/connector/connect/common/src/test/resources/query-tests/queries/column_startsWith.json new file mode 100644 index 0000000000000..431e13d818639 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_startsWith.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "startswith", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "prefix_" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin new file mode 100644 index 0000000000000..fa1132c73de7b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_startsWith.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_substr.json b/connector/connect/common/src/test/resources/query-tests/queries/column_substr.json new file mode 100644 index 0000000000000..3b02117cc6e5b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_substr.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substr", + "arguments": [{ + "unresolvedAttribute": { 
+ "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 8 + } + }, { + "literal": { + "integer": 3 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin new file mode 100644 index 0000000000000..636a46a480626 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_substr.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_subtract.json b/connector/connect/common/src/test/resources/query-tests/queries/column_subtract.json new file mode 100644 index 0000000000000..d15c2941ee1bd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_subtract.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "-", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin new file mode 100644 index 0000000000000..f5716427588ed Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_subtract.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json b/connector/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json new file mode 100644 index 0000000000000..0db558e49e38c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_unary_minus.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "negative", + "arguments": [{ + "literal": { + "integer": 1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin new file mode 100644 index 0000000000000..66343bea4e29b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_unary_minus.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json b/connector/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json new file mode 100644 index 0000000000000..db2ceccfd22ab --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.json @@ -0,0 +1,59 @@ +{ + "common": { + "planId": "1" + }, + 
"project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "when", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003c", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, { + "literal": { + "string": "low" + } + }, { + "unresolvedFunction": { + "functionName": "\u003c", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 20 + } + }] + } + }, { + "literal": { + "string": "medium" + } + }, { + "literal": { + "string": "high" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin new file mode 100644 index 0000000000000..031c3683c5e6d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_when_otherwise.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_withField.json b/connector/connect/common/src/test/resources/query-tests/queries/column_withField.json new file mode 100644 index 0000000000000..86b9396a4e13f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/column_withField.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "updateFields": { + "structExpression": { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, + "fieldName": "x", + "valueExpression": { + "literal": { + "string": "xq" + } + } + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/column_withField.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/column_withField.proto.bin new file mode 100644 index 0000000000000..a413740fa5054 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/column_withField.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/crossJoin.json b/connector/connect/common/src/test/resources/query-tests/queries/crossJoin.json new file mode 100644 index 0000000000000..fecba18096bb3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/crossJoin.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_CROSS" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/crossJoin.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/crossJoin.proto.bin new file mode 100644 index 0000000000000..ff6d2f3b4a7a0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/crossJoin.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/crosstab.json b/connector/connect/common/src/test/resources/query-tests/queries/crosstab.json new file mode 100644 index 0000000000000..755a6fa4dd249 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/crosstab.json @@ -0,0 +1,17 @@ +{ + "common": { + "planId": "1" + }, + "crosstab": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "col1": "a", + "col2": "b" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/crosstab.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/crosstab.proto.bin new file mode 100644 index 0000000000000..c664cedb01c5d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/crosstab.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json new file mode 100644 index 0000000000000..d34fcb6f758e6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "parse": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + } + }, + "format": "PARSE_FORMAT_CSV", + "schema": { + "struct": { + "fields": [{ + "name": "c1", + "dataType": { + "string": { + } + }, + "nullable": true + }, { + "name": "c2", + "dataType": { + "integer": { + } + }, + "nullable": true + }] + } + }, + "options": { + "header": "true" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin new file mode 100644 index 0000000000000..5f8bd50685ca8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/csv_from_dataset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/cube_column.json b/connector/connect/common/src/test/resources/query-tests/queries/cube_column.json new file mode 100644 index 0000000000000..5b9709ff06576 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/cube_column.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_CUBE", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }], + "aggregateExpressions": [{ + "alias": { + "expr": { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "literal": { + "integer": 1 + } + }] + } + }, + "name": ["count"] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin new file mode 100644 index 0000000000000..d46e40b39dcfe Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/cube_column.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/cube_string.json b/connector/connect/common/src/test/resources/query-tests/queries/cube_string.json new file mode 100644 index 0000000000000..5b9709ff06576 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/cube_string.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_CUBE", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }], + "aggregateExpressions": [{ + "alias": { + "expr": { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "literal": { + "integer": 1 + } + }] + } + }, + "name": ["count"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin new file mode 100644 index 0000000000000..d46e40b39dcfe Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/cube_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/describe.json b/connector/connect/common/src/test/resources/query-tests/queries/describe.json new file mode 100644 index 0000000000000..d767db5241f45 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/describe.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "describe": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "cols": ["id", "b"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/describe.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/describe.proto.bin new file mode 100644 index 0000000000000..8a2117e519f6a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/describe.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/distinct.json b/connector/connect/common/src/test/resources/query-tests/queries/distinct.json new file mode 100644 index 0000000000000..ae796b520353c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/distinct.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "deduplicate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "allColumnsAsKeys": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/distinct.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/distinct.proto.bin new file mode 100644 index 0000000000000..07430c4383106 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/distinct.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop.json b/connector/connect/common/src/test/resources/query-tests/queries/drop.json new file mode 100644 index 0000000000000..a5176b25b05b6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/drop.json @@ -0,0 +1,17 @@ +{ + "common": { + "planId": "1" + }, + "dropNa": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "cols": ["id", "a"], + "minNonNulls": 5 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/drop.proto.bin new file mode 100644 index 0000000000000..9e18d02afbc6f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/drop.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates.json b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates.json new file mode 100644 index 0000000000000..ae796b520353c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "deduplicate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "allColumnsAsKeys": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates.proto.bin new file mode 100644 index 0000000000000..07430c4383106 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_array.json b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_array.json new file mode 100644 index 0000000000000..e72e23c86caf0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_array.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "deduplicate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["a", "id"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_array.proto.bin new file mode 100644 index 0000000000000..c8e3885fbf804 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_seq.json b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_seq.json new file mode 100644 index 0000000000000..754cecac4b256 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_seq.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "deduplicate": { + "input": { + "common": { 
+ "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["a", "b"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_seq.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_seq.proto.bin new file mode 100644 index 0000000000000..1a2d635e58e56 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_names_seq.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_varargs.json b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_varargs.json new file mode 100644 index 0000000000000..c4a8df30c5867 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_varargs.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "deduplicate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["a", "b", "id"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_varargs.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_varargs.proto.bin new file mode 100644 index 0000000000000..719a373c2e384 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/dropDuplicates_varargs.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_column.json b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_column.json new file mode 100644 index 0000000000000..3ec19cf8c4c64 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_column.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "1" + }, + "drop": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columns": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_column.proto.bin new file mode 100644 index 0000000000000..f4585af804ae6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_column.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_strings.json b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_strings.json new file mode 100644 index 0000000000000..dcda09236f4bc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_strings.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "drop": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["id", "a", "b"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_strings.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_strings.proto.bin new file mode 100644 index 0000000000000..e5be859b7081d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/drop_multiple_strings.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_single_column.json b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_column.json new file mode 100644 index 0000000000000..1fe8563e0fdfc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_column.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "drop": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columns": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_single_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_column.proto.bin new file mode 100644 index 0000000000000..37d71479cdb84 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_column.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_single_string.json b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_string.json new file mode 100644 index 0000000000000..8f849d0346d55 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_string.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "drop": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["a"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/drop_single_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_string.proto.bin new file mode 100644 index 0000000000000..12013543c4632 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/drop_single_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/except.json b/connector/connect/common/src/test/resources/query-tests/queries/except.json new file mode 100644 index 0000000000000..6544e03f6e10d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/except.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "setOpType": "SET_OP_TYPE_EXCEPT", + "isAll": false + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/except.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/except.proto.bin new file mode 100644 index 0000000000000..0e9efea2f94d0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/except.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/exceptAll.json 
b/connector/connect/common/src/test/resources/query-tests/queries/exceptAll.json new file mode 100644 index 0000000000000..e77b583b9c287 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/exceptAll.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "setOpType": "SET_OP_TYPE_EXCEPT", + "isAll": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/exceptAll.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/exceptAll.proto.bin new file mode 100644 index 0000000000000..19f9231eb2674 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/exceptAll.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/expression_extension.json b/connector/connect/common/src/test/resources/query-tests/queries/expression_extension.json new file mode 100644 index 0000000000000..acfb3cc2333d1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/expression_extension.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "extension": { + "@type": "type.googleapis.com/spark.connect.ExamplePluginExpression", + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "customField": "abc" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/expression_extension.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/expression_extension.proto.bin new file mode 100644 index 0000000000000..24669eba64234 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/expression_extension.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/fill.json b/connector/connect/common/src/test/resources/query-tests/queries/fill.json new file mode 100644 index 0000000000000..8308af1f579e6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/fill.json @@ -0,0 +1,19 @@ +{ + "common": { + "planId": "1" + }, + "fillNa": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "cols": ["id"], + "values": [{ + "long": "8" + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/fill.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/fill.proto.bin new file mode 100644 index 0000000000000..b034c5e64a839 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/fill.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/filter.json b/connector/connect/common/src/test/resources/query-tests/queries/filter.json new file mode 100644 index 0000000000000..1046e1262150e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/filter.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": 
"1" + }, + "filter": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "condition": { + "unresolvedFunction": { + "functionName": "\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "literal": { + "long": "10" + } + }] + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/filter.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/filter.proto.bin new file mode 100644 index 0000000000000..069171ead3233 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/filter.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/filter_expr.json b/connector/connect/common/src/test/resources/query-tests/queries/filter_expr.json new file mode 100644 index 0000000000000..a2c49ec98c611 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/filter_expr.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "filter": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "condition": { + "expressionString": { + "expression": "exp(a) \u003c 10.0" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/filter_expr.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/filter_expr.proto.bin new file mode 100644 index 0000000000000..56e5be565435b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/filter_expr.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/freqItems.json b/connector/connect/common/src/test/resources/query-tests/queries/freqItems.json new file mode 100644 index 0000000000000..8734722b35427 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/freqItems.json @@ -0,0 +1,17 @@ +{ + "common": { + "planId": "1" + }, + "freqItems": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "cols": ["id", "a"], + "support": 0.1 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/freqItems.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/freqItems.proto.bin new file mode 100644 index 0000000000000..717f3d61ae953 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/freqItems.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_abs.json b/connector/connect/common/src/test/resources/query-tests/queries/function_abs.json new file mode 100644 index 0000000000000..13df3437ddabe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_abs.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "abs", + "arguments": [{ + "unresolvedAttribute": { + 
"unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin new file mode 100644 index 0000000000000..86cfbc09a8f91 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_abs.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_acos.json b/connector/connect/common/src/test/resources/query-tests/queries/function_acos.json new file mode 100644 index 0000000000000..7506c0f6cb630 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_acos.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "acos", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin new file mode 100644 index 0000000000000..cc6a279cb188e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_acos.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_acosh.json b/connector/connect/common/src/test/resources/query-tests/queries/function_acosh.json new file mode 100644 index 0000000000000..6a83b4ab008bc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_acosh.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "acosh", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin new file mode 100644 index 0000000000000..e16ed2ba92e3f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_acosh.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_add_months.json b/connector/connect/common/src/test/resources/query-tests/queries/function_add_months.json new file mode 100644 index 0000000000000..b1b2e78a08435 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_add_months.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "add_months", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin new file mode 100644 index 0000000000000..6abacc9cc2b40 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_add_months.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_aggregate.json b/connector/connect/common/src/test/resources/query-tests/queries/function_aggregate.json new file mode 100644 index 0000000000000..3116837aeb876 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_aggregate.json @@ -0,0 +1,62 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "aggregate", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 0 + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "+", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }, { + "lambdaFunction": { + "function": { + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, + "arguments": [{ + "nameParts": ["x"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin new file mode 100644 index 0000000000000..f97843e086a58 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_aggregate.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json new file mode 100644 index 0000000000000..5579faf119647 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "approx_count_distinct", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin new file mode 100644 index 0000000000000..bac82f670b298 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json new file mode 100644 index 0000000000000..851862082ca04 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "approx_count_distinct", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "double": 0.1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin new file mode 100644 index 0000000000000..fd61420fd1e45 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_approx_count_distinct_rsd.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array.json new file mode 100644 index 0000000000000..20fe495bb9bf4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin new file mode 100644 index 0000000000000..2b679eb4c6db1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_append.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_append.json new file mode 100644 index 0000000000000..cabd44c063dec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_append.json @@ 
-0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_append", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin new file mode 100644 index 0000000000000..76f2f0255bf25 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_append.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_compact.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_compact.json new file mode 100644 index 0000000000000..c3ebf313190c2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_compact.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_compact", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin new file mode 100644 index 0000000000000..949d66cb951f0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_compact.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_contains.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_contains.json new file mode 100644 index 0000000000000..a362d66d9d64d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_contains.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_contains", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin new file mode 100644 index 0000000000000..d8764f60364c2 Binary 
files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_contains.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json new file mode 100644 index 0000000000000..d38f4194bcd2b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_distinct.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_distinct", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin new file mode 100644 index 0000000000000..e6359c074bf23 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_distinct.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_except.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_except.json new file mode 100644 index 0000000000000..17d50c87161d6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_except.json @@ -0,0 +1,42 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_except", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 2 + } + }, { + "literal": { + "integer": 4 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin new file mode 100644 index 0000000000000..692511b2f74a6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_except.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_insert.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_insert.json new file mode 100644 index 0000000000000..f4540edbf4108 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_insert.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_insert", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 0 + } + }, { + "literal": { + "integer": 1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin new file mode 100644 index 0000000000000..6e2178ad124e9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_insert.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json new file mode 100644 index 0000000000000..1b95a6724f86d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_intersect.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_intersect", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "integer": 10 + } + }, { + "literal": { + "integer": 4 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin new file mode 100644 index 0000000000000..67fb497cf270c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_intersect.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_join.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join.json new file mode 100644 index 0000000000000..94e8c176cefbf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_join", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "string": ";" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin new file mode 
100644 index 0000000000000..fbab1b208605d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json new file mode 100644 index 0000000000000..ad580c33e476c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_join", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "string": ";" + } + }, { + "literal": { + "string": "null" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin new file mode 100644 index 0000000000000..e3fb6b3bf67c3 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_join_with_null_replacement.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_max.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_max.json new file mode 100644 index 0000000000000..ba67984758a5a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_max.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin new file mode 100644 index 0000000000000..f7a98c08cd175 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_max.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_min.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_min.json new file mode 100644 index 0000000000000..a342ae18f9ef7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_min.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin new file mode 100644 index 0000000000000..02cfdfeb215d6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_min.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_position.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_position.json new file mode 100644 index 0000000000000..4c212cb028273 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_position.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_position", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin new file mode 100644 index 0000000000000..4ef2b11273f25 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_position.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_remove.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_remove.json new file mode 100644 index 0000000000000..8c562247714a4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_remove.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_remove", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 314 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin new file mode 100644 index 0000000000000..95e2872ad77bd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_remove.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json new file mode 100644 index 0000000000000..c9d9f1f9ca79d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_repeat.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_repeat", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin new file mode 100644 index 0000000000000..e370db16e977c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_repeat.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort.json new file mode 100644 index 0000000000000..406dc54c8cd2f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_sort", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin new file mode 100644 index 0000000000000..2074caae16384 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json new file mode 100644 index 0000000000000..95be74d0b4c81 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_sort", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } 
+ }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "-", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin new file mode 100644 index 0000000000000..c1e2363f0fdab Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_sort_with_comparator.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_union.json b/connector/connect/common/src/test/resources/query-tests/queries/function_array_union.json new file mode 100644 index 0000000000000..7d54079cdb47e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_array_union.json @@ -0,0 +1,42 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "array_union", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 2 + } + }, { + "literal": { + "integer": 3 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin new file mode 100644 index 0000000000000..fc3d9d7cd0fd1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_array_union.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json new file mode 100644 index 0000000000000..ce1d288e00d78 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "arrays_overlap", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin new file mode 100644 index 0000000000000..216f306507d40 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_overlap.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json new file mode 100644 index 0000000000000..14769082725f1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.json @@ -0,0 +1,42 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "arrays_zip", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "sequence", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 20 + } + }, { + "literal": { + "long": "1" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin new file mode 100644 index 0000000000000..609f52db32478 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_arrays_zip.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asc.json b/connector/connect/common/src/test/resources/query-tests/queries/function_asc.json new file mode 100644 index 0000000000000..30740c81ba412 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_asc.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_asc.proto.bin new file mode 100644 index 0000000000000..7c5bc4213a6f8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_asc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_first.json b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_first.json new file mode 100644 index 0000000000000..30740c81ba412 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_first.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + 
"common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_first.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_first.proto.bin new file mode 100644 index 0000000000000..7c5bc4213a6f8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_first.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_last.json b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_last.json new file mode 100644 index 0000000000000..b8bbbb73544f4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_last.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_LAST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_last.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_last.proto.bin new file mode 100644 index 0000000000000..1eb6f88cac874 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_asc_nulls_last.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ascii.json b/connector/connect/common/src/test/resources/query-tests/queries/function_ascii.json new file mode 100644 index 0000000000000..3c4dcb70fead3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_ascii.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "ascii", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin new file mode 100644 index 0000000000000..5989bd3b5c606 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_ascii.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_asin.json b/connector/connect/common/src/test/resources/query-tests/queries/function_asin.json new file mode 100644 index 0000000000000..4bf89be753458 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_asin.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "asin", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin new file mode 100644 index 0000000000000..737ad789da268 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_asin.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asinh.json b/connector/connect/common/src/test/resources/query-tests/queries/function_asinh.json new file mode 100644 index 0000000000000..238571b0231c6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_asinh.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "asinh", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin new file mode 100644 index 0000000000000..01ea4675b22eb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_asinh.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json b/connector/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json new file mode 100644 index 0000000000000..5520b70a0250b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "assert_true", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "literal": { + "integer": 0 + } + }] + } + }, { + 
"literal": { + "string": "id negative!" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin new file mode 100644 index 0000000000000..6992604efe1b3 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_assert_true_with_message.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_atan.json b/connector/connect/common/src/test/resources/query-tests/queries/function_atan.json new file mode 100644 index 0000000000000..3ae4e7ef188ec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_atan.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "atan", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin new file mode 100644 index 0000000000000..b932086941f45 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_atan.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_atan2.json b/connector/connect/common/src/test/resources/query-tests/queries/function_atan2.json new file mode 100644 index 0000000000000..7d08116c40ae6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_atan2.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "atan2", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "type": { + "double": { + } + } + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin new file mode 100644 index 0000000000000..372ae8358494e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_atan2.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_atanh.json b/connector/connect/common/src/test/resources/query-tests/queries/function_atanh.json new file mode 100644 index 0000000000000..8daec8813917e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_atanh.json @@ -0,0 +1,25 @@ +{ + "common": 
{ + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "atanh", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin new file mode 100644 index 0000000000000..0aa2f3527ae9c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_atanh.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_avg.json b/connector/connect/common/src/test/resources/query-tests/queries/function_avg.json new file mode 100644 index 0000000000000..b433f1ea89c29 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_avg.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin new file mode 100644 index 0000000000000..9d9bd296dbdda Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_avg.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_base64.json b/connector/connect/common/src/test/resources/query-tests/queries/function_base64.json new file mode 100644 index 0000000000000..97739dca283ef --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_base64.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "base64", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, + "type": { + "binary": { + } + } + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin new file mode 100644 index 0000000000000..fc854d974752b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_base64.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_bin.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bin.json new file mode 100644 index 0000000000000..304e56504bad9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bin.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "bin", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin new file mode 100644 index 0000000000000..e8d55fb8d6149 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_bin.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bit_length.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bit_length.json new file mode 100644 index 0000000000000..df21871cb535d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bit_length.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "bit_length", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin new file mode 100644 index 0000000000000..860c2eaec0e85 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_bit_length.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json new file mode 100644 index 0000000000000..7ddf73253e0a3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "~", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin new file mode 100644 index 0000000000000..bfaefb2a20075 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_bitwise_not.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bround.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bround.json new file mode 100644 index 0000000000000..585a0befb224d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bround.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "round", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin new file mode 100644 index 0000000000000..8625ccb1a58f1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_bround.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json new file mode 100644 index 0000000000000..971660144a5bc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "bucket", + "arguments": [{ + "literal": { + "integer": 3 + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin new file mode 100644 index 0000000000000..1b389401f15e6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_bucket.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ceil.json b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil.json new file mode 100644 index 0000000000000..5a9961ab47f55 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "ceil", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin new file mode 100644 index 0000000000000..3761deb1663a2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json new file mode 100644 index 0000000000000..bda5e85924c30 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "ceil", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin new file mode 100644 index 0000000000000..8db402ac167e0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_ceil_scale.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_coalesce.json b/connector/connect/common/src/test/resources/query-tests/queries/function_coalesce.json new file mode 100644 index 0000000000000..497922b5df75c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_coalesce.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "coalesce", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 3 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin new file mode 100644 index 0000000000000..ec871018489c2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_coalesce.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_col.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_col.json new file mode 100644 index 0000000000000..0420a3d12f6fe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_col.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_col.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_col.proto.bin new file mode 100644 index 0000000000000..e113880f31b77 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_col.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_collect_list.json b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_list.json new file mode 100644 index 0000000000000..c5bae4baef352 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_list.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "collect_list", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin new file mode 100644 index 0000000000000..e3827b9f650ae Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_list.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_collect_set.json b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_set.json new file mode 100644 index 0000000000000..615386d050e14 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_set.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "collect_set", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin new file mode 100644 index 
0000000000000..5fb97f27d25b6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_collect_set.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_concat.json b/connector/connect/common/src/test/resources/query-tests/queries/function_concat.json new file mode 100644 index 0000000000000..4a053d9c3c354 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_concat.json @@ -0,0 +1,55 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "concat", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 2 + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "sequence", + "arguments": [{ + "literal": { + "integer": 33 + } + }, { + "literal": { + "integer": 40 + } + }, { + "literal": { + "long": "1" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin new file mode 100644 index 0000000000000..e53eb7a75b8a2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_concat.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json b/connector/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json new file mode 100644 index 0000000000000..b9ba89b42185c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_concat_ws.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "concat_ws", + "arguments": [{ + "literal": { + "string": "-" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "string": "world" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin new file mode 100644 index 0000000000000..2fbc4f7090448 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_concat_ws.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_conv.json b/connector/connect/common/src/test/resources/query-tests/queries/function_conv.json new file mode 100644 index 0000000000000..c6734936bfcd1 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_conv.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "conv", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 10 + } + }, { + "literal": { + "integer": 16 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin new file mode 100644 index 0000000000000..373b997b79240 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_conv.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_corr.json b/connector/connect/common/src/test/resources/query-tests/queries/function_corr.json new file mode 100644 index 0000000000000..6fadb0385622b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_corr.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "corr", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin new file mode 100644 index 0000000000000..fdeeb4fd12d19 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_corr.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cos.json b/connector/connect/common/src/test/resources/query-tests/queries/function_cos.json new file mode 100644 index 0000000000000..f7072dff03404 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_cos.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "cos", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin new file mode 100644 index 0000000000000..09fd198b097c0 Binary files 
/dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_cos.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cosh.json b/connector/connect/common/src/test/resources/query-tests/queries/function_cosh.json new file mode 100644 index 0000000000000..3bcab61d37a0d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_cosh.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "cosh", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin new file mode 100644 index 0000000000000..54d5da8fabfa6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_cosh.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cot.json b/connector/connect/common/src/test/resources/query-tests/queries/function_cot.json new file mode 100644 index 0000000000000..62ce963fa8737 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_cot.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "cot", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin new file mode 100644 index 0000000000000..e79c32660a772 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_cot.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_count.json b/connector/connect/common/src/test/resources/query-tests/queries/function_count.json new file mode 100644 index 0000000000000..126a0ca242c52 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_count.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin new file mode 100644 index 0000000000000..6c87a809ad0c4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_count.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json b/connector/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json new file mode 100644 index 0000000000000..eb211ceb239aa --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_countDistinct.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }], + "isDistinct": true + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin new file mode 100644 index 0000000000000..591e2300ec689 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_countDistinct.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_count_typed.json b/connector/connect/common/src/test/resources/query-tests/queries/function_count_typed.json new file mode 100644 index 0000000000000..1c5df90b79cd1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_count_typed.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin new file mode 100644 index 0000000000000..44b613eb40c6f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_count_typed.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json new file mode 100644 index 0000000000000..3c4df70a5fbfc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_pop.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "covar_pop", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin new file mode 100644 index 0000000000000..4a7202f15e768 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_pop.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json new file mode 100644 index 0000000000000..7c723069e4671 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_samp.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "covar_samp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin new file mode 100644 index 0000000000000..ebff687730e35 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_covar_samp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_crc32.json b/connector/connect/common/src/test/resources/query-tests/queries/function_crc32.json new file mode 100644 index 0000000000000..1892a9af85d97 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_crc32.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "crc32", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, + "type": { + "binary": { + } + } + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin new file mode 100644 index 0000000000000..54ad14dedae4e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_crc32.proto.bin differ 
diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_csc.json b/connector/connect/common/src/test/resources/query-tests/queries/function_csc.json new file mode 100644 index 0000000000000..88504ed9c5280 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_csc.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "csc", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin new file mode 100644 index 0000000000000..0ed5022a73adf Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_csc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json b/connector/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json new file mode 100644 index 0000000000000..ac48841199075 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_cume_dist.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "cume_dist" + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin new file mode 100644 index 0000000000000..7578245aabe3a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_cume_dist.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_current_date.json b/connector/connect/common/src/test/resources/query-tests/queries/function_current_date.json new file mode 100644 index 0000000000000..6dab8c39d626c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_current_date.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "current_date" + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin new file mode 100644 index 0000000000000..f32c3f541c4c7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_current_date.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json new file mode 100644 index 0000000000000..16af5eb9ba084 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "current_timestamp" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin new file mode 100644 index 0000000000000..5a1f3de6c3a9a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_current_timestamp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_add.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_add.json new file mode 100644 index 0000000000000..f81ad3335242c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_add.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "date_add", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin new file mode 100644 index 0000000000000..f4dbc16b05c1d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_add.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_format.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_format.json new file mode 100644 index 0000000000000..9b3d469ed4e98 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_format.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "date_format", + "arguments": [{ + 
"unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "string": "yyyy-MM-dd" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin new file mode 100644 index 0000000000000..7226c20974b2a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_format.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_sub.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_sub.json new file mode 100644 index 0000000000000..f1dde0902a20a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_sub.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "date_sub", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin new file mode 100644 index 0000000000000..43b630c27ed45 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_sub.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json b/connector/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json new file mode 100644 index 0000000000000..363da9b9b9006 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_date_trunc.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "trunc", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }, { + "literal": { + "string": "minute" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin new file mode 100644 index 0000000000000..f037fb8d34a56 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_date_trunc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datediff.json b/connector/connect/common/src/test/resources/query-tests/queries/function_datediff.json new file mode 100644 index 0000000000000..b5ef560486d0d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_datediff.json @@ -0,0 +1,42 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + 
"schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "datediff", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "unresolvedFunction": { + "functionName": "make_date", + "arguments": [{ + "literal": { + "integer": 2020 + } + }, { + "literal": { + "integer": 10 + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin new file mode 100644 index 0000000000000..02e917b406838 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_datediff.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json new file mode 100644 index 0000000000000..3e453c1f7a652 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "dayofmonth", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin new file mode 100644 index 0000000000000..3a2973e21e5a0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofmonth.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json new file mode 100644 index 0000000000000..74715de151e77 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofweek.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "dayofweek", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin new file mode 100644 index 0000000000000..fceea203c790e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofweek.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json new file 
mode 100644 index 0000000000000..d23c6790a47dd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofyear.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "dayofyear", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin new file mode 100644 index 0000000000000..a526b449ae0a4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_dayofyear.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_days.json b/connector/connect/common/src/test/resources/query-tests/queries/function_days.json new file mode 100644 index 0000000000000..9e20c48729a30 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_days.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "days", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin new file mode 100644 index 0000000000000..b0a8472f8c4ff Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_days.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_decode.json b/connector/connect/common/src/test/resources/query-tests/queries/function_decode.json new file mode 100644 index 0000000000000..6be60808e64f3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_decode.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "decode", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, + "type": { + "binary": { + } + } + } + }, { + "literal": { + "string": "UTF-8" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin new file mode 100644 index 0000000000000..18b8bbcf6a01d Binary files /dev/null 
and b/connector/connect/common/src/test/resources/query-tests/queries/function_decode.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_degrees.json b/connector/connect/common/src/test/resources/query-tests/queries/function_degrees.json new file mode 100644 index 0000000000000..e096b07e4dc6e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_degrees.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "degrees", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin new file mode 100644 index 0000000000000..e2d264bb2e108 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_degrees.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json b/connector/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json new file mode 100644 index 0000000000000..46c5e1eaddfc0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_dense_rank.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "dense_rank" + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin new file mode 100644 index 0000000000000..4597e63be8379 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_dense_rank.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_desc.json b/connector/connect/common/src/test/resources/query-tests/queries/function_desc.json new file mode 100644 index 0000000000000..0841b33b8fb69 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_desc.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_LAST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_desc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_desc.proto.bin new file mode 100644 index 0000000000000..bd549431a832c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_desc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_first.json b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_first.json new file mode 100644 index 0000000000000..683de2af2388a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_first.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_first.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_first.proto.bin new file mode 100644 index 0000000000000..b46e09d6ef3d1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_first.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_last.json b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_last.json new file mode 100644 index 0000000000000..0841b33b8fb69 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_last.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_LAST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_last.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_last.proto.bin new file mode 100644 index 0000000000000..bd549431a832c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_desc_nulls_last.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_element_at.json b/connector/connect/common/src/test/resources/query-tests/queries/function_element_at.json new file mode 100644 index 0000000000000..ef5551440934c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_element_at.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "element_at", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "literal": { + "string": "bob" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin new file mode 100644 index 0000000000000..993818c6cb4bf Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_element_at.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_encode.json b/connector/connect/common/src/test/resources/query-tests/queries/function_encode.json new file mode 100644 index 0000000000000..92e95f2c946d0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_encode.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "encode", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "UTF-8" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin new file mode 100644 index 0000000000000..9644825af470b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_encode.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_exists.json b/connector/connect/common/src/test/resources/query-tests/queries/function_exists.json new file mode 100644 index 0000000000000..76d107092ae1e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_exists.json @@ -0,0 +1,45 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "exists", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + 
"unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin new file mode 100644 index 0000000000000..27fbc03c69880 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_exists.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_exp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_exp.json new file mode 100644 index 0000000000000..d317efef75eee --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_exp.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "exp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin new file mode 100644 index 0000000000000..7def20c94df00 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_exp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_explode.json b/connector/connect/common/src/test/resources/query-tests/queries/function_explode.json new file mode 100644 index 0000000000000..35ad40ccdd04f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_explode.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "explode", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin new file mode 100644 index 0000000000000..9c15f942bb11d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_explode.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json b/connector/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json new file mode 100644 index 0000000000000..efd7f4b524d47 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_explode_outer.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "explode_outer", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin new file mode 100644 index 0000000000000..9f2cf9554dd15 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_explode_outer.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_expm1.json b/connector/connect/common/src/test/resources/query-tests/queries/function_expm1.json new file mode 100644 index 0000000000000..d425a6de709b7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_expm1.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "expm1", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin new file mode 100644 index 0000000000000..3c310cb04ce3d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_expm1.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_expr.json b/connector/connect/common/src/test/resources/query-tests/queries/function_expr.json new file mode 100644 index 0000000000000..99c69b8e8905b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_expr.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "expressionString": { + "expression": "a + 1" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_expr.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_expr.proto.bin new file mode 100644 index 0000000000000..2e59d436bc811 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_expr.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_factorial.json b/connector/connect/common/src/test/resources/query-tests/queries/function_factorial.json new file mode 100644 index 0000000000000..7f13a10480915 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_factorial.json @@ -0,0 +1,34 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "factorial", + "arguments": [{ + "unresolvedFunction": { + "functionName": "%", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin new file mode 100644 index 0000000000000..ac776eb60d2b0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_factorial.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_filter.json b/connector/connect/common/src/test/resources/query-tests/queries/function_filter.json new file mode 100644 index 0000000000000..f6b565324b8ba --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_filter.json @@ -0,0 +1,45 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "filter", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin new file mode 100644 index 0000000000000..a53c554598662 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_filter.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json b/connector/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json new file mode 100644 index 0000000000000..1d9667c88901f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.json @@ -0,0 +1,65 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "filter", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "and", + "arguments": [{ + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin new file mode 100644 index 0000000000000..5b7db291cc37f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_filter_with_pair_input.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_first.json b/connector/connect/common/src/test/resources/query-tests/queries/function_first.json new file mode 100644 index 0000000000000..dc33bad3c506a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_first.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "first", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "boolean": true + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_first.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_first.proto.bin new file mode 100644 index 0000000000000..cb029dfd26be9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_first.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_flatten.json b/connector/connect/common/src/test/resources/query-tests/queries/function_flatten.json new file mode 100644 index 0000000000000..32da97271d2dd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_flatten.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "flatten", + "arguments": [{ + "unresolvedFunction": { + "functionName": 
"array", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedFunction": { + "functionName": "sequence", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 10 + } + }, { + "literal": { + "long": "1" + } + }] + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin new file mode 100644 index 0000000000000..e6bb018a37005 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_flatten.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_floor.json b/connector/connect/common/src/test/resources/query-tests/queries/function_floor.json new file mode 100644 index 0000000000000..78924f5f33627 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_floor.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "floor", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin new file mode 100644 index 0000000000000..b52696ca4d00a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_floor.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json b/connector/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json new file mode 100644 index 0000000000000..394621e4dd314 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_floor_scale.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "floor", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin new file mode 100644 index 0000000000000..ee0665bab644c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_floor_scale.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_forall.json b/connector/connect/common/src/test/resources/query-tests/queries/function_forall.json new file mode 100644 index 
0000000000000..93134aba0fa9c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_forall.json @@ -0,0 +1,45 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "forall", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "\u003e", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "literal": { + "integer": 10 + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin new file mode 100644 index 0000000000000..3199c758c04ac Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_forall.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_format_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_format_number.json new file mode 100644 index 0000000000000..daa648c0a599e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_format_number.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "format_number", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin new file mode 100644 index 0000000000000..81e2c4d5fd54d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_format_number.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_csv.json b/connector/connect/common/src/test/resources/query-tests/queries/function_from_csv.json new file mode 100644 index 0000000000000..798e79e6618f5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_from_csv.json @@ -0,0 +1,42 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "from_csv", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" 
+ } + }, { + "literal": { + "string": "id BIGINT,a INT,b DOUBLE" + } + }, { + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "literal": { + "string": "mode" + } + }, { + "literal": { + "string": "FAILFAST" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin new file mode 100644 index 0000000000000..8acd3b619b41e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_from_csv.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_json.json b/connector/connect/common/src/test/resources/query-tests/queries/function_from_json.json new file mode 100644 index 0000000000000..ddfa91abca05e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_from_json.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "from_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "{\"type\":\"struct\",\"fields\":[{\"name\":\"id\",\"type\":\"long\",\"nullable\":true,\"metadata\":{}},{\"name\":\"a\",\"type\":\"integer\",\"nullable\":true,\"metadata\":{}},{\"name\":\"b\",\"type\":\"double\",\"nullable\":true,\"metadata\":{}}]}" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin new file mode 100644 index 0000000000000..ad95d0f2b343d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_from_json.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json b/connector/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json new file mode 100644 index 0000000000000..81d6608adb18f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "from_unixtime", + "arguments": [{ + "literal": { + "long": "1" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin new file mode 100644 index 0000000000000..b1c34caaf62f0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_from_unixtime.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json new file mode 100644 index 0000000000000..5d63fd829f302 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "from_utc_timestamp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }, { + "literal": { + "string": "-08:00" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin new file mode 100644 index 0000000000000..34bf9c64f3a97 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_from_utc_timestamp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_get.json b/connector/connect/common/src/test/resources/query-tests/queries/function_get.json new file mode 100644 index 0000000000000..7a2a89447c079 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_get.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "get", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin new file mode 100644 index 0000000000000..be40df955a407 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_get.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json b/connector/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json new file mode 100644 index 0000000000000..17adf9230a6eb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_get_json_object.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "get_json_object", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "$.device_type" + } + }] + } + }] + 
} +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin new file mode 100644 index 0000000000000..08ad8f4f91bad Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_get_json_object.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_greatest.json b/connector/connect/common/src/test/resources/query-tests/queries/function_greatest.json new file mode 100644 index 0000000000000..bf5d50edec84f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_greatest.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "greatest", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedExtractValue": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, + "extraction": { + "literal": { + "string": "a" + } + } + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin new file mode 100644 index 0000000000000..44d9d5f8cfb2d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_greatest.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hash.json b/connector/connect/common/src/test/resources/query-tests/queries/function_hash.json new file mode 100644 index 0000000000000..6ef504a006457 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hash.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "hash", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin new file mode 100644 index 0000000000000..284700c4c5ea9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_hash.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hex.json b/connector/connect/common/src/test/resources/query-tests/queries/function_hex.json new file mode 100644 index 0000000000000..af9d0dd298277 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_hex.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "hex", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin new file mode 100644 index 0000000000000..9d8c3b5e23584 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_hex.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hour.json b/connector/connect/common/src/test/resources/query-tests/queries/function_hour.json new file mode 100644 index 0000000000000..2621b9f81913c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hour.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "hour", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin new file mode 100644 index 0000000000000..6cdb50364c133 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_hour.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json new file mode 100644 index 0000000000000..a72a8656362fd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "hours", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin new file mode 100644 index 0000000000000..6e8203b89e320 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_hours.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hypot.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_hypot.json new file mode 100644 index 0000000000000..2d0d6be0164bc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_hypot.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "hypot", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin new file mode 100644 index 0000000000000..3ad07a2a1ee45 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_hypot.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_initcap.json b/connector/connect/common/src/test/resources/query-tests/queries/function_initcap.json new file mode 100644 index 0000000000000..896bb3d0209da --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_initcap.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "initcap", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin new file mode 100644 index 0000000000000..72df35bd9b387 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_initcap.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_inline.json b/connector/connect/common/src/test/resources/query-tests/queries/function_inline.json new file mode 100644 index 0000000000000..4abdac736d0fe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_inline.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "inline", + "arguments": [{ + "unresolvedFunction": { + "functionName": "map_values", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin new file mode 100644 index 0000000000000..261e28e3acaa8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_inline.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json b/connector/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json new file mode 100644 index 0000000000000..d74ee83eeff3e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_inline_outer.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "inline_outer", + "arguments": [{ + "unresolvedFunction": { + "functionName": "map_values", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin new file mode 100644 index 0000000000000..d757e5afe3050 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_inline_outer.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json b/connector/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json new file mode 100644 index 0000000000000..47f2e461eba46 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_input_file_name.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "input_file_name" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin new file mode 100644 index 0000000000000..c3c6414d5d881 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_input_file_name.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_isnan.json b/connector/connect/common/src/test/resources/query-tests/queries/function_isnan.json new file mode 100644 index 0000000000000..f594918ed930a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_isnan.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "isNaN", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin new file mode 100644 index 0000000000000..1030abda5b8c2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_isnan.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_isnull.json b/connector/connect/common/src/test/resources/query-tests/queries/function_isnull.json new file mode 100644 index 0000000000000..7443fc97f42cf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_isnull.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "isNull", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin new file mode 100644 index 0000000000000..3d1fbd4dedfe7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_isnull.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json b/connector/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json new file mode 100644 index 0000000000000..32de63452c364 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_json_tuple.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "json_tuple", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "a" + } + }, { + "literal": { + "string": "b" + } + }, { + "literal": { + "string": "id" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin new file mode 100644 index 0000000000000..e51be42b38d34 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_json_tuple.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json b/connector/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json new file mode 100644 index 0000000000000..7399d7a6da388 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_kurtosis.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "kurtosis", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin new file mode 100644 index 0000000000000..848a4842e2462 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_kurtosis.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lag.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lag.json new file mode 100644 index 0000000000000..dd1cba376f3c7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lag.json @@ -0,0 +1,58 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "lag", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 1 + } + }, { + "literal": { + "null": { + "null": { + } + } + } + }, { + "literal": { + "boolean": true + } + }] + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin new file mode 100644 index 0000000000000..7fd85861fb8c8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lag.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_last.json b/connector/connect/common/src/test/resources/query-tests/queries/function_last.json new file mode 100644 index 0000000000000..f26e5887ed527 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_last.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "last", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "boolean": false + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_last.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_last.proto.bin new file mode 100644 index 0000000000000..69221737be671 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_last.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_last_day.json b/connector/connect/common/src/test/resources/query-tests/queries/function_last_day.json new file mode 100644 index 0000000000000..2cb1635caf47e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_last_day.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "last_day", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin new file mode 100644 index 0000000000000..1afb5c02ae347 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_last_day.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lead.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lead.json new file mode 100644 index 0000000000000..ef76586d381dd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lead.json @@ -0,0 +1,55 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "lead", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 2 + } + }, { + "literal": { + "string": "dv" + } + }, { + "literal": { + "boolean": true + } + }] + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin new file mode 
100644 index 0000000000000..9bcdcdb3617a9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lead.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_least.json b/connector/connect/common/src/test/resources/query-tests/queries/function_least.json new file mode 100644 index 0000000000000..403531c9f6958 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_least.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "least", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedExtractValue": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, + "extraction": { + "literal": { + "string": "a" + } + } + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin new file mode 100644 index 0000000000000..c9ead802a9616 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_least.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_length.json b/connector/connect/common/src/test/resources/query-tests/queries/function_length.json new file mode 100644 index 0000000000000..f2c3c69255897 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_length.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "length", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin new file mode 100644 index 0000000000000..a14f94085b3b6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_length.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json new file mode 100644 index 0000000000000..10caaf184fee5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "levenshtein", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "bob" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin new file mode 100644 index 0000000000000..75b48541b7663 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_levenshtein.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lit.json new file mode 100644 index 0000000000000..03924866a2681 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lit.json @@ -0,0 +1,139 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "literal": { + "boolean": true + } + }, { + "literal": { + "byte": 68 + } + }, { + "literal": { + "short": 9872 + } + }, { + "literal": { + "integer": -8726532 + } + }, { + "literal": { + "long": "7834609328726532" + } + }, { + "literal": { + "double": 2.718281828459045 + } + }, { + "literal": { + "float": -0.8 + } + }, { + "literal": { + "decimal": { + "value": "89.97620", + "precision": 7, + "scale": 5 + } + } + }, { + "literal": { + "decimal": { + "value": "89889.7667231", + "precision": 12, + "scale": 7 + } + } + }, { + "literal": { + "string": "connect!" 
+ } + }, { + "literal": { + "string": "T" + } + }, { + "literal": { + "string": "ABCDEFGHIJ" + } + }, { + "literal": { + "binary": "eHl6e3x9fn+AgYKDhIWGh4iJiouMjY4=" + } + }, { + "literal": { + "binary": "CAY=" + } + }, { + "literal": { + "null": { + "null": { + } + } + } + }, { + "literal": { + "date": 18545 + } + }, { + "literal": { + "decimal": { + "value": "8.997620", + "precision": 7, + "scale": 6 + } + } + }, { + "literal": { + "timestamp": "1677155519808000" + } + }, { + "literal": { + "timestamp": "12345000" + } + }, { + "literal": { + "timestampNtz": "1677184560000000" + } + }, { + "literal": { + "date": 19411 + } + }, { + "literal": { + "dayTimeInterval": "200000000" + } + }, { + "literal": { + "yearMonthInterval": 0 + } + }, { + "literal": { + "calendarInterval": { + "months": 2, + "days": 20, + "microseconds": "100" + } + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lit.proto.bin new file mode 100644 index 0000000000000..fc86c71e28c46 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lit.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json new file mode 100644 index 0000000000000..c9441c9e77ce7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.json @@ -0,0 +1,461 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "literal": { + "array": { + "elementType": { + "double": { + } + } + } + } + }, { + "literal": { + "array": { + "elementType": { + "array": { + "elementType": { + "integer": { + } + }, + "containsNull": true + } + }, + "element": [{ + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": 1 + }] + } + }, { + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": 2 + }] + } + }, { + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": 3 + }] + } + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "array": { + "elementType": { + "array": { + "elementType": { + "integer": { + } + }, + "containsNull": true + } + }, + "containsNull": true + } + }, + "element": [{ + "array": { + "elementType": { + "array": { + "elementType": { + "integer": { + } + }, + "containsNull": true + } + }, + "element": [{ + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": 1 + }] + } + }] + } + }, { + "array": { + "elementType": { + "array": { + "elementType": { + "integer": { + } + }, + "containsNull": true + } + }, + "element": [{ + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": 2 + }] + } + }] + } + }, { + "array": { + "elementType": { + "array": { + "elementType": { + "integer": { + } + }, + "containsNull": true + } + }, + "element": [{ + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": 3 + }] + } + }] + } + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "boolean": { + } + }, + "element": [{ + "boolean": true + }, { + "boolean": false + }] + } + } + }, { + "literal": { + "binary": "Q0RF" + } 
+ }, { + "literal": { + "array": { + "elementType": { + "short": { + } + }, + "element": [{ + "short": 9872 + }, { + "short": 9873 + }, { + "short": 9874 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "integer": { + } + }, + "element": [{ + "integer": -8726532 + }, { + "integer": 8726532 + }, { + "integer": -8726533 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "long": { + } + }, + "element": [{ + "long": "7834609328726531" + }, { + "long": "7834609328726532" + }, { + "long": "7834609328726533" + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "double": { + } + }, + "element": [{ + "double": 2.718281828459045 + }, { + "double": 1.0 + }, { + "double": 2.0 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "float": { + } + }, + "element": [{ + "float": -0.8 + }, { + "float": -0.7 + }, { + "float": -0.9 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "decimal": { + "scale": 18, + "precision": 38 + } + }, + "element": [{ + "decimal": { + "value": "89.97620", + "precision": 7, + "scale": 5 + } + }, { + "decimal": { + "value": "89.97621", + "precision": 7, + "scale": 5 + } + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "decimal": { + "scale": 18, + "precision": 38 + } + }, + "element": [{ + "decimal": { + "value": "89889.7667231", + "precision": 12, + "scale": 7 + } + }, { + "decimal": { + "value": "89889.7667231", + "precision": 12, + "scale": 7 + } + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "string": { + } + }, + "element": [{ + "string": "connect!" + }, { + "string": "disconnect!" + }] + } + } + }, { + "literal": { + "string": "TF" + } + }, { + "literal": { + "array": { + "elementType": { + "string": { + } + }, + "element": [{ + "string": "ABCDEFGHIJ" + }, { + "string": "BCDEFGHIJK" + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "date": { + } + }, + "element": [{ + "date": 18545 + }, { + "date": 18546 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "timestamp": { + } + }, + "element": [{ + "timestamp": "1677155519808000" + }, { + "timestamp": "1677155519809000" + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "timestamp": { + } + }, + "element": [{ + "timestamp": "12345000" + }, { + "timestamp": "23456000" + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "timestampNtz": { + } + }, + "element": [{ + "timestampNtz": "1677184560000000" + }, { + "timestampNtz": "1677188160000000" + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "date": { + } + }, + "element": [{ + "date": 19411 + }, { + "date": 19417 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "dayTimeInterval": { + "startField": 0, + "endField": 3 + } + }, + "element": [{ + "dayTimeInterval": "100000000" + }, { + "dayTimeInterval": "200000000" + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "yearMonthInterval": { + "startField": 0, + "endField": 1 + } + }, + "element": [{ + "yearMonthInterval": 0 + }, { + "yearMonthInterval": 0 + }] + } + } + }, { + "literal": { + "array": { + "elementType": { + "calendarInterval": { + } + }, + "element": [{ + "calendarInterval": { + "months": 2, + "days": 20, + "microseconds": "100" + } + }, { + "calendarInterval": { + "months": 2, + "days": 21, + "microseconds": "200" + } + }] + } + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin new file mode 100644 index 0000000000000..9763bed6b502a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lit_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json new file mode 100644 index 0000000000000..68281d2e6d9d1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "localtimestamp" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin new file mode 100644 index 0000000000000..b1a9e70c7c802 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_localtimestamp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_locate.json b/connector/connect/common/src/test/resources/query-tests/queries/function_locate.json new file mode 100644 index 0000000000000..7939fdd2c7559 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_locate.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "locate", + "arguments": [{ + "literal": { + "string": "jar" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin new file mode 100644 index 0000000000000..cc7ced9957a52 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_locate.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json b/connector/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json new file mode 100644 index 0000000000000..269f39701608a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "locate", + "arguments": [{ + "literal": { + "string": "jar" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin new file mode 100644 index 0000000000000..162ab0108c132 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_locate_with_pos.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log.json b/connector/connect/common/src/test/resources/query-tests/queries/function_log.json new file mode 100644 index 0000000000000..1b2d0ed0b1447 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_log.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "log", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin new file mode 100644 index 0000000000000..548fb480dd27e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_log.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log10.json b/connector/connect/common/src/test/resources/query-tests/queries/function_log10.json new file mode 100644 index 0000000000000..13292d83c4727 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_log10.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "log10", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin new file mode 100644 index 0000000000000..22d4655a6efbd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_log10.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log1p.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_log1p.json new file mode 100644 index 0000000000000..4e9e6847c3c36 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_log1p.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "log1p", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin new file mode 100644 index 0000000000000..9a72c377b0cc4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_log1p.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log2.json b/connector/connect/common/src/test/resources/query-tests/queries/function_log2.json new file mode 100644 index 0000000000000..ec29e154a0e1d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_log2.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "log2", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin new file mode 100644 index 0000000000000..34e3780650540 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_log2.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json b/connector/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json new file mode 100644 index 0000000000000..6bc2a4ec3335a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_log_with_base.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "log", + "arguments": [{ + "literal": { + "double": 2.0 + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin new file mode 100644 index 0000000000000..2e64e15ed5555 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_log_with_base.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lower.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lower.json new file mode 100644 index 0000000000000..f7fe5beba2c02 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lower.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "lower", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin new file mode 100644 index 0000000000000..7c736d93f7729 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lower.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lpad.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad.json new file mode 100644 index 0000000000000..b9f3e6700bfa4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "lpad", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 10 + } + }, { + "literal": { + "string": "-" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin new file mode 100644 index 0000000000000..470ab1cc44add Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json new file mode 100644 index 0000000000000..aeb39ba09ad20 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,bytes:binary\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "lpad", + "arguments": [{ + 
"unresolvedAttribute": { + "unparsedIdentifier": "bytes" + } + }, { + "literal": { + "integer": 5 + } + }, { + "literal": { + "binary": "DAoPDg==" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin new file mode 100644 index 0000000000000..b4acebb394c7a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_lpad_binary.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim.json b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim.json new file mode 100644 index 0000000000000..dd3b459520221 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "ltrim", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin new file mode 100644 index 0000000000000..162b6a7337bb9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json new file mode 100644 index 0000000000000..3c4825792dc3c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "ltrim", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "xxx" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin new file mode 100644 index 0000000000000..13455d7091e9f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_ltrim_with_pattern.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_make_date.json b/connector/connect/common/src/test/resources/query-tests/queries/function_make_date.json new file mode 100644 index 0000000000000..a363298dd123a --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_make_date.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "make_date", + "arguments": [{ + "literal": { + "integer": 2018 + } + }, { + "literal": { + "integer": 5 + } + }, { + "literal": { + "integer": 14 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin new file mode 100644 index 0000000000000..0526825fccade Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_make_date.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map.json new file mode 100644 index 0000000000000..ca9d3bf2bcc71 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 22 + } + }, { + "literal": { + "string": "dummy" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin new file mode 100644 index 0000000000000..229a48b75131d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_concat.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_concat.json new file mode 100644 index 0000000000000..f56f6cee20ab0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_concat.json @@ -0,0 +1,66 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_concat", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "literal": { + "string": "foo" + } + }, { + "unresolvedFunction": { + "functionName": "struct", + "arguments": [{ + "alias": { + "expr": { + "literal": { + "long": "12" + } + }, + "name": ["id"] + } + 
}, { + "alias": { + "expr": { + "literal": { + "integer": 68 + } + }, + "name": ["a"] + } + }, { + "alias": { + "expr": { + "literal": { + "double": 2.718281828459045 + } + }, + "name": ["b"] + } + }] + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin new file mode 100644 index 0000000000000..0a76d3a1193ea Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_concat.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json new file mode 100644 index 0000000000000..56833f9651023 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_contains_key", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "literal": { + "string": "xyz" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin new file mode 100644 index 0000000000000..e517479020e16 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_contains_key.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_entries.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_entries.json new file mode 100644 index 0000000000000..0226506545010 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_entries.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_entries", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin new file mode 100644 index 0000000000000..f1451d4ad7ba4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_entries.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_filter.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_filter.json new file mode 
100644 index 0000000000000..5099377a52a06 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_filter.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_filter", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "contains", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "literal": { + "string": "baz" + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin new file mode 100644 index 0000000000000..fac64e79a5bf0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_filter.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json new file mode 100644 index 0000000000000..1eb1f7d2ef066 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_from_arrays", + "arguments": [{ + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 2 + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "literal": { + "string": "one" + } + }, { + "literal": { + "string": "two" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin new file mode 100644 index 0000000000000..f5333b1c882bc Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_arrays.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json new file mode 100644 index 0000000000000..399ba8d1021bf --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.json @@ -0,0 +1,52 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_from_entries", + "arguments": [{ + "unresolvedFunction": { + "functionName": "transform", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "struct", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }, { + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin new file mode 100644 index 0000000000000..2938c84f77116 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_from_entries.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_keys.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_keys.json new file mode 100644 index 0000000000000..5af013295cd9f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_keys.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_keys", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin new file mode 100644 index 0000000000000..ee19968bacc2c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_keys.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_values.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_values.json new file mode 100644 index 0000000000000..3c5eb651801dc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_values.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_values", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin new file mode 100644 index 0000000000000..4cd7c488ada48 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json b/connector/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json new file mode 100644 index 0000000000000..9d035545eb313 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.json @@ -0,0 +1,71 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "map_zip_with", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "+", + "arguments": [{ + "unresolvedExtractValue": { + "child": { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }, + "extraction": { + "literal": { + "string": "id" + } + } + } + }, { + "unresolvedExtractValue": { + "child": { + "unresolvedNamedLambdaVariable": { + "nameParts": ["z"] + } + }, + "extraction": { + "literal": { + "string": "id" + } + } + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }, { + "nameParts": ["z"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin new file mode 100644 index 0000000000000..f14eb1a3c93d3 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_map_zip_with.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_max.json b/connector/connect/common/src/test/resources/query-tests/queries/function_max.json new file mode 100644 index 0000000000000..b23dd9d14c643 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_max.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin new file mode 100644 index 0000000000000..788c9539b5767 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_max.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_max_by.json b/connector/connect/common/src/test/resources/query-tests/queries/function_max_by.json new file mode 
100644 index 0000000000000..da311e340cc50 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_max_by.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "max_by", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin new file mode 100644 index 0000000000000..284c2453af8bd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_max_by.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_md5.json b/connector/connect/common/src/test/resources/query-tests/queries/function_md5.json new file mode 100644 index 0000000000000..e8718594b0be3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_md5.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "md5", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, + "type": { + "binary": { + } + } + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin new file mode 100644 index 0000000000000..d3ec7c26a2ede Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_md5.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_median.json b/connector/connect/common/src/test/resources/query-tests/queries/function_median.json new file mode 100644 index 0000000000000..7331454b9ecb0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_median.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "median", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin new 
file mode 100644 index 0000000000000..59533e5be5992 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_median.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_min.json b/connector/connect/common/src/test/resources/query-tests/queries/function_min.json new file mode 100644 index 0000000000000..1b7266b6774e4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_min.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin new file mode 100644 index 0000000000000..b82f4c5309222 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_min.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_min_by.json b/connector/connect/common/src/test/resources/query-tests/queries/function_min_by.json new file mode 100644 index 0000000000000..d2478f5e81abe --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_min_by.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "min_by", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin new file mode 100644 index 0000000000000..ddc642b95000c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_min_by.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_minute.json b/connector/connect/common/src/test/resources/query-tests/queries/function_minute.json new file mode 100644 index 0000000000000..7c749cdff82f5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_minute.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "minute", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }] + } + }] + } +} \ 
No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin new file mode 100644 index 0000000000000..e81b7dad85331 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_minute.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_mode.json b/connector/connect/common/src/test/resources/query-tests/queries/function_mode.json new file mode 100644 index 0000000000000..8e8183e9e0883 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_mode.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "mode", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin new file mode 100644 index 0000000000000..dca0953a387b1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_mode.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json b/connector/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json new file mode 100644 index 0000000000000..0a14f1008976e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "monotonically_increasing_id" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin new file mode 100644 index 0000000000000..724ce3ac6904c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_monotonically_increasing_id.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_month.json b/connector/connect/common/src/test/resources/query-tests/queries/function_month.json new file mode 100644 index 0000000000000..7ea1e5d0375e9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_month.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "month", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin new file mode 100644 index 0000000000000..b97100a6fe2ec Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_month.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months.json b/connector/connect/common/src/test/resources/query-tests/queries/function_months.json new file mode 100644 index 0000000000000..278bab76a6544 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "months", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin new file mode 100644 index 0000000000000..fdcd96750dc9c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_months.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months_between.json b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between.json new file mode 100644 index 0000000000000..0fa772d26cd41 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "months_between", + "arguments": [{ + "unresolvedFunction": { + "functionName": "current_date" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin new file mode 100644 index 0000000000000..22ddc1813e0fb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json new file mode 100644 index 
0000000000000..d11bfbd7f2426 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "months_between", + "arguments": [{ + "unresolvedFunction": { + "functionName": "current_date" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "boolean": true + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin new file mode 100644 index 0000000000000..bf9c545911ffd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_months_between_with_roundoff.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_nanvl.json b/connector/connect/common/src/test/resources/query-tests/queries/function_nanvl.json new file mode 100644 index 0000000000000..69daab270c2b9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_nanvl.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "nanvl", + "arguments": [{ + "literal": { + "double": "NaN" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin new file mode 100644 index 0000000000000..f314a73dcae65 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_nanvl.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_negate.json b/connector/connect/common/src/test/resources/query-tests/queries/function_negate.json new file mode 100644 index 0000000000000..e269fabe44be1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_negate.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "negative", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin new file mode 100644 index 0000000000000..9c56c111ceee6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_negate.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_next_day.json b/connector/connect/common/src/test/resources/query-tests/queries/function_next_day.json new file mode 100644 index 0000000000000..486523dcad3ec --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_next_day.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "next_day", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "string": "Mon" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin new file mode 100644 index 0000000000000..a97bd75f129db Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_next_day.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_nth_value.json b/connector/connect/common/src/test/resources/query-tests/queries/function_nth_value.json new file mode 100644 index 0000000000000..4c764a5d5603c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_nth_value.json @@ -0,0 +1,51 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "nth_value", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 3 + } + }, { + "literal": { + "boolean": true + } + }] + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin new file mode 100644 index 0000000000000..f87e1695f22e3 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_nth_value.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ntile.json b/connector/connect/common/src/test/resources/query-tests/queries/function_ntile.json new file mode 100644 index 0000000000000..2346a788b64bd --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_ntile.json @@ -0,0 +1,43 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "ntile", + "arguments": [{ + "literal": { + "integer": 4 + } + }] + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin new file mode 100644 index 0000000000000..d9ccd2e8a6007 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_ntile.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_octet_length.json b/connector/connect/common/src/test/resources/query-tests/queries/function_octet_length.json new file mode 100644 index 0000000000000..7be9ac82662a4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_octet_length.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "octet_length", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin new file mode 100644 index 0000000000000..484ebbb6487b0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_octet_length.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_overlay.json b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay.json new file mode 100644 index 0000000000000..b580570f923a6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "overlay", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "string": "foo" + } + }, { + "literal": { + "integer": 4 + } + }] + } 
+ }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin new file mode 100644 index 0000000000000..2110ae9c14610 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json new file mode 100644 index 0000000000000..99d5426c46fba --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "overlay", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "string": "foo" + } + }, { + "literal": { + "integer": 4 + } + }, { + "literal": { + "string": "3" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin new file mode 100644 index 0000000000000..9a09d28d84fde Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_overlay_with_len.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json b/connector/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json new file mode 100644 index 0000000000000..d8778ec8cd81d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_percent_rank.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "percent_rank" + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin new file mode 100644 index 0000000000000..d668f7e1504cb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_percent_rank.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json new file mode 100644 index 0000000000000..6289464de2a37 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "percentile_approx", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "double": 0.3 + } + }, { + "literal": { + "integer": 20 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin new file mode 100644 index 0000000000000..f44ec86888f6c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_percentile_approx.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_pmod.json b/connector/connect/common/src/test/resources/query-tests/queries/function_pmod.json new file mode 100644 index 0000000000000..1dc2cb54cbb67 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_pmod.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "pmod", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin new file mode 100644 index 0000000000000..a2bb94dbb5173 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_pmod.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode.json b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode.json new file mode 100644 index 0000000000000..f8a9db37e62be --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "posexplode", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin new file mode 100644 index 0000000000000..fc50f5f4c85b7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json new file mode 100644 index 0000000000000..0e8cd4c1509e1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "posexplode_outer", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin new file mode 100644 index 0000000000000..19d700665e7f5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_posexplode_outer.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_pow.json b/connector/connect/common/src/test/resources/query-tests/queries/function_pow.json new file mode 100644 index 0000000000000..187636fb360c6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_pow.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "power", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin new file mode 100644 index 0000000000000..6e1d3b06fe87a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_pow.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_product.json b/connector/connect/common/src/test/resources/query-tests/queries/function_product.json new file mode 100644 index 0000000000000..1dfb7f81912d3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_product.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "product", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin new file mode 100644 index 0000000000000..8c3fbd31eb6b3 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_product.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_quarter.json b/connector/connect/common/src/test/resources/query-tests/queries/function_quarter.json new file mode 100644 index 0000000000000..b95867e0be963 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_quarter.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "quarter", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin new file mode 100644 index 0000000000000..fdc2d96fb08ca Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_quarter.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_radians.json b/connector/connect/common/src/test/resources/query-tests/queries/function_radians.json new file mode 100644 index 0000000000000..837960dedc653 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_radians.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "radians", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin new file mode 100644 index 0000000000000..33a2521b22ac9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_radians.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_raise_error.json b/connector/connect/common/src/test/resources/query-tests/queries/function_raise_error.json new file mode 100644 index 0000000000000..5318466706bd8 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/function_raise_error.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "raise_error", + "arguments": [{ + "literal": { + "string": "kaboom" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin new file mode 100644 index 0000000000000..7fbd33b9869ca Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_raise_error.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json new file mode 100644 index 0000000000000..453ea54bd0ef3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "rand", + "arguments": [{ + "literal": { + "long": "133" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin new file mode 100644 index 0000000000000..566a49d641293 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rand_with_seed.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json b/connector/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json new file mode 100644 index 0000000000000..ef84f05c3e193 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "randn", + "arguments": [{ + "literal": { + "long": "133" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin new file mode 100644 index 0000000000000..b0064842bf308 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_randn_with_seed.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rank.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rank.json new file mode 100644 index 0000000000000..93c8dc38d668a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rank.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "rank" + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin new file mode 100644 index 0000000000000..3aef331fb1739 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rank.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json new file mode 100644 index 0000000000000..5d9c7a5b4a5ab --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "regexp_extract", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "(\\d+)-(\\d+)" + } + }, { + "literal": { + "integer": 1 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin new file mode 100644 index 0000000000000..32ba8b6dcb5e9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_extract.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json new file mode 100644 index 0000000000000..83dd7a8569fd4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "regexp_replace", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "(\\d+)" + } + }, { + "literal": { + "string": "XXX" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin new file mode 100644 index 0000000000000..b7d3fde25cf85 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_regexp_replace.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_reverse.json b/connector/connect/common/src/test/resources/query-tests/queries/function_reverse.json new file mode 100644 index 0000000000000..93869adfbedca --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_reverse.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "reverse", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin new file mode 100644 index 0000000000000..dd7f2d5de513d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_reverse.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rint.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rint.json new file mode 100644 index 0000000000000..ea5bcebf81d72 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rint.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "rint", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin new file mode 100644 index 0000000000000..bd47adc8476fa Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rint.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_round.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_round.json new file mode 100644 index 0000000000000..585a0befb224d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_round.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "round", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin new file mode 100644 index 0000000000000..8625ccb1a58f1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_round.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_row_number.json b/connector/connect/common/src/test/resources/query-tests/queries/function_row_number.json new file mode 100644 index 0000000000000..3d5ac8afe3db3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_row_number.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "row_number" + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin new file mode 100644 index 0000000000000..90b4fcb27d3f1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_row_number.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rpad.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad.json new file mode 100644 index 0000000000000..d9b78a0cfd7a9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "rpad", + "arguments": [{ + "unresolvedAttribute": { + 
"unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 10 + } + }, { + "literal": { + "string": "-" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin new file mode 100644 index 0000000000000..d4c355afee0b7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json new file mode 100644 index 0000000000000..0daaf1636f13d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,bytes:binary\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "rpad", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "bytes" + } + }, { + "literal": { + "integer": 5 + } + }, { + "literal": { + "binary": "CwoLDg==" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin new file mode 100644 index 0000000000000..c6f9f22146c61 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rpad_binary.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim.json new file mode 100644 index 0000000000000..5fe66e8e33596 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "rtrim", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin new file mode 100644 index 0000000000000..4320bf6ac397c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json new file mode 100644 index 0000000000000..d4c3c0ca68eb2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + 
"localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "rtrim", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "yyy" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin new file mode 100644 index 0000000000000..37f4782f46161 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_rtrim_with_pattern.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json new file mode 100644 index 0000000000000..6df6438a1a9ca --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "schema_of_csv", + "arguments": [{ + "literal": { + "string": "1|abc" + } + }, { + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "literal": { + "string": "sep" + } + }, { + "literal": { + "string": "|" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin new file mode 100644 index 0000000000000..99475ddf30d11 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_csv.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json new file mode 100644 index 0000000000000..06110d326e1ef --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "schema_of_json", + "arguments": [{ + "literal": { + "string": "[{\"col\":01}]" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin new file mode 100644 index 0000000000000..c4ca00e629262 Binary files 
/dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json new file mode 100644 index 0000000000000..ab05ffa940c50 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "schema_of_json", + "arguments": [{ + "literal": { + "string": "[{\"col\":01}]" + } + }, { + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "literal": { + "string": "allowNumericLeadingZeros" + } + }, { + "literal": { + "string": "true" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin new file mode 100644 index 0000000000000..482485501dd37 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_schema_of_json_with_options.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sec.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sec.json new file mode 100644 index 0000000000000..1cab2239755ca --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sec.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sec", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin new file mode 100644 index 0000000000000..8760f57a6d4f0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sec.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_second.json b/connector/connect/common/src/test/resources/query-tests/queries/function_second.json new file mode 100644 index 0000000000000..c77a572b88aa0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_second.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "second", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin new file mode 100644 index 0000000000000..193c46e917ba2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_second.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sentences.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences.json new file mode 100644 index 0000000000000..412ac0272dd57 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sentences", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin new file mode 100644 index 0000000000000..4b62f22574d32 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json new file mode 100644 index 0000000000000..991b42faddb76 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sentences", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "en" + } + }, { + "literal": { + "string": "US" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin new file mode 100644 index 0000000000000..01c0136c6df16 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sentences_with_locale.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sequence.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_sequence.json new file mode 100644 index 0000000000000..84bced640ff37 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sequence.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sequence", + "arguments": [{ + "literal": { + "integer": 1 + } + }, { + "literal": { + "integer": 10 + } + }, { + "literal": { + "long": "1" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin new file mode 100644 index 0000000000000..09e1ab3be7dab Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sequence.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_session_window.json b/connector/connect/common/src/test/resources/query-tests/queries/function_session_window.json new file mode 100644 index 0000000000000..5c7d953402b24 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_session_window.json @@ -0,0 +1,34 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "alias": { + "expr": { + "unresolvedFunction": { + "functionName": "session_window", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }, { + "literal": { + "string": "10 minutes" + } + }] + } + }, + "name": ["session_window"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin new file mode 100644 index 0000000000000..7f4ee24d53692 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_session_window.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sha1.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sha1.json new file mode 100644 index 0000000000000..ce5014ac2f7e6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sha1.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sha1", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, + "type": { + "binary": { + } + } + } + }] + } + }] + } +} \ No newline at end of 
file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin new file mode 100644 index 0000000000000..3fdfdb2a072de Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sha1.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sha2.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sha2.json new file mode 100644 index 0000000000000..5278d604e97b9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sha2.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sha2", + "arguments": [{ + "cast": { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, + "type": { + "binary": { + } + } + } + }, { + "literal": { + "integer": 512 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin new file mode 100644 index 0000000000000..20a0ee1082ae2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sha2.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json new file mode 100644 index 0000000000000..12decd300ab03 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftleft.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "shiftleft", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin new file mode 100644 index 0000000000000..94bfbc99fce2d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftleft.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_shiftright.json b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftright.json new file mode 100644 index 0000000000000..c2295c4abaaa2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftright.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, 
+ "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "shiftright", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin new file mode 100644 index 0000000000000..910d12f50d6a9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftright.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json new file mode 100644 index 0000000000000..875e26a5a5652 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "shiftrightunsigned", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, { + "literal": { + "integer": 2 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin new file mode 100644 index 0000000000000..aba9c425dca96 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_shiftrightunsigned.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_signum.json b/connector/connect/common/src/test/resources/query-tests/queries/function_signum.json new file mode 100644 index 0000000000000..bcf6ad7eb174d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_signum.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "signum", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin new file mode 100644 index 0000000000000..af52abfb7f25b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_signum.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_sin.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sin.json new file mode 100644 index 0000000000000..cb5b0da073456 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sin.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sin", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin new file mode 100644 index 0000000000000..a63f574fa59cb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sin.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sinh.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sinh.json new file mode 100644 index 0000000000000..e0f46b428611e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sinh.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sinh", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin new file mode 100644 index 0000000000000..2f17ab02a6d94 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sinh.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_size.json b/connector/connect/common/src/test/resources/query-tests/queries/function_size.json new file mode 100644 index 0000000000000..37c9cd1ac1ba7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_size.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "size", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin new file mode 100644 index 0000000000000..a8ae600a3dd7a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_size.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_skewness.json b/connector/connect/common/src/test/resources/query-tests/queries/function_skewness.json new file mode 100644 index 0000000000000..4b14c8d5ca79c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_skewness.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "skewness", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin new file mode 100644 index 0000000000000..889f96b2d2a39 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_skewness.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_slice.json b/connector/connect/common/src/test/resources/query-tests/queries/function_slice.json new file mode 100644 index 0000000000000..b0a63248784ea --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_slice.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "slice", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "integer": 0 + } + }, { + "literal": { + "integer": 5 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin new file mode 100644 index 0000000000000..620a006f775d6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_slice.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sort_array.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sort_array.json new file mode 100644 index 0000000000000..b42bede5cd172 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sort_array.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sort_array", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "literal": { + "boolean": true + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin new file mode 100644 index 0000000000000..994048af2afc4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sort_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json b/connector/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json new file mode 100644 index 0000000000000..851745b32ebe0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "spark_partition_id" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin new file mode 100644 index 0000000000000..df99cd64e7203 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_spark_partition_id.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split.json b/connector/connect/common/src/test/resources/query-tests/queries/function_split.json new file mode 100644 index 0000000000000..001d44dcaaf6e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_split.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "split", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": ";" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin new file mode 100644 index 0000000000000..cab0bde7b6da2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_split.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json new file mode 100644 index 0000000000000..45a7588838ff8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "split", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": ";" + } + }, { + "literal": { + "integer": 10 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin new file mode 100644 index 0000000000000..497297fad8715 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_split_with_limit.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sqrt.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sqrt.json new file mode 100644 index 0000000000000..f9a2b76520c13 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sqrt.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sqrt", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin new file mode 100644 index 0000000000000..e98e3bdfdb665 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sqrt.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_stddev.json b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev.json new file mode 100644 index 0000000000000..1403817886ca0 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "stddev", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin new file mode 100644 index 0000000000000..8d214eea8e74e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json new file mode 100644 index 0000000000000..35e3a08b219f8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "stddev_pop", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin new file mode 100644 index 0000000000000..b679f55014f97 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_pop.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json new file mode 100644 index 0000000000000..17cd0fd5e5976 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "stddev_samp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin new file mode 100644 index 0000000000000..9f22eba5e39aa Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_stddev_samp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_struct.json b/connector/connect/common/src/test/resources/query-tests/queries/function_struct.json new file mode 100644 index 0000000000000..ba950215a2591 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_struct.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "struct", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin new file mode 100644 index 0000000000000..079c2be3c52e5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_struct.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring.json new file mode 100644 index 0000000000000..84a70cf1c0236 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "integer": 4 + } + }, { + "literal": { + "integer": 5 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin new file mode 100644 index 0000000000000..d302cd95c7434 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_substring.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_index.json b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_index.json new file mode 100644 index 0000000000000..dc81d925957cd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_index.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "substring_index", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": ";" + } + }, { + "literal": { + "integer": 5 + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin new file mode 100644 index 0000000000000..192bb2e300dc3 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_substring_index.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sum.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sum.json new file mode 100644 index 0000000000000..e9526a20b67fb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sum.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sum", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin new file mode 100644 index 0000000000000..0e347bbc0a167 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sum.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json b/connector/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json new file mode 100644 index 0000000000000..4614cf99ad3a6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.json @@ -0,0 +1,26 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "sum", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "isDistinct": true + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin new file mode 100644 index 0000000000000..b4cf704391a4d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_sum_distinct.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_tan.json b/connector/connect/common/src/test/resources/query-tests/queries/function_tan.json new file mode 100644 index 0000000000000..ead160a7e3ac2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_tan.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "tan", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline 
at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin new file mode 100644 index 0000000000000..d674dc033b2cd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_tan.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_tanh.json b/connector/connect/common/src/test/resources/query-tests/queries/function_tanh.json new file mode 100644 index 0000000000000..bcd12c664427e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_tanh.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "tanh", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin new file mode 100644 index 0000000000000..21c28c3ef88e6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_tanh.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json new file mode 100644 index 0000000000000..e6892d17708b3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "timestamp_seconds", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "x" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin new file mode 100644 index 0000000000000..102afbdda9021 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_timestamp_seconds.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_csv.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_csv.json new file mode 100644 index 0000000000000..6b3856f5ac0af --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_csv.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_csv", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "literal": { + "string": "sep" + } + }, { + "literal": { + "string": "|" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin new file mode 100644 index 0000000000000..a3017643a330a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_csv.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_date.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date.json new file mode 100644 index 0000000000000..8b9d50aa578b8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_date", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin new file mode 100644 index 0000000000000..59178487eef58 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json new file mode 100644 index 0000000000000..48ae80d1e70ed --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_date", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "yyyy-MM-dd" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin new file mode 100644 index 0000000000000..2641d660ff69f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_date_with_format.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/function_to_json.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_json.json new file mode 100644 index 0000000000000..7ceeb9d113cd3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_json.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_json", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "unresolvedFunction": { + "functionName": "map", + "arguments": [{ + "literal": { + "string": "timestampFormat" + } + }, { + "literal": { + "string": "dd/MM/yyyy" + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin new file mode 100644 index 0000000000000..c9461c1aa961c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_json.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json new file mode 100644 index 0000000000000..323c57e2ef58a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_timestamp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin new file mode 100644 index 0000000000000..ec6bd64f98187 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json new file mode 100644 index 0000000000000..30f34528319c7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_timestamp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": 
"yyyy-MM-dd HH:mm:ss.SSSS" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin new file mode 100644 index 0000000000000..9c2d6d354ca73 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_timestamp_with_format.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json new file mode 100644 index 0000000000000..015fbb5cf534a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_utc_timestamp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }, { + "literal": { + "string": "-04:00" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin new file mode 100644 index 0000000000000..b2b65089604a2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_to_utc_timestamp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform.json b/connector/connect/common/src/test/resources/query-tests/queries/function_transform.json new file mode 100644 index 0000000000000..2b357a3577318 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_transform.json @@ -0,0 +1,45 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "transform", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "+", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "literal": { + "integer": 1 + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin new file mode 100644 index 0000000000000..44b83a9b98c53 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_transform.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json new file mode 100644 index 0000000000000..0b6a6c24504b6 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_keys.json @@ -0,0 +1,56 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "transform_keys", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "concat", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "unresolvedExtractValue": { + "child": { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }, + "extraction": { + "literal": { + "string": "id" + } + } + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin new file mode 100644 index 0000000000000..338aa87e01832 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_keys.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform_values.json b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_values.json new file mode 100644 index 0000000000000..71911ab5ed99b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_values.json @@ -0,0 +1,48 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "transform_values", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "f" + } + }, { + "lambdaFunction": { + "function": { + "updateFields": { + "structExpression": { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }, + "fieldName": "key", + "valueExpression": { + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + } + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin new file mode 100644 index 0000000000000..10cf8c503f420 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json new file mode 100644 index 0000000000000..1b296e891bca9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.json @@ -0,0 +1,47 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "transform", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "+", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin new file mode 100644 index 0000000000000..86f29399b9560 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_transform_with_index.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_translate.json b/connector/connect/common/src/test/resources/query-tests/queries/function_translate.json new file mode 100644 index 0000000000000..93d155c2857fb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_translate.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "translate", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "foo" + } + }, { + "literal": { + "string": "bar" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin new file mode 100644 index 0000000000000..1ce32c8d2843e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_translate.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_trim.json b/connector/connect/common/src/test/resources/query-tests/queries/function_trim.json new file mode 100644 index 0000000000000..d2700174bca3d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_trim.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "trim", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin new file mode 100644 index 0000000000000..d5f4f21510fc6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_trim.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json b/connector/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json new file mode 100644 index 0000000000000..82b1616ef38ed --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "trim", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }, { + "literal": { + "string": "---" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin new file mode 100644 index 0000000000000..6a86e87c9850b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_trim_with_pattern.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_trunc.json b/connector/connect/common/src/test/resources/query-tests/queries/function_trunc.json new file mode 100644 index 0000000000000..4c596cd863261 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_trunc.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "trunc", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "literal": { + "string": "mm" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin new file mode 100644 index 0000000000000..cdcee95af6344 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_trunc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unbase64.json 
b/connector/connect/common/src/test/resources/query-tests/queries/function_unbase64.json new file mode 100644 index 0000000000000..6af2a00ed160e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_unbase64.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "unbase64", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin new file mode 100644 index 0000000000000..f37ceb91bf42b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_unbase64.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unhex.json b/connector/connect/common/src/test/resources/query-tests/queries/function_unhex.json new file mode 100644 index 0000000000000..7c409d023f76a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_unhex.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "unhex", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin new file mode 100644 index 0000000000000..fbac2821fdb07 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_unhex.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json new file mode 100644 index 0000000000000..e590f7778f2ea --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "unix_timestamp", + "arguments": [{ + "unresolvedFunction": { + "functionName": "current_timestamp" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin new file mode 100644 index 
0000000000000..cb3d967ae0123 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json new file mode 100644 index 0000000000000..d2e087a5d8a24 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "unix_timestamp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "s" + } + }, { + "literal": { + "string": "yyyy-MM-dd HH:mm:ss.SSSS" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin new file mode 100644 index 0000000000000..ddfcdff63d11a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_unix_timestamp_with_format.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_upper.json b/connector/connect/common/src/test/resources/query-tests/queries/function_upper.json new file mode 100644 index 0000000000000..208ee9231a13c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_upper.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "upper", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin new file mode 100644 index 0000000000000..5ddbfce96e71b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_upper.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_var_pop.json b/connector/connect/common/src/test/resources/query-tests/queries/function_var_pop.json new file mode 100644 index 0000000000000..9c74ce4a984f8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_var_pop.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": 
{ + "functionName": "var_pop", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin new file mode 100644 index 0000000000000..7ca6e8d3b811b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_var_pop.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_var_samp.json b/connector/connect/common/src/test/resources/query-tests/queries/function_var_samp.json new file mode 100644 index 0000000000000..979313dd0510d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_var_samp.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "var_samp", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin new file mode 100644 index 0000000000000..9bd042ad339e7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_var_samp.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_variance.json b/connector/connect/common/src/test/resources/query-tests/queries/function_variance.json new file mode 100644 index 0000000000000..90a97c3becf4d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_variance.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "variance", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin new file mode 100644 index 0000000000000..fd494fc496391 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_variance.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json b/connector/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json new file mode 100644 index 0000000000000..3f46a98569e24 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_weekofyear.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + 
"common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "weekofyear", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin new file mode 100644 index 0000000000000..ec9b22522360e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_weekofyear.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_window.json b/connector/connect/common/src/test/resources/query-tests/queries/function_window.json new file mode 100644 index 0000000000000..bdcb6a398800f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_window.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "window", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "t" + } + }, { + "literal": { + "string": "1 second" + } + }, { + "literal": { + "string": "1 second" + } + }, { + "literal": { + "string": "0 second" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin new file mode 100644 index 0000000000000..8cffcc1e9f673 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_window.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_window_time.json b/connector/connect/common/src/test/resources/query-tests/queries/function_window_time.json new file mode 100644 index 0000000000000..4809ea21261c4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_window_time.json @@ -0,0 +1,42 @@ +{ + "common": { + "planId": "2" + }, + "project": { + "input": { + "common": { + "planId": "1" + }, + "withColumns": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "aliases": [{ + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "wt", + "planId": "0" + } + }, + "name": ["wt"], + "metadata": "{\"spark.timeWindow\":true}" + }] + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "window_time", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "wt" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin new file mode 100644 index 0000000000000..c143520df08ce Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/function_window_time.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json b/connector/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json new file mode 100644 index 0000000000000..c20739d09ff10 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_xxhash64.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "xxhash64", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "g" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin new file mode 100644 index 0000000000000..414c76fc5ce7f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_xxhash64.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_year.json b/connector/connect/common/src/test/resources/query-tests/queries/function_year.json new file mode 100644 index 0000000000000..b8a4ee5a16525 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_year.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cd:date,t:timestamp,s:string,x:bigint,wt:struct\u003cstart:timestamp,end:timestamp\u003e\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "year", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "d" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin new file mode 100644 index 0000000000000..623bc9ac6d81f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_year.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_years.json b/connector/connect/common/src/test/resources/query-tests/queries/function_years.json new file mode 100644 index 0000000000000..2e87307320271 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_years.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "years", + "arguments": [{ + 
"unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin new file mode 100644 index 0000000000000..30c25423fd563 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_years.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_zip_with.json b/connector/connect/common/src/test/resources/query-tests/queries/function_zip_with.json new file mode 100644 index 0000000000000..d1d0e7293c8ff --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/function_zip_with.json @@ -0,0 +1,51 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "zip_with", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "e" + } + }, { + "lambdaFunction": { + "function": { + "unresolvedFunction": { + "functionName": "+", + "arguments": [{ + "unresolvedNamedLambdaVariable": { + "nameParts": ["x"] + } + }, { + "unresolvedNamedLambdaVariable": { + "nameParts": ["y"] + } + }] + } + }, + "arguments": [{ + "nameParts": ["x"] + }, { + "nameParts": ["y"] + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin new file mode 100644 index 0000000000000..c9a6dff84736b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/function_zip_with.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg.json new file mode 100644 index 0000000000000..4a1cfddb0288f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg.json @@ -0,0 +1,100 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "stddev", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "stddev", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + 
"functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedStar": { + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin new file mode 100644 index 0000000000000..cfd6c2daa84b4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json new file mode 100644 index 0000000000000..e61616786158e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "sum", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin new file mode 100644 index 0000000000000..d6daa1cc31f7d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json new file mode 100644 index 0000000000000..26320d404835f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.json @@ -0,0 +1,46 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a", + "planId": "0" + } + }] + } + }] + } +} \ No newline at end of file diff 
--git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin new file mode 100644 index 0000000000000..818146f7f6935 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_agg_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_avg.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_avg.json new file mode 100644 index 0000000000000..5785eee2cadb5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_avg.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin new file mode 100644 index 0000000000000..4a18ea2d82d93 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_avg.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_count.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_count.json new file mode 100644 index 0000000000000..f92e22493e07b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_count.json @@ -0,0 +1,36 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "alias": { + "expr": { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "literal": { + "integer": 1 + } + }] + } + }, + "name": ["count"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin new file mode 100644 index 0000000000000..5bb539195df9a Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_count.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_max.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_max.json new file mode 100644 index 0000000000000..3225a475a9b35 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_max.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": 
"struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "max", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin new file mode 100644 index 0000000000000..651274b1afcac Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_max.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_mean.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_mean.json new file mode 100644 index 0000000000000..5785eee2cadb5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_mean.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "avg", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin new file mode 100644 index 0000000000000..4a18ea2d82d93 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_mean.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_min.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_min.json new file mode 100644 index 0000000000000..afcc07d2c869c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_min.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin new file mode 100644 index 
0000000000000..6e038bf0b315c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_min.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_sum.json b/connector/connect/common/src/test/resources/query-tests/queries/groupby_sum.json new file mode 100644 index 0000000000000..74dd5b045aa57 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/groupby_sum.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_GROUPBY", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "sum", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "sum", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin new file mode 100644 index 0000000000000..fe2451ca18fbd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/groupby_sum.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json b/connector/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json new file mode 100644 index 0000000000000..07bbd315a5fe9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.json @@ -0,0 +1,57 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_CUBE", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "grouping", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "grouping", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }, { + "unresolvedFunction": { + "functionName": "grouping_id", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin new file mode 100644 index 0000000000000..88b3f05931328 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/grouping_and_grouping_id.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/hint.json b/connector/connect/common/src/test/resources/query-tests/queries/hint.json new file mode 100644 index 0000000000000..bb5b848b744d0 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/hint.json @@ -0,0 +1,21 @@ +{ + "common": { + "planId": "1" + }, + "hint": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "name": "coalesce", + "parameters": [{ + "literal": { + "integer": 100 + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/hint.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/hint.proto.bin new file mode 100644 index 0000000000000..8eb4f41203511 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/hint.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/intersect.json b/connector/connect/common/src/test/resources/query-tests/queries/intersect.json new file mode 100644 index 0000000000000..f290397c55ca1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/intersect.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "setOpType": "SET_OP_TYPE_INTERSECT", + "isAll": false + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/intersect.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/intersect.proto.bin new file mode 100644 index 0000000000000..0ea7edc5cee3d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/intersect.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/intersectAll.json b/connector/connect/common/src/test/resources/query-tests/queries/intersectAll.json new file mode 100644 index 0000000000000..d8fe5fe0b7e79 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/intersectAll.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "setOpType": "SET_OP_TYPE_INTERSECT", + "isAll": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/intersectAll.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/intersectAll.proto.bin new file mode 100644 index 0000000000000..6df2125682bcb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/intersectAll.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_condition.json b/connector/connect/common/src/test/resources/query-tests/queries/join_condition.json new file mode 100644 index 0000000000000..993cd98a7dd16 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_condition.json @@ -0,0 +1,54 @@ +{ + "common": { + "planId": "4" + }, + "join": { + "left": { + "common": { + "planId": "1" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { 
+ "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "alias": "l" + } + }, + "right": { + "common": { + "planId": "3" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "2" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "alias": "r" + } + }, + "joinCondition": { + "unresolvedFunction": { + "functionName": "\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "l.id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "r.id" + } + }] + } + }, + "joinType": "JOIN_TYPE_LEFT_ANTI" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin new file mode 100644 index 0000000000000..1d11fe5e75bcc Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_condition.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json new file mode 100644 index 0000000000000..527338c56ae60 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_condition.json @@ -0,0 +1,54 @@ +{ + "common": { + "planId": "4" + }, + "join": { + "left": { + "common": { + "planId": "1" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "alias": "l" + } + }, + "right": { + "common": { + "planId": "3" + }, + "subqueryAlias": { + "input": { + "common": { + "planId": "2" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "alias": "r" + } + }, + "joinCondition": { + "unresolvedFunction": { + "functionName": "\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "l.a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "r.a" + } + }] + } + }, + "joinType": "JOIN_TYPE_INNER" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin new file mode 100644 index 0000000000000..5d3de55da9cf8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_condition.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_no_condition.json b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_no_condition.json new file mode 100644 index 0000000000000..8c53a193162d7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_no_condition.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_INNER" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_no_condition.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_no_condition.proto.bin new file mode 100644 index 0000000000000..44bf1a6793cdc Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_no_condition.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_array.json b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_array.json new file mode 100644 index 0000000000000..42b4eec5d9f1f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_array.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_INNER", + "usingColumns": ["id", "a"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_array.proto.bin new file mode 100644 index 0000000000000..98e2a4fe9b58f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_seq.json b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_seq.json new file mode 100644 index 0000000000000..42b4eec5d9f1f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_seq.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_INNER", + "usingColumns": ["id", "a"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_seq.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_seq.proto.bin new file mode 100644 index 0000000000000..98e2a4fe9b58f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_multiple_col_seq.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_single_col.json b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_single_col.json new file mode 100644 index 0000000000000..2c2bde49b190e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_single_col.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": 
"struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_INNER", + "usingColumns": ["id"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_single_col.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_single_col.proto.bin new file mode 100644 index 0000000000000..7d4a1aeb11efc Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_inner_using_single_col.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_array.json b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_array.json new file mode 100644 index 0000000000000..9b592426cf96b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_array.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_FULL_OUTER", + "usingColumns": ["id", "a"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_array.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_array.proto.bin new file mode 100644 index 0000000000000..4c4b6ecb20767 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_array.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_seq.json b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_seq.json new file mode 100644 index 0000000000000..3f1c46f08e813 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_seq.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_RIGHT_OUTER", + "usingColumns": ["id", "a"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_seq.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_seq.proto.bin new file mode 100644 index 0000000000000..2a5410fc06316 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_using_multiple_col_seq.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_using_single_col.json b/connector/connect/common/src/test/resources/query-tests/queries/join_using_single_col.json new file mode 100644 index 0000000000000..46f144de61a99 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/join_using_single_col.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + 
"localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "joinType": "JOIN_TYPE_LEFT_SEMI", + "usingColumns": ["id"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/join_using_single_col.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/join_using_single_col.proto.bin new file mode 100644 index 0000000000000..c2fa60619d705 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/join_using_single_col.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json new file mode 100644 index 0000000000000..d6f992d09a5c2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "parse": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "{\"type\":\"struct\",\"fields\":[{\"name\":\"value\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]}" + } + }, + "format": "PARSE_FORMAT_JSON", + "schema": { + "struct": { + "fields": [{ + "name": "c1", + "dataType": { + "string": { + } + }, + "nullable": true + }, { + "name": "c2", + "dataType": { + "integer": { + } + }, + "nullable": true + }] + } + }, + "options": { + "allowsinglequotes": "true" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin new file mode 100644 index 0000000000000..0fce9d9ff8c7e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/json_from_dataset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/limit.json b/connector/connect/common/src/test/resources/query-tests/queries/limit.json new file mode 100644 index 0000000000000..acf01c196891d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/limit.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "limit": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "limit": 10 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/limit.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/limit.proto.bin new file mode 100644 index 0000000000000..f3f4771fe4deb Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/limit.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/melt_no_values.json b/connector/connect/common/src/test/resources/query-tests/queries/melt_no_values.json new file mode 100644 index 0000000000000..12db0a5abe368 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/melt_no_values.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "unpivot": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "ids": [{ + "unresolvedAttribute": { 
+ "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "valueColumnName": "value" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/melt_no_values.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/melt_no_values.proto.bin new file mode 100644 index 0000000000000..23a6aa1289a99 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/melt_no_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/melt_values.json b/connector/connect/common/src/test/resources/query-tests/queries/melt_values.json new file mode 100644 index 0000000000000..e2a004f46e781 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/melt_values.json @@ -0,0 +1,28 @@ +{ + "common": { + "planId": "1" + }, + "unpivot": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "ids": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "values": { + "values": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + }, + "valueColumnName": "value" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/melt_values.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/melt_values.proto.bin new file mode 100644 index 0000000000000..e021e1110def5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/melt_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/offset.json b/connector/connect/common/src/test/resources/query-tests/queries/offset.json new file mode 100644 index 0000000000000..80796160b96d7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/offset.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "offset": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "offset": 1000 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/offset.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/offset.proto.bin new file mode 100644 index 0000000000000..6671eebb93183 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/offset.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/orderBy_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_columns.json new file mode 100644 index 0000000000000..72ea72d795497 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_columns.json @@ -0,0 +1,41 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + 
"unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/orderBy_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_columns.proto.bin new file mode 100644 index 0000000000000..00fa9f8b5c02d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/orderBy_strings.json b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_strings.json new file mode 100644 index 0000000000000..e7f63a15c2882 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_strings.json @@ -0,0 +1,41 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/orderBy_strings.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_strings.proto.bin new file mode 100644 index 0000000000000..a907e66a130d4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/orderBy_strings.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/pivot.json b/connector/connect/common/src/test/resources/query-tests/queries/pivot.json new file mode 100644 index 0000000000000..30bff04c531db --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/pivot.json @@ -0,0 +1,45 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_PIVOT", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }], + "pivot": { + "col": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "values": [{ + "integer": 1 + }, { + "integer": 2 + }, { + "integer": 3 + }] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin new file mode 100644 index 0000000000000..67063209a184c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/pivot.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json 
b/connector/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json new file mode 100644 index 0000000000000..5218a88988ea3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.json @@ -0,0 +1,38 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_PIVOT", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "aggregateExpressions": [{ + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }], + "pivot": { + "col": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin new file mode 100644 index 0000000000000..aee3c980eaee4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/pivot_without_column_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/range.json b/connector/connect/common/src/test/resources/query-tests/queries/range.json new file mode 100644 index 0000000000000..8afa44fac6cf2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/range.json @@ -0,0 +1,11 @@ +{ + "common": { + "planId": "0" + }, + "range": { + "start": "1", + "end": "10", + "step": "1", + "numPartitions": 2 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/range.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/range.proto.bin new file mode 100644 index 0000000000000..277a02cea558c Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/range.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read.json b/connector/connect/common/src/test/resources/query-tests/queries/read.json new file mode 100644 index 0000000000000..d5580c1321ec4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "csv", + "schema": "name STRING,age INT,job STRING", + "options": { + "header": "true", + "delimiter": ";" + }, + "paths": ["../common/src/test/resources/query-tests/test-data/people.csv"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read.proto.bin new file mode 100644 index 0000000000000..c50391bb1a8f7 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_csv.json b/connector/connect/common/src/test/resources/query-tests/queries/read_csv.json new file mode 100644 index 0000000000000..ec1eed1c6cf35 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_csv.json @@ -0,0 +1,11 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + 
"format": "csv", + "paths": ["../common/src/test/resources/query-tests/test-data/people.csv"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_csv.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_csv.proto.bin new file mode 100644 index 0000000000000..d8b5ca93f2f77 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_csv.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc.json b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc.json new file mode 100644 index 0000000000000..3e9b7b8cc864d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc.json @@ -0,0 +1,14 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "jdbc", + "options": { + "url": "jdbc:h2:mem:testdb0;user\u003dtestUser;password\u003dtestPass", + "dbtable": "TEST.TIMETYPES" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc.proto.bin new file mode 100644 index 0000000000000..4e74a07d22fe9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_partition.json b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_partition.json new file mode 100644 index 0000000000000..31576cee4f1a3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_partition.json @@ -0,0 +1,18 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "jdbc", + "options": { + "url": "jdbc:h2:mem:testdb0;user\u003dtestUser;password\u003dtestPass", + "upperbound": "4", + "lowerbound": "0", + "numpartitions": "3", + "dbtable": "TEST.EMP", + "partitioncolumn": "THEID" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_partition.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_partition.proto.bin new file mode 100644 index 0000000000000..c74178148dea9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_partition.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_predicates.json b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_predicates.json new file mode 100644 index 0000000000000..d8d4cfbdcab16 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_predicates.json @@ -0,0 +1,15 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "jdbc", + "options": { + "url": "jdbc:h2:mem:testdb0;user\u003dtestUser;password\u003dtestPass", + "dbtable": "TEST.PEOPLE" + }, + "predicates": ["THEID \u003c 2", "THEID \u003e\u003d 2"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_predicates.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_predicates.proto.bin new file mode 100644 index 0000000000000..9b1d5812e473b Binary 
files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_jdbc_with_predicates.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_json.json b/connector/connect/common/src/test/resources/query-tests/queries/read_json.json new file mode 100644 index 0000000000000..63dadc129dc8f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_json.json @@ -0,0 +1,11 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "json", + "paths": ["../common/src/test/resources/query-tests/test-data/people.json"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_json.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_json.proto.bin new file mode 100644 index 0000000000000..1d829df6bbcfe Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_json.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_orc.json b/connector/connect/common/src/test/resources/query-tests/queries/read_orc.json new file mode 100644 index 0000000000000..b78d7d6ecd61c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_orc.json @@ -0,0 +1,11 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "orc", + "paths": ["../common/src/test/resources/query-tests/test-data/users.orc"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_orc.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_orc.proto.bin new file mode 100644 index 0000000000000..6a67db561dc88 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_orc.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_parquet.json b/connector/connect/common/src/test/resources/query-tests/queries/read_parquet.json new file mode 100644 index 0000000000000..0a201a43c744b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_parquet.json @@ -0,0 +1,11 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "parquet", + "paths": ["../common/src/test/resources/query-tests/test-data/users.parquet"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_parquet.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_parquet.proto.bin new file mode 100644 index 0000000000000..f16b28dcce01e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_parquet.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_path.json b/connector/connect/common/src/test/resources/query-tests/queries/read_path.json new file mode 100644 index 0000000000000..c3fc8132a3bc5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_path.json @@ -0,0 +1,11 @@ +{ + "read": { + "dataSource": { + "format": "csv", + "schema": "name STRING,age INT", + "options": { + "path": "../common/src/test/resources/query-tests/test-data/people.csv" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_path.proto.bin 
b/connector/connect/common/src/test/resources/query-tests/queries/read_path.proto.bin new file mode 100644 index 0000000000000..01787253c4283 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_path.proto.bin @@ -0,0 +1,3 @@ +ca +csvname STRING,age INTE +path=../common/src/test/resources/query-tests/test-data/people.csv \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_table.json b/connector/connect/common/src/test/resources/query-tests/queries/read_table.json new file mode 100644 index 0000000000000..b2cd4ae0a5bae --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_table.json @@ -0,0 +1,10 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "namedTable": { + "unparsedIdentifier": "myTable" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_table.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_table.proto.bin new file mode 100644 index 0000000000000..956da78861d0b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_table.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_text.json b/connector/connect/common/src/test/resources/query-tests/queries/read_text.json new file mode 100644 index 0000000000000..de7a306a52fbc --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/read_text.json @@ -0,0 +1,11 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "dataSource": { + "format": "text", + "paths": ["../common/src/test/resources/query-tests/test-data/people.txt"] + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/read_text.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/read_text.proto.bin new file mode 100644 index 0000000000000..3f3bbf8769c4d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/read_text.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/relation_extension.json b/connector/connect/common/src/test/resources/query-tests/queries/relation_extension.json new file mode 100644 index 0000000000000..47ceba13ca7e2 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/relation_extension.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "extension": { + "@type": "type.googleapis.com/spark.connect.ExamplePluginRelation", + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/relation_extension.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/relation_extension.proto.bin new file mode 100644 index 0000000000000..680bb550eca53 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/relation_extension.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartition.json b/connector/connect/common/src/test/resources/query-tests/queries/repartition.json new file mode 100644 index 0000000000000..163742886c3a5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/repartition.json @@ -0,0 +1,17 @@ +{ + 
"common": { + "planId": "1" + }, + "repartition": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "numPartitions": 24, + "shuffle": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartition.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/repartition.proto.bin new file mode 100644 index 0000000000000..5265e0e6175c4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/repartition.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_expressions.json b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_expressions.json new file mode 100644 index 0000000000000..98bd4c988abc3 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_expressions.json @@ -0,0 +1,36 @@ +{ + "common": { + "planId": "1" + }, + "repartitionByExpression": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "partitionExprs": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }, { + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_expressions.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_expressions.proto.bin new file mode 100644 index 0000000000000..8ee220833d9e8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_expressions.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_num_partitions_expressions.json b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_num_partitions_expressions.json new file mode 100644 index 0000000000000..604d0330fedd7 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_num_partitions_expressions.json @@ -0,0 +1,37 @@ +{ + "common": { + "planId": "1" + }, + "repartitionByExpression": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "partitionExprs": [{ + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }, { + "sortOrder": { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_DESCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + } + }], + "numPartitions": 33 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_num_partitions_expressions.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_num_partitions_expressions.proto.bin new file mode 100644 index 0000000000000..a3f1546cca1f8 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/repartitionByRange_num_partitions_expressions.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartition_expressions.json b/connector/connect/common/src/test/resources/query-tests/queries/repartition_expressions.json new file mode 100644 index 0000000000000..81113afea3535 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/repartition_expressions.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "1" + }, + "repartitionByExpression": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "partitionExprs": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartition_expressions.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/repartition_expressions.proto.bin new file mode 100644 index 0000000000000..50ff8c590cda3 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/repartition_expressions.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartition_num_partitions_expressions.json b/connector/connect/common/src/test/resources/query-tests/queries/repartition_num_partitions_expressions.json new file mode 100644 index 0000000000000..996beda2253aa --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/repartition_num_partitions_expressions.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "repartitionByExpression": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "partitionExprs": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "numPartitions": 22 + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/repartition_num_partitions_expressions.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/repartition_num_partitions_expressions.proto.bin new file mode 100644 index 0000000000000..73e22f120ed95 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/repartition_num_partitions_expressions.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/replace.json b/connector/connect/common/src/test/resources/query-tests/queries/replace.json new file mode 100644 index 0000000000000..d0e39d340c0b8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/replace.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "1" + }, + "replace": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "cols": ["id"], + "replacements": [{ + "oldValue": { + "double": 1.0 + }, + "newValue": { + "double": 8.0 + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/replace.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/replace.proto.bin new file mode 100644 index 0000000000000..d1868cee7bfb6 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/replace.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/rollup_column.json b/connector/connect/common/src/test/resources/query-tests/queries/rollup_column.json new file mode 100644 index 0000000000000..1102db18830bd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/rollup_column.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_ROLLUP", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }], + "aggregateExpressions": [{ + "alias": { + "expr": { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "literal": { + "integer": 1 + } + }] + } + }, + "name": ["count"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin new file mode 100644 index 0000000000000..64dbb597c3650 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/rollup_column.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/rollup_string.json b/connector/connect/common/src/test/resources/query-tests/queries/rollup_string.json new file mode 100644 index 0000000000000..1102db18830bd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/rollup_string.json @@ -0,0 +1,40 @@ +{ + "common": { + "planId": "1" + }, + "aggregate": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "groupType": "GROUP_TYPE_ROLLUP", + "groupingExpressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }], + "aggregateExpressions": [{ + "alias": { + "expr": { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "literal": { + "integer": 1 + } + }] + } + }, + "name": ["count"] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin new file mode 100644 index 0000000000000..64dbb597c3650 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/rollup_string.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sampleBy.json b/connector/connect/common/src/test/resources/query-tests/queries/sampleBy.json new file mode 100644 index 0000000000000..03fdd10075387 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sampleBy.json @@ -0,0 +1,32 @@ +{ + "common": { + "planId": "1" + }, + "sampleBy": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "col": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "fractions": [{ + "stratum": { + "integer": 0 + }, + "fraction": 0.1 + }, { + "stratum": { + "integer": 1 + }, + "fraction": 0.2 + }], + "seed": "0" + } 
+} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sampleBy.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sampleBy.proto.bin new file mode 100644 index 0000000000000..29773f18e0e47 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sampleBy.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sample_fraction_seed.json b/connector/connect/common/src/test/resources/query-tests/queries/sample_fraction_seed.json new file mode 100644 index 0000000000000..88e80a3f60c6c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sample_fraction_seed.json @@ -0,0 +1,18 @@ +{ + "common": { + "planId": "1" + }, + "sample": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "upperBound": 0.43, + "withReplacement": false, + "seed": "9890823" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sample_fraction_seed.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sample_fraction_seed.proto.bin new file mode 100644 index 0000000000000..546c9c9c69cac Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sample_fraction_seed.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sample_withReplacement_fraction_seed.json b/connector/connect/common/src/test/resources/query-tests/queries/sample_withReplacement_fraction_seed.json new file mode 100644 index 0000000000000..75d3b2421601d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sample_withReplacement_fraction_seed.json @@ -0,0 +1,18 @@ +{ + "common": { + "planId": "1" + }, + "sample": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "upperBound": 0.23, + "withReplacement": true, + "seed": "898" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sample_withReplacement_fraction_seed.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sample_withReplacement_fraction_seed.proto.bin new file mode 100644 index 0000000000000..48650897e6762 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sample_withReplacement_fraction_seed.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select.json b/connector/connect/common/src/test/resources/query-tests/queries/select.json new file mode 100644 index 0000000000000..8ef46a6cc2aab --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/select.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select.proto.bin new file mode 100644 index 0000000000000..2bc4bd85a5806 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/select.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/selectExpr.json b/connector/connect/common/src/test/resources/query-tests/queries/selectExpr.json new file mode 100644 index 0000000000000..9c2815cffb752 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/selectExpr.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "expressionString": { + "expression": "a + 10 as x" + } + }, { + "expressionString": { + "expression": "id % 10 as grp" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/selectExpr.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/selectExpr.proto.bin new file mode 100644 index 0000000000000..88824d7f896f5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/selectExpr.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_strings.json b/connector/connect/common/src/test/resources/query-tests/queries/select_strings.json new file mode 100644 index 0000000000000..421b9aa120016 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/select_strings.json @@ -0,0 +1,24 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_strings.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select_strings.proto.bin new file mode 100644 index 0000000000000..f868b46f3e58f Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/select_strings.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json b/connector/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json new file mode 100644 index 0000000000000..90ef62c5f415b --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.json @@ -0,0 +1,39 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "inline", + "arguments": [{ + "unresolvedFunction": { + "functionName": "array", + "arguments": [{ + "unresolvedFunction": { + "functionName": "struct", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }] + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin new file mode 100644 index 0000000000000..2273a16d4e6a8 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/select_typed_1-arg.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_columns.json new file mode 100644 index 0000000000000..c45a326a01b47 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_columns.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": false + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_columns.proto.bin new file mode 100644 index 0000000000000..49e24e6f6f222 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_strings.json b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_strings.json new file mode 100644 index 0000000000000..dcded7cb32d8c --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_strings.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": false + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_strings.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_strings.proto.bin new file mode 100644 index 0000000000000..f5ff329823889 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sortWithinPartitions_strings.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sort_columns.json b/connector/connect/common/src/test/resources/query-tests/queries/sort_columns.json new file mode 100644 index 0000000000000..76b4d92d71c1f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sort_columns.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "direction": 
"SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sort_columns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sort_columns.proto.bin new file mode 100644 index 0000000000000..9c059d244aecf Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sort_columns.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sort_strings.json b/connector/connect/common/src/test/resources/query-tests/queries/sort_strings.json new file mode 100644 index 0000000000000..7955221d7d786 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/sort_strings.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "1" + }, + "sort": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "order": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "isGlobal": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/sort_strings.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/sort_strings.proto.bin new file mode 100644 index 0000000000000..e780d351c8af9 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/sort_strings.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/summary.json b/connector/connect/common/src/test/resources/query-tests/queries/summary.json new file mode 100644 index 0000000000000..cbfe9bcf7b085 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/summary.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "summary": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "statistics": ["mean", "min"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/summary.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/summary.proto.bin new file mode 100644 index 0000000000000..a88d61cdc76b2 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/summary.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/table.json b/connector/connect/common/src/test/resources/query-tests/queries/table.json new file mode 100644 index 0000000000000..b2cd4ae0a5bae --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/table.json @@ -0,0 +1,10 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "namedTable": { + "unparsedIdentifier": "myTable" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/table.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/table.proto.bin 
new file mode 100644 index 0000000000000..956da78861d0b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/table.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/table_API_with_options.json b/connector/connect/common/src/test/resources/query-tests/queries/table_API_with_options.json new file mode 100644 index 0000000000000..acf0492358835 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/table_API_with_options.json @@ -0,0 +1,14 @@ +{ + "common": { + "planId": "0" + }, + "read": { + "namedTable": { + "unparsedIdentifier": "tempdb.myTable", + "options": { + "p1": "v1", + "p2": "v2" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/table_API_with_options.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/table_API_with_options.proto.bin new file mode 100644 index 0000000000000..95e044984b4f8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/table_API_with_options.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/test_broadcast.json b/connector/connect/common/src/test/resources/query-tests/queries/test_broadcast.json new file mode 100644 index 0000000000000..5409428642592 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/test_broadcast.json @@ -0,0 +1,33 @@ +{ + "common": { + "planId": "3" + }, + "join": { + "left": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "right": { + "common": { + "planId": "2" + }, + "hint": { + "input": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "name": "broadcast" + } + }, + "joinType": "JOIN_TYPE_INNER", + "usingColumns": ["id"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/test_broadcast.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/test_broadcast.proto.bin new file mode 100644 index 0000000000000..96c87594c69b0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/test_broadcast.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/to.json b/connector/connect/common/src/test/resources/query-tests/queries/to.json new file mode 100644 index 0000000000000..a3e07202c106f --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/to.json @@ -0,0 +1,34 @@ +{ + "common": { + "planId": "1" + }, + "toSchema": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "schema": { + "struct": { + "fields": [{ + "name": "b", + "dataType": { + "double": { + } + }, + "nullable": true + }, { + "name": "id", + "dataType": { + "integer": { + } + }, + "nullable": true + }] + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/to.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/to.proto.bin new file mode 100644 index 0000000000000..8e15aa6c2791d Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/to.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/toDF.json b/connector/connect/common/src/test/resources/query-tests/queries/toDF.json new file mode 100644 index 0000000000000..8111bc76a8a81 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/toDF.json @@ -0,0 +1,16 @@ +{ + "common": { + "planId": "1" + }, + "toDf": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["x1", "x2", "x3"] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/toDF.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/toDF.proto.bin new file mode 100644 index 0000000000000..3238291e87948 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/toDF.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/toJSON.json b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.json new file mode 100644 index 0000000000000..278767e620a16 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double,d:struct\u003cid:bigint,a:int,b:double\u003e,e:array\u003cint\u003e,f:map\u003cstring,struct\u003cid:bigint,a:int,b:double\u003e\u003e,g:string\u003e" + } + }, + "expressions": [{ + "unresolvedFunction": { + "functionName": "to_json", + "arguments": [{ + "unresolvedFunction": { + "functionName": "struct", + "arguments": [{ + "unresolvedStar": { + } + }] + } + }] + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin new file mode 100644 index 0000000000000..e08d0fd2180f0 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/toJSON.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/union.json b/connector/connect/common/src/test/resources/query-tests/queries/union.json new file mode 100644 index 0000000000000..9048133ca6385 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/union.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "setOpType": "SET_OP_TYPE_UNION", + "isAll": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/union.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/union.proto.bin new file mode 100644 index 0000000000000..caafd1ef998d6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/union.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unionAll.json b/connector/connect/common/src/test/resources/query-tests/queries/unionAll.json new file mode 100644 index 0000000000000..9048133ca6385 --- /dev/null +++ 
b/connector/connect/common/src/test/resources/query-tests/queries/unionAll.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "setOpType": "SET_OP_TYPE_UNION", + "isAll": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unionAll.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/unionAll.proto.bin new file mode 100644 index 0000000000000..caafd1ef998d6 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/unionAll.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unionByName.json b/connector/connect/common/src/test/resources/query-tests/queries/unionByName.json new file mode 100644 index 0000000000000..181d681b7f1fd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/unionByName.json @@ -0,0 +1,43 @@ +{ + "common": { + "planId": "4" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "1" + }, + "drop": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "columnNames": ["b"] + } + }, + "rightInput": { + "common": { + "planId": "3" + }, + "drop": { + "input": { + "common": { + "planId": "2" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "columnNames": ["payload"] + } + }, + "setOpType": "SET_OP_TYPE_UNION", + "isAll": true, + "byName": true, + "allowMissingColumns": false + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unionByName.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/unionByName.proto.bin new file mode 100644 index 0000000000000..519fbc8edaa42 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/unionByName.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unionByName_allowMissingColumns.json b/connector/connect/common/src/test/resources/query-tests/queries/unionByName_allowMissingColumns.json new file mode 100644 index 0000000000000..98870ffe7175d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/unionByName_allowMissingColumns.json @@ -0,0 +1,27 @@ +{ + "common": { + "planId": "2" + }, + "setOp": { + "leftInput": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "rightInput": { + "common": { + "planId": "1" + }, + "localRelation": { + "schema": "struct\u003ca:int,id:bigint,payload:binary\u003e" + } + }, + "setOpType": "SET_OP_TYPE_UNION", + "isAll": true, + "byName": true, + "allowMissingColumns": true + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unionByName_allowMissingColumns.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/unionByName_allowMissingColumns.proto.bin new file mode 100644 index 0000000000000..4facbbc553ea5 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/unionByName_allowMissingColumns.proto.bin 
differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unpivot_no_values.json b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_no_values.json new file mode 100644 index 0000000000000..9f550c0319147 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_no_values.json @@ -0,0 +1,21 @@ +{ + "common": { + "planId": "1" + }, + "unpivot": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "ids": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }], + "valueColumnName": "value" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unpivot_no_values.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_no_values.proto.bin new file mode 100644 index 0000000000000..ac3bad8bd04ed Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_no_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unpivot_values.json b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_values.json new file mode 100644 index 0000000000000..92bc19d195c6e --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_values.json @@ -0,0 +1,32 @@ +{ + "common": { + "planId": "1" + }, + "unpivot": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "ids": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }], + "values": { + "values": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + }, + "valueColumnName": "value" + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/unpivot_values.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_values.proto.bin new file mode 100644 index 0000000000000..7f717cb23517b Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/unpivot_values.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/where_column.json b/connector/connect/common/src/test/resources/query-tests/queries/where_column.json new file mode 100644 index 0000000000000..bef80a7e6ed5a --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/where_column.json @@ -0,0 +1,29 @@ +{ + "common": { + "planId": "1" + }, + "filter": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "condition": { + "unresolvedFunction": { + "functionName": "\u003d", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, { + "literal": { + "long": "1" + } + }] + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin new file mode 100644 index 0000000000000..e472ed0715b62 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/where_column.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/where_expr.json b/connector/connect/common/src/test/resources/query-tests/queries/where_expr.json new file mode 100644 index 0000000000000..dc7523bcaade4 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/where_expr.json @@ -0,0 +1,20 @@ +{ + "common": { + "planId": "1" + }, + "filter": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "condition": { + "expressionString": { + "expression": "a + id \u003c 1000" + } + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/where_expr.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/where_expr.proto.bin new file mode 100644 index 0000000000000..380a1763b81ec Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/where_expr.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/window.json b/connector/connect/common/src/test/resources/query-tests/queries/window.json new file mode 100644 index 0000000000000..23fd5c1556ec5 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/window.json @@ -0,0 +1,211 @@ +{ + "common": { + "planId": "1" + }, + "project": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "expressions": [{ + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }, { + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }, + "partitionSpec": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }] + } + }, { + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }, + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }, { + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }, + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }, { + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "b" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }] + } + }, { + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }, + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + 
"unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "frameSpec": { + "frameType": "FRAME_TYPE_ROW", + "lower": { + "value": { + "literal": { + "integer": 2 + } + } + }, + "upper": { + "value": { + "literal": { + "integer": 3 + } + } + } + } + } + }, { + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "min", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + }, + "orderSpec": [{ + "child": { + "unresolvedAttribute": { + "unparsedIdentifier": "a" + } + }, + "direction": "SORT_DIRECTION_ASCENDING", + "nullOrdering": "SORT_NULLS_FIRST" + }], + "frameSpec": { + "frameType": "FRAME_TYPE_RANGE", + "lower": { + "value": { + "literal": { + "long": "2" + } + } + }, + "upper": { + "value": { + "literal": { + "long": "3" + } + } + } + } + } + }, { + "window": { + "windowFunction": { + "unresolvedFunction": { + "functionName": "count", + "arguments": [{ + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }] + } + } + } + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/window.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/window.proto.bin new file mode 100644 index 0000000000000..a89c0d6a6a3f4 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/window.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_java_map.json b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_java_map.json new file mode 100644 index 0000000000000..731cf844afe6d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_java_map.json @@ -0,0 +1,19 @@ +{ + "common": { + "planId": "1" + }, + "withColumnsRenamed": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "renameColumnsMap": { + "b": "bravo", + "id": "nid" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_java_map.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_java_map.proto.bin new file mode 100644 index 0000000000000..64fcf7855ecbf Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_java_map.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_scala_map.json b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_scala_map.json new file mode 100644 index 0000000000000..570bfa32233d9 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_scala_map.json @@ -0,0 +1,19 @@ +{ + "common": { + "planId": "1" + }, + "withColumnsRenamed": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "renameColumnsMap": { + "a": "alpha", + "b": "beta" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_scala_map.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_scala_map.proto.bin new file mode 100644 index 0000000000000..42df8ea1d1111 Binary files /dev/null and 
b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_scala_map.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_single.json b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_single.json new file mode 100644 index 0000000000000..23b2e1d41d3cb --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_single.json @@ -0,0 +1,18 @@ +{ + "common": { + "planId": "1" + }, + "withColumnsRenamed": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "renameColumnsMap": { + "id": "nid" + } + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_single.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_single.proto.bin new file mode 100644 index 0000000000000..f46d01646c6f1 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/withColumnRenamed_single.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumn_single.json b/connector/connect/common/src/test/resources/query-tests/queries/withColumn_single.json new file mode 100644 index 0000000000000..8863d15f2764d --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withColumn_single.json @@ -0,0 +1,23 @@ +{ + "common": { + "planId": "1" + }, + "withColumns": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "aliases": [{ + "expr": { + "expressionString": { + "expression": "a + 100" + } + }, + "name": ["z"] + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumn_single.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withColumn_single.proto.bin new file mode 100644 index 0000000000000..6d53a883a5f40 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/withColumn_single.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumns_java_map.json b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_java_map.json new file mode 100644 index 0000000000000..a59f4abd47ce1 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_java_map.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "withColumns": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "aliases": [{ + "expr": { + "literal": { + "string": "123" + } + }, + "name": ["a"] + }, { + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "id" + } + }, + "name": ["g"] + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumns_java_map.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_java_map.proto.bin new file mode 100644 index 0000000000000..be381f62594c8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_java_map.proto.bin differ diff --git 
a/connector/connect/common/src/test/resources/query-tests/queries/withColumns_scala_map.json b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_scala_map.json new file mode 100644 index 0000000000000..99405a73041fa --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_scala_map.json @@ -0,0 +1,30 @@ +{ + "common": { + "planId": "1" + }, + "withColumns": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "aliases": [{ + "expr": { + "literal": { + "string": "redacted" + } + }, + "name": ["b"] + }, { + "expr": { + "expressionString": { + "expression": "a + 100" + } + }, + "name": ["z"] + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withColumns_scala_map.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_scala_map.proto.bin new file mode 100644 index 0000000000000..77ee1900e73fd Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/withColumns_scala_map.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withMetadata.json b/connector/connect/common/src/test/resources/query-tests/queries/withMetadata.json new file mode 100644 index 0000000000000..6ba7e5cd55bdd --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/queries/withMetadata.json @@ -0,0 +1,25 @@ +{ + "common": { + "planId": "1" + }, + "withColumns": { + "input": { + "common": { + "planId": "0" + }, + "localRelation": { + "schema": "struct\u003cid:bigint,a:int,b:double\u003e" + } + }, + "aliases": [{ + "expr": { + "unresolvedAttribute": { + "unparsedIdentifier": "id", + "planId": "0" + } + }, + "name": ["id"], + "metadata": "{\"description\":\"unique identifier\"}" + }] + } +} \ No newline at end of file diff --git a/connector/connect/common/src/test/resources/query-tests/queries/withMetadata.proto.bin b/connector/connect/common/src/test/resources/query-tests/queries/withMetadata.proto.bin new file mode 100644 index 0000000000000..f814b37d0ac2e Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/queries/withMetadata.proto.bin differ diff --git a/connector/connect/common/src/test/resources/query-tests/test-data/people.csv b/connector/connect/common/src/test/resources/query-tests/test-data/people.csv new file mode 100644 index 0000000000000..7fe5adba93d77 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/test-data/people.csv @@ -0,0 +1,3 @@ +name;age;job +Jorge;30;Developer +Bob;32;Developer diff --git a/connector/connect/common/src/test/resources/query-tests/test-data/people.json b/connector/connect/common/src/test/resources/query-tests/test-data/people.json new file mode 100644 index 0000000000000..50a859cbd7ee8 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/test-data/people.json @@ -0,0 +1,3 @@ +{"name":"Michael"} +{"name":"Andy", "age":30} +{"name":"Justin", "age":19} diff --git a/connector/connect/common/src/test/resources/query-tests/test-data/people.txt b/connector/connect/common/src/test/resources/query-tests/test-data/people.txt new file mode 100644 index 0000000000000..3bcace4a44c23 --- /dev/null +++ b/connector/connect/common/src/test/resources/query-tests/test-data/people.txt @@ -0,0 +1,3 @@ +Michael, 29 +Andy, 30 +Justin, 19 diff --git 
a/connector/connect/common/src/test/resources/query-tests/test-data/users.orc b/connector/connect/common/src/test/resources/query-tests/test-data/users.orc new file mode 100644 index 0000000000000..12478a5d03c26 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/test-data/users.orc differ diff --git a/connector/connect/common/src/test/resources/query-tests/test-data/users.parquet b/connector/connect/common/src/test/resources/query-tests/test-data/users.parquet new file mode 100644 index 0000000000000..aa527338c43a8 Binary files /dev/null and b/connector/connect/common/src/test/resources/query-tests/test-data/users.parquet differ diff --git a/connector/connect/common/src/test/scala/org/apache/spark/sql/TestUDFs.scala b/connector/connect/common/src/test/scala/org/apache/spark/sql/TestUDFs.scala new file mode 100644 index 0000000000000..20c9201e7c9f4 --- /dev/null +++ b/connector/connect/common/src/test/scala/org/apache/spark/sql/TestUDFs.scala @@ -0,0 +1,43 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +/** + * A bunch of functions used for testing UDF serialization. + */ +object TestUDFs { + type L = Long + val udf0: () => Double = new Function0[Double] with Serializable { + override def apply(): Double = Math.random() + } + + val udf1: L => Tuple1[L] = new (L => Tuple1[L]) with Serializable { + override def apply(i0: L): Tuple1[L] = Tuple1(i0) + } + + val udf2: (L, L) => (L, L) = new ((L, L) => (L, L)) with Serializable { + override def apply(i0: L, i1: L): (L, L) = (i0, i1) + } + + val udf3: (L, L, L) => (L, L, L) = new ((L, L, L) => (L, L, L)) with Serializable { + override def apply(i0: L, i1: L, i2: L): (L, L, L) = (i0, i1, i2) + } + + val udf4: (L, L, L, L) => (L, L, L, L) = new ((L, L, L, L) => (L, L, L, L)) with Serializable { + override def apply(i0: L, i1: L, i2: L, i3: L): (L, L, L, L) = (i0, i1, i2, i3) + } +} diff --git a/connector/connect/docs/adding-proto-messages.md b/connector/connect/docs/adding-proto-messages.md new file mode 100644 index 0000000000000..85e7bb79e0a32 --- /dev/null +++ b/connector/connect/docs/adding-proto-messages.md @@ -0,0 +1,40 @@ +# Required, Optional and default values + +Spark Connect adopts proto3, which does not support the use of the `required` constraint anymore. +For non-message proto fields, there are also no `has_field_name` functions to easily tell +if a field is set or not set. (Read [proto3 field rules](https://developers.google.com/protocol-buffers/docs/proto3#specifying_field_rules)) + + +### Required field + +When adding fields that have required semantics, developers are required to follow +the outlined process.
+Fields that are semantically required for the server to
+correctly process the incoming message must be documented with `(Required)`. For scalar
+fields the server will not perform any additional input validation. For compound fields,
+the server will perform minimal checks to avoid null pointer exceptions but will not
+perform any semantic validation.
+
+Example:
+```protobuf
+message DataSource {
+  // (Required) Supported formats include: parquet, orc, text, json, csv, avro.
+  string format = 1;
+}
+```
+
+
+### Optional fields
+
+Semantically optional fields must be marked by `optional`. The server side will
+then use this information to branch into different behaviors based on the presence or absence of this field.
+
+Due to the lack of configurable default values for scalar types, the pure presence of
+an optional value does not define its default value. The server-side implementation will interpret the observed value based on its own rules.
+
+Example:
+```protobuf
+message DataSource {
+  // (Optional) If not set, Spark will infer the schema.
+  optional string schema = 2;
+}
+```
diff --git a/connector/connect/docs/client-connection-string.md b/connector/connect/docs/client-connection-string.md
new file mode 100644
index 0000000000000..6e5b0c80db7aa
--- /dev/null
+++ b/connector/connect/docs/client-connection-string.md
@@ -0,0 +1,126 @@
+# Connecting to Spark Connect using Clients
+
+From the client perspective, Spark Connect mostly behaves as any other GRPC
+client and can be configured as such. However, to make it easy to use from
+different programming languages and to have a homogeneous connection surface,
+this document proposes what the user surface is for connecting to a
+Spark Connect endpoint.
+
+## Background
+Similar to JDBC or other database connections, Spark Connect leverages a
+connection string that contains the relevant parameters that are interpreted
+to connect to the Spark Connect endpoint.
+
+
+## Connection String
+
+Generally, the connection string follows the standard URI definitions. The URI
+scheme is fixed and set to `sc://`. The full URI has to be a
+[valid URI](http://www.faqs.org/rfcs/rfc2396.html) and must
+be parsed properly by most systems. For example, hostnames have to be valid and
+cannot contain arbitrary characters. Configuration parameters are passed in the
+style of the HTTP URL Path Parameter Syntax. This is similar to JDBC connection
+strings. The path component must be empty. All parameters are interpreted **case-sensitively**.
+
+```shell
+sc://hostname:port/;param1=value;param2=value
+```
+
+| Parameter | Type | Description | Examples |
+|-----------|------|-------------|----------|
+| `hostname` | String | The hostname of the endpoint for Spark Connect. Since the endpoint has to be a fully GRPC-compatible endpoint, a particular path cannot be specified. The hostname must be fully qualified or can be an IP address. | `myexample.com`, `127.0.0.1` |
+| `port` | Numeric | The port to use when connecting to the GRPC endpoint. The default value is 15002. Any valid port number can be used. | `15002`, `443` |
+| `token` | String | When this parameter is set in the URL, it enables standard bearer token authentication using GRPC. By default this value is not set. Setting this value enables SSL. | `token=ABCDEFGH` |
+| `use_ssl` | Boolean | When this flag is set, the client connects to the endpoint using TLS. The assumption is that the certificates needed to verify the server certificate are available on the system. The default value is `false`. | `use_ssl=true`, `use_ssl=false` |
+| `user_id` | String | User ID to automatically set in the Spark Connect UserContext message. This is necessary for proper Spark session management. This is an *optional* parameter and, depending on the deployment, it might be injected automatically by other means. | `user_id=Martin` |
+| `user_agent` | String | The user agent acting on behalf of the user, typically an application that uses Spark Connect to implement its functionality and execute Spark requests on behalf of the user. Default: `_SPARK_CONNECT_PYTHON` in the Python client. | `user_agent=my_data_query_app` |
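+
+The sketch below is illustrative only: it is not the actual Spark Connect client
+implementation, and the helper name `ConnectionStringSketch` is invented. It merely shows
+how a connection string with the defaults from the table above (for example, port `15002`)
+decomposes into host, port and parameters.
+
+```scala
+import java.net.URI
+
+// Hypothetical helper for illustration: split "sc://host:port/;k=v;k=v" into
+// host, port and a parameter map. Real clients perform additional validation.
+object ConnectionStringSketch {
+  def parse(conn: String): (String, Int, Map[String, String]) = {
+    val uri = new URI(conn)
+    require(uri.getScheme == "sc", s"Unexpected scheme: ${uri.getScheme}")
+    // Default port from the table above when none is given in the URI.
+    val port = if (uri.getPort == -1) 15002 else uri.getPort
+    // The path must stay empty; only ";key=value" parameters may follow "/".
+    val params = Option(uri.getPath).getOrElse("")
+      .split(";")
+      .filter(_.contains("="))
+      .map { kv =>
+        val Array(k, v) = kv.split("=", 2)
+        k -> v
+      }
+      .toMap
+    (uri.getHost, port, params)
+  }
+}
+
+// ConnectionStringSketch.parse("sc://myhost.com:443/;use_ssl=true;token=ABCDEFG")
+// yields ("myhost.com", 443, Map("use_ssl" -> "true", "token" -> "ABCDEFG")).
+```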
+ +## Examples + +### Valid Examples +Below we capture valid configuration examples, explaining how the connection string +will be used when configuring the Spark Connect client. + +The below example connects to port **`15002`** on **myhost.com**. +```python +server_url = "sc://myhost.com/" +``` + +The next example configures the connection to use a different port with SSL. + +```python +server_url = "sc://myhost.com:443/;use_ssl=true" +``` + +```python +server_url = "sc://myhost.com:443/;use_ssl=true;token=ABCDEFG" +``` + +### Invalid Examples + +As mentioned above, Spark Connect uses a regular GRPC client and the server path +cannot be configured to remain compatible with the GRPC standard and HTTP. For +example the following examles are invalid. + +```python +server_url = "sc://myhost.com:443/mypathprefix/;token=AAAAAAA" +``` + diff --git a/connector/connect/server/pom.xml b/connector/connect/server/pom.xml new file mode 100644 index 0000000000000..4df855efcc640 --- /dev/null +++ b/connector/connect/server/pom.xml @@ -0,0 +1,383 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../../pom.xml + + + spark-connect_2.12 + jar + Spark Project Connect Server + https://spark.apache.org/ + + connect + 31.0.1-jre + 1.0.1 + 1.47.0 + 6.0.53 + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + provided + + + com.google.guava + guava + + + + + org.apache.spark + spark-connect-common_${scala.binary.version} + ${project.version} + + + com.google.guava + guava + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + provided + + + com.google.guava + guava + + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + provided + + + com.google.guava + guava + + + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-connect-common_${scala.binary.version} + ${project.version} + test-jar + test + + + com.google.guava + guava + + + + + org.apache.spark + spark-tags_${scala.binary.version} + ${project.version} + provided + + + com.google.guava + guava + + + + + + com.google.guava + guava + ${guava.version} + compile + + + com.google.guava + failureaccess + ${guava.failureaccess.version} + + + com.google.protobuf + protobuf-java + ${protobuf.version} + compile + + + com.google.protobuf + protobuf-java-util + ${protobuf.version} + compile + + + io.grpc + grpc-netty + ${io.grpc.version} + + + io.grpc + grpc-protobuf + ${io.grpc.version} + + + io.grpc + grpc-services + ${io.grpc.version} + + + io.grpc + grpc-stub + ${io.grpc.version} + + + io.netty + netty-codec-http2 + ${netty.version} + provided + + + io.netty + netty-handler-proxy + ${netty.version} + provided + + + io.netty + netty-transport-native-unix-common + ${netty.version} + provided + + + org.apache.tomcat + annotations-api + ${tomcat.annotations.api.version} + provided + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.mockito + mockito-core + test + + + com.h2database + h2 + 2.1.214 + test + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.codehaus.mojo + build-helper-maven-plugin + + + add-sources + generate-sources + + add-source + + + + src/main/scala-${scala.binary.version} + 
+ + + + add-scala-test-sources + generate-test-sources + + add-test-source + + + + src/test/gen-java + + + + + + + + org.apache.maven.plugins + maven-shade-plugin + + false + + + com.google.guava:* + io.grpc:*: + com.google.protobuf:* + + + com.google.android:annotations + com.google.api.grpc:proto-google-common-protos + io.perfmark:perfmark-api + org.codehaus.mojo:animal-sniffer-annotations + com.google.errorprone:error_prone_annotations + com.google.j2objc:j2objc-annotations + org.checkerframework:checker-qual + com.google.code.gson:gson + org.apache.spark:spark-connect-common_${scala.binary.version} + + + + + com.google.common + ${spark.shade.packageName}.connect.guava + + com.google.common.** + + + + com.google.thirdparty + ${spark.shade.packageName}.connect.guava + + com.google.thirdparty.** + + + + com.google.protobuf + ${spark.shade.packageName}.connect.protobuf + + com.google.protobuf.** + + + + io.grpc + ${spark.shade.packageName}.connect.grpc + + + + android.annotation + ${spark.shade.packageName}.connect.android_annotation + + + io.perfmark + ${spark.shade.packageName}.connect.io_perfmark + + + org.codehaus.mojo.animal_sniffer + ${spark.shade.packageName}.connect.animal_sniffer + + + com.google.j2objc.annotations + ${spark.shade.packageName}.connect.j2objc_annotations + + + com.google.errorprone.annotations + ${spark.shade.packageName}.connect.errorprone_annotations + + + org.checkerframework + ${spark.shade.packageName}.connect.checkerframework + + + com.google.gson + ${spark.shade.packageName}.connect.gson + + + + + com.google.api + ${spark.shade.packageName}.connect.google_protos.api + + + com.google.cloud + ${spark.shade.packageName}.connect.google_protos.cloud + + + com.google.geo + ${spark.shade.packageName}.connect.google_protos.geo + + + com.google.logging + ${spark.shade.packageName}.connect.google_protos.logging + + + com.google.longrunning + ${spark.shade.packageName}.connect.google_protos.longrunning + + + com.google.rpc + ${spark.shade.packageName}.connect.google_protos.rpc + + + com.google.type + ${spark.shade.packageName}.connect.google_protos.type + + + + + + + + + + diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/SimpleSparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/SimpleSparkConnectService.scala new file mode 100644 index 0000000000000..b1376e5131a72 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/SimpleSparkConnectService.scala @@ -0,0 +1,61 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect + +import java.util.concurrent.TimeUnit + +import scala.io.StdIn +import scala.sys.exit + +import org.apache.spark.SparkConf +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connect.service.SparkConnectService + +/** + * A simple main class method to start the spark connect server as a service for client tests + * using spark-submit: + * {{{ + * bin/spark-submit --class org.apache.spark.sql.connect.SimpleSparkConnectService + * }}} + * The service can be stopped by receiving a stop command or until the service get killed. + */ +private[sql] object SimpleSparkConnectService { + private val stopCommand = "q" + + def main(args: Array[String]): Unit = { + val conf = new SparkConf() + val sparkSession = SparkSession.builder().config(conf).getOrCreate() + val sparkContext = sparkSession.sparkContext // init spark context + SparkConnectService.start() + // scalastyle:off println + println("Ready for client connections.") + // scalastyle:on println + while (true) { + val code = StdIn.readLine() + if (code == stopCommand) { + // scalastyle:off println + println("No more client connections.") + // scalastyle:on println + // Wait for 1 min for the server to stop + SparkConnectService.stop(Some(1), Some(TimeUnit.MINUTES)) + sparkSession.close() + exit(0) + } + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala new file mode 100644 index 0000000000000..bb694a7679890 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/SparkConnectPlugin.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect + +import java.util + +import scala.collection.JavaConverters._ + +import org.apache.spark.SparkContext +import org.apache.spark.api.plugin.{DriverPlugin, ExecutorPlugin, PluginContext, SparkPlugin} +import org.apache.spark.sql.connect.service.SparkConnectService + +/** + * This is the main entry point for Spark Connect. + * + * To decouple the build of Spark Connect and its dependencies from the core of Spark, we + * implement it as a Driver Plugin. To enable Spark Connect, simply make sure that the appropriate + * JAR is available in the CLASSPATH and the driver plugin is configured to load this class. + */ +class SparkConnectPlugin extends SparkPlugin { + + /** + * Return the plugin's driver-side component. + * + * @return + * The driver-side component. 
+ */ + override def driverPlugin(): DriverPlugin = new DriverPlugin { + + override def init( + sc: SparkContext, + pluginContext: PluginContext): util.Map[String, String] = { + SparkConnectService.start() + Map.empty[String, String].asJava + } + + override def shutdown(): Unit = { + SparkConnectService.stop() + } + } + + override def executorPlugin(): ExecutorPlugin = null +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala new file mode 100644 index 0000000000000..19fdad97b5ffb --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/config/Connect.scala @@ -0,0 +1,93 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.config + +import org.apache.spark.internal.config.ConfigBuilder +import org.apache.spark.network.util.ByteUnit +import org.apache.spark.sql.connect.common.config.ConnectCommon + +object Connect { + + val CONNECT_GRPC_BINDING_PORT = + ConfigBuilder("spark.connect.grpc.binding.port") + .version("3.4.0") + .intConf + .createWithDefault(ConnectCommon.CONNECT_GRPC_BINDING_PORT) + + val CONNECT_GRPC_INTERCEPTOR_CLASSES = + ConfigBuilder("spark.connect.grpc.interceptor.classes") + .doc( + "Comma separated list of class names that must " + + "implement the io.grpc.ServerInterceptor interface.") + .version("3.4.0") + .stringConf + .createOptional + + val CONNECT_GRPC_ARROW_MAX_BATCH_SIZE = + ConfigBuilder("spark.connect.grpc.arrow.maxBatchSize") + .doc( + "When using Apache Arrow, limit the maximum size of one arrow batch that " + + "can be sent from server side to client side. Currently, we conservatively use 70% " + + "of it because the size is not accurate but estimated.") + .version("3.4.0") + .bytesConf(ByteUnit.MiB) + .createWithDefaultString("4m") + + val CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE = + ConfigBuilder("spark.connect.grpc.maxInboundMessageSize") + .doc("Sets the maximum inbound message in bytes size for the gRPC requests." + + "Requests with a larger payload will fail.") + .version("3.4.0") + .bytesConf(ByteUnit.BYTE) + .createWithDefault(ConnectCommon.CONNECT_GRPC_MAX_MESSAGE_SIZE) + + val CONNECT_EXTENSIONS_RELATION_CLASSES = + ConfigBuilder("spark.connect.extensions.relation.classes") + .doc(""" + |Comma separated list of classes that implement the trait + |org.apache.spark.sql.connect.plugin.RelationPlugin to support custom + |Relation types in proto. 
+ |""".stripMargin) + .version("3.4.0") + .stringConf + .toSequence + .createWithDefault(Nil) + + val CONNECT_EXTENSIONS_EXPRESSION_CLASSES = + ConfigBuilder("spark.connect.extensions.expression.classes") + .doc(""" + |Comma separated list of classes that implement the trait + |org.apache.spark.sql.connect.plugin.ExpressionPlugin to support custom + |Expression types in proto. + |""".stripMargin) + .version("3.4.0") + .stringConf + .toSequence + .createWithDefault(Nil) + + val CONNECT_EXTENSIONS_COMMAND_CLASSES = + ConfigBuilder("spark.connect.extensions.command.classes") + .doc(""" + |Comma separated list of classes that implement the trait + |org.apache.spark.sql.connect.plugin.CommandPlugin to support custom + |Command types in proto. + |""".stripMargin) + .version("3.4.0") + .stringConf + .toSequence + .createWithDefault(Nil) +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala new file mode 100644 index 0000000000000..21b9180ccfbc0 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/dsl/package.scala @@ -0,0 +1,1121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect + +import scala.collection.JavaConverters._ +import scala.language.implicitConversions + +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto._ +import org.apache.spark.connect.proto.Expression.ExpressionString +import org.apache.spark.connect.proto.Join.JoinType +import org.apache.spark.connect.proto.SetOperation.SetOpType +import org.apache.spark.sql.{Observation, SaveMode} +import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto +import org.apache.spark.sql.connect.planner.{SaveModeConverter, TableSaveMethodConverter} +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.Utils + +/** + * A collection of implicit conversions that create a DSL for constructing connect protos. + * + * All classes in connect/dsl are considered an internal API to Spark Connect and are subject to + * change between minor releases. 
+ */ + +package object dsl { + + class MockRemoteSession {} + + object expressions { // scalastyle:ignore + implicit class DslString(val s: String) { + def protoAttr: Expression = + Expression + .newBuilder() + .setUnresolvedAttribute( + Expression.UnresolvedAttribute + .newBuilder() + .setUnparsedIdentifier(s)) + .build() + + def colRegex: Expression = + Expression + .newBuilder() + .setUnresolvedRegex( + Expression.UnresolvedRegex + .newBuilder() + .setColName(s)) + .build() + + def asc: Expression = + Expression + .newBuilder() + .setSortOrder( + Expression.SortOrder + .newBuilder() + .setChild(protoAttr) + .setDirectionValue( + proto.Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING_VALUE) + .setNullOrdering(proto.Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST)) + .build() + } + + implicit class DslExpression(val expr: Expression) { + def as(alias: String): Expression = Expression + .newBuilder() + .setAlias(Expression.Alias.newBuilder().addName(alias).setExpr(expr)) + .build() + + def as(alias: String, metadata: String): Expression = Expression + .newBuilder() + .setAlias( + Expression.Alias + .newBuilder() + .setExpr(expr) + .addName(alias) + .setMetadata(metadata) + .build()) + .build() + + def as(alias: Seq[String]): Expression = Expression + .newBuilder() + .setAlias( + Expression.Alias + .newBuilder() + .setExpr(expr) + .addAllName(alias.asJava) + .build()) + .build() + + def <(other: Expression): Expression = + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction + .newBuilder() + .setFunctionName("<") + .addArguments(expr) + .addArguments(other)) + .build() + + def cast(dataType: DataType): Expression = + Expression + .newBuilder() + .setCast( + Expression.Cast + .newBuilder() + .setExpr(expr) + .setType(dataType)) + .build() + + def cast(dataType: String): Expression = + Expression + .newBuilder() + .setCast( + Expression.Cast + .newBuilder() + .setExpr(expr) + .setTypeStr(dataType)) + .build() + } + + def proto_min(e: Expression): Expression = + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction.newBuilder().setFunctionName("min").addArguments(e)) + .build() + + def proto_max(e: Expression): Expression = + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction.newBuilder().setFunctionName("max").addArguments(e)) + .build() + + def proto_sum(e: Expression): Expression = + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction.newBuilder().setFunctionName("sum").addArguments(e)) + .build() + + def proto_explode(e: Expression): Expression = + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction.newBuilder().setFunctionName("explode").addArguments(e)) + .build() + + /** + * Create an unresolved function from name parts. + * + * @param nameParts + * @param args + * @return + * Expression wrapping the unresolved function. + */ + def callFunction(nameParts: Seq[String], args: Seq[Expression]): Expression = { + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction + .newBuilder() + .setFunctionName(nameParts.mkString(".")) + .setIsUserDefinedFunction(true) + .addAllArguments(args.asJava)) + .build() + } + + /** + * Creates an UnresolvedFunction from a single identifier. + * + * @param name + * @param args + * @return + * Expression wrapping the unresolved function. 
+ */ + def callFunction(name: String, args: Seq[Expression]): Expression = { + Expression + .newBuilder() + .setUnresolvedFunction( + Expression.UnresolvedFunction + .newBuilder() + .setFunctionName(name) + .addAllArguments(args.asJava)) + .build() + } + + implicit def intToLiteral(i: Int): Expression = + Expression + .newBuilder() + .setLiteral(Expression.Literal.newBuilder().setInteger(i)) + .build() + } + + object commands { // scalastyle:ignore + implicit class DslCommands(val logicalPlan: Relation) { + def write( + format: Option[String] = None, + path: Option[String] = None, + tableName: Option[String] = None, + tableSaveMethod: Option[String] = None, + mode: Option[String] = None, + sortByColumns: Seq[String] = Seq.empty, + partitionByCols: Seq[String] = Seq.empty, + bucketByCols: Seq[String] = Seq.empty, + numBuckets: Option[Int] = None): Command = { + val writeOp = WriteOperation.newBuilder() + format.foreach(writeOp.setSource(_)) + + mode + .map(SaveMode.valueOf(_)) + .map(SaveModeConverter.toSaveModeProto) + .foreach(writeOp.setMode(_)) + + if (tableName.nonEmpty) { + tableName.foreach { tn => + val saveTable = WriteOperation.SaveTable.newBuilder().setTableName(tn) + tableSaveMethod + .map(TableSaveMethodConverter.toTableSaveMethodProto(_)) + .foreach(saveTable.setSaveMethod(_)) + writeOp.setTable(saveTable.build()) + } + } else { + path.foreach(writeOp.setPath(_)) + } + sortByColumns.foreach(writeOp.addSortColumnNames(_)) + partitionByCols.foreach(writeOp.addPartitioningColumns(_)) + + if (numBuckets.nonEmpty && bucketByCols.nonEmpty) { + val op = WriteOperation.BucketBy.newBuilder() + numBuckets.foreach(op.setNumBuckets(_)) + bucketByCols.foreach(op.addBucketColumnNames(_)) + writeOp.setBucketBy(op.build()) + } + writeOp.setInput(logicalPlan) + Command.newBuilder().setWriteOperation(writeOp.build()).build() + } + + def createView(name: String, global: Boolean, replace: Boolean): Command = { + Command + .newBuilder() + .setCreateDataframeView( + CreateDataFrameViewCommand + .newBuilder() + .setName(name) + .setIsGlobal(global) + .setReplace(replace) + .setInput(logicalPlan)) + .build() + } + + def writeV2( + tableName: Option[String] = None, + provider: Option[String] = None, + options: Map[String, String] = Map.empty, + tableProperties: Map[String, String] = Map.empty, + partitionByCols: Seq[Expression] = Seq.empty, + mode: Option[String] = None, + overwriteCondition: Option[Expression] = None): Command = { + val writeOp = WriteOperationV2.newBuilder() + writeOp.setInput(logicalPlan) + tableName.foreach(writeOp.setTableName) + provider.foreach(writeOp.setProvider) + partitionByCols.foreach(writeOp.addPartitioningColumns) + options.foreach { case (k, v) => + writeOp.putOptions(k, v) + } + tableProperties.foreach { case (k, v) => + writeOp.putTableProperties(k, v) + } + mode.foreach { m => + if (m == "MODE_CREATE") { + writeOp.setMode(WriteOperationV2.Mode.MODE_CREATE) + } else if (m == "MODE_OVERWRITE") { + writeOp.setMode(WriteOperationV2.Mode.MODE_OVERWRITE) + overwriteCondition.foreach(writeOp.setOverwriteCondition) + } else if (m == "MODE_OVERWRITE_PARTITIONS") { + writeOp.setMode(WriteOperationV2.Mode.MODE_OVERWRITE_PARTITIONS) + } else if (m == "MODE_APPEND") { + writeOp.setMode(WriteOperationV2.Mode.MODE_APPEND) + } else if (m == "MODE_REPLACE") { + writeOp.setMode(WriteOperationV2.Mode.MODE_REPLACE) + } else if (m == "MODE_CREATE_OR_REPLACE") { + writeOp.setMode(WriteOperationV2.Mode.MODE_CREATE_OR_REPLACE) + } + } + 
Command.newBuilder().setWriteOperationV2(writeOp.build()).build() + } + } + } + + object plans { // scalastyle:ignore + implicit class DslMockRemoteSession(val session: MockRemoteSession) { + def range( + start: Option[Long], + end: Long, + step: Option[Long], + numPartitions: Option[Int]): Relation = { + val range = proto.Range.newBuilder() + if (start.isDefined) { + range.setStart(start.get) + } + range.setEnd(end) + if (step.isDefined) { + range.setStep(step.get) + } else { + range.setStep(1L) + } + if (numPartitions.isDefined) { + range.setNumPartitions(numPartitions.get) + } + Relation.newBuilder().setRange(range).build() + } + + def sql(sqlText: String): Relation = { + Relation.newBuilder().setSql(SQL.newBuilder().setQuery(sqlText)).build() + } + } + + implicit class DslNAFunctions(val logicalPlan: Relation) { + + def fillValue(value: Any): Relation = { + Relation + .newBuilder() + .setFillNa( + proto.NAFill + .newBuilder() + .setInput(logicalPlan) + .addAllValues(Seq(toLiteralProto(value)).asJava) + .build()) + .build() + } + + def fillColumns(value: Any, cols: Seq[String]): Relation = { + Relation + .newBuilder() + .setFillNa( + proto.NAFill + .newBuilder() + .setInput(logicalPlan) + .addAllCols(cols.asJava) + .addAllValues(Seq(toLiteralProto(value)).asJava) + .build()) + .build() + } + + def fillValueMap(valueMap: Map[String, Any]): Relation = { + val (cols, values) = valueMap.mapValues(toLiteralProto).toSeq.unzip + Relation + .newBuilder() + .setFillNa( + proto.NAFill + .newBuilder() + .setInput(logicalPlan) + .addAllCols(cols.asJava) + .addAllValues(values.asJava) + .build()) + .build() + } + + def drop( + how: Option[String] = None, + minNonNulls: Option[Int] = None, + cols: Seq[String] = Seq.empty): Relation = { + require(!(how.nonEmpty && minNonNulls.nonEmpty)) + require(how.isEmpty || Seq("any", "all").contains(how.get)) + + val dropna = proto.NADrop + .newBuilder() + .setInput(logicalPlan) + + if (cols.nonEmpty) { + dropna.addAllCols(cols.asJava) + } + + var _minNonNulls = -1 + how match { + case Some("all") => _minNonNulls = 1 + case _ => + } + if (minNonNulls.nonEmpty) { + _minNonNulls = minNonNulls.get + } + if (_minNonNulls > 0) { + dropna.setMinNonNulls(_minNonNulls) + } + + Relation + .newBuilder() + .setDropNa(dropna.build()) + .build() + } + + def replace(cols: Seq[String], replacement: Map[Any, Any]): Relation = { + require(cols.nonEmpty) + + val replace = proto.NAReplace + .newBuilder() + .setInput(logicalPlan) + + if (!(cols.length == 1 && cols.head == "*")) { + replace.addAllCols(cols.asJava) + } + + replacement.foreach { case (oldValue, newValue) => + replace.addReplacements( + proto.NAReplace.Replacement + .newBuilder() + .setOldValue(toLiteralProto(oldValue)) + .setNewValue(toLiteralProto(newValue))) + } + + Relation + .newBuilder() + .setReplace(replace.build()) + .build() + } + } + + implicit class DslStatFunctions(val logicalPlan: Relation) { + def cov(col1: String, col2: String): Relation = { + Relation + .newBuilder() + .setCov( + proto.StatCov + .newBuilder() + .setInput(logicalPlan) + .setCol1(col1) + .setCol2(col2) + .build()) + .build() + } + + def corr(col1: String, col2: String, method: String): Relation = { + Relation + .newBuilder() + .setCorr( + proto.StatCorr + .newBuilder() + .setInput(logicalPlan) + .setCol1(col1) + .setCol2(col2) + .setMethod(method) + .build()) + .build() + } + + def corr(col1: String, col2: String): Relation = corr(col1, col2, "pearson") + + def approxQuantile( + cols: Array[String], + probabilities: Array[Double], + 
relativeError: Double): Relation = { + Relation + .newBuilder() + .setApproxQuantile( + proto.StatApproxQuantile + .newBuilder() + .setInput(logicalPlan) + .addAllCols(cols.toSeq.asJava) + .addAllProbabilities(probabilities.toSeq.map(Double.box).asJava) + .setRelativeError(relativeError) + .build()) + .build() + } + + def crosstab(col1: String, col2: String): Relation = { + Relation + .newBuilder() + .setCrosstab( + proto.StatCrosstab + .newBuilder() + .setInput(logicalPlan) + .setCol1(col1) + .setCol2(col2) + .build()) + .build() + } + + def freqItems(cols: Array[String], support: Double): Relation = { + Relation + .newBuilder() + .setFreqItems( + proto.StatFreqItems + .newBuilder() + .setInput(logicalPlan) + .addAllCols(cols.toSeq.asJava) + .setSupport(support) + .build()) + .build() + } + + def freqItems(cols: Array[String]): Relation = freqItems(cols, 0.01) + + def freqItems(cols: Seq[String], support: Double): Relation = + freqItems(cols.toArray, support) + + def freqItems(cols: Seq[String]): Relation = freqItems(cols, 0.01) + } + + def select(exprs: Expression*): Relation = { + Relation + .newBuilder() + .setProject( + Project + .newBuilder() + .addAllExpressions(exprs.toIterable.asJava) + .build()) + .build() + } + + implicit class DslLogicalPlan(val logicalPlan: Relation) { + def select(exprs: Expression*): Relation = { + Relation + .newBuilder() + .setProject( + Project + .newBuilder() + .setInput(logicalPlan) + .addAllExpressions(exprs.toIterable.asJava) + .build()) + .build() + } + + def selectExpr(exprs: String*): Relation = + select(exprs.map { expr => + Expression + .newBuilder() + .setExpressionString(ExpressionString.newBuilder().setExpression(expr)) + .build() + }: _*) + + def tail(limit: Int): Relation = { + Relation + .newBuilder() + .setTail( + Tail + .newBuilder() + .setInput(logicalPlan) + .setLimit(limit)) + .build() + } + + def limit(limit: Int): Relation = { + Relation + .newBuilder() + .setLimit( + Limit + .newBuilder() + .setInput(logicalPlan) + .setLimit(limit)) + .build() + } + + def offset(offset: Int): Relation = { + Relation + .newBuilder() + .setOffset( + Offset + .newBuilder() + .setInput(logicalPlan) + .setOffset(offset)) + .build() + } + + def where(condition: Expression): Relation = { + Relation + .newBuilder() + .setFilter(Filter.newBuilder().setInput(logicalPlan).setCondition(condition)) + .build() + } + + def deduplicate(colNames: Seq[String]): Relation = + Relation + .newBuilder() + .setDeduplicate( + Deduplicate + .newBuilder() + .setInput(logicalPlan) + .addAllColumnNames(colNames.asJava)) + .build() + + def distinct(): Relation = + Relation + .newBuilder() + .setDeduplicate( + Deduplicate + .newBuilder() + .setInput(logicalPlan) + .setAllColumnsAsKeys(true)) + .build() + + def join( + otherPlan: Relation, + joinType: JoinType, + condition: Option[Expression]): Relation = { + join(otherPlan, joinType, Seq(), condition) + } + + def join(otherPlan: Relation, condition: Option[Expression]): Relation = { + join(otherPlan, JoinType.JOIN_TYPE_INNER, Seq(), condition) + } + + def join(otherPlan: Relation): Relation = { + join(otherPlan, JoinType.JOIN_TYPE_INNER, Seq(), None) + } + + def join(otherPlan: Relation, joinType: JoinType): Relation = { + join(otherPlan, joinType, Seq(), None) + } + + def join(otherPlan: Relation, joinType: JoinType, usingColumns: Seq[String]): Relation = { + join(otherPlan, joinType, usingColumns, None) + } + + private def join( + otherPlan: Relation, + joinType: JoinType = JoinType.JOIN_TYPE_INNER, + usingColumns: Seq[String], 
+ condition: Option[Expression]): Relation = { + val relation = Relation.newBuilder() + val join = Join.newBuilder() + join + .setLeft(logicalPlan) + .setRight(otherPlan) + .setJoinType(joinType) + if (usingColumns.nonEmpty) { + join.addAllUsingColumns(usingColumns.asJava) + } + if (condition.isDefined) { + join.setJoinCondition(condition.get) + } + relation.setJoin(join).build() + } + + def as(alias: String): Relation = { + Relation + .newBuilder(logicalPlan) + .setSubqueryAlias(SubqueryAlias.newBuilder().setAlias(alias).setInput(logicalPlan)) + .build() + } + + def sample( + lowerBound: Double, + upperBound: Double, + withReplacement: Boolean, + seed: Long): Relation = { + Relation + .newBuilder() + .setSample( + Sample + .newBuilder() + .setInput(logicalPlan) + .setUpperBound(upperBound) + .setLowerBound(lowerBound) + .setWithReplacement(withReplacement) + .setSeed(seed) + .build()) + .build() + } + + private def createDefaultSortField(col: String): Expression.SortOrder = { + Expression.SortOrder + .newBuilder() + .setNullOrdering(Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST) + .setDirection(Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING) + .setChild( + Expression.newBuilder + .setUnresolvedAttribute( + Expression.UnresolvedAttribute.newBuilder.setUnparsedIdentifier(col).build()) + .build()) + .build() + } + + def sort(columns: String*): Relation = { + Relation + .newBuilder() + .setSort( + Sort + .newBuilder() + .setInput(logicalPlan) + .addAllOrder(columns.map(createDefaultSortField).asJava) + .setIsGlobal(true) + .build()) + .build() + } + + def sortWithinPartitions(columns: String*): Relation = { + Relation + .newBuilder() + .setSort( + Sort + .newBuilder() + .setInput(logicalPlan) + .addAllOrder(columns.map(createDefaultSortField).asJava) + .setIsGlobal(false) + .build()) + .build() + } + + def drop(columns: String*): Relation = { + assert(columns.nonEmpty) + + Relation + .newBuilder() + .setDrop( + Drop + .newBuilder() + .setInput(logicalPlan) + .addAllColumnNames(columns.toSeq.asJava) + .build()) + .build() + } + + def groupBy(groupingExprs: Expression*)(aggregateExprs: Expression*): Relation = { + val agg = Aggregate.newBuilder() + agg.setInput(logicalPlan) + agg.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY) + + for (groupingExpr <- groupingExprs) { + agg.addGroupingExpressions(groupingExpr) + } + for (aggregateExpr <- aggregateExprs) { + agg.addAggregateExpressions(aggregateExpr) + } + Relation.newBuilder().setAggregate(agg.build()).build() + } + + def rollup(groupingExprs: Expression*)(aggregateExprs: Expression*): Relation = { + val agg = Aggregate.newBuilder() + agg.setInput(logicalPlan) + agg.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP) + + for (groupingExpr <- groupingExprs) { + agg.addGroupingExpressions(groupingExpr) + } + for (aggregateExpr <- aggregateExprs) { + agg.addAggregateExpressions(aggregateExpr) + } + Relation.newBuilder().setAggregate(agg.build()).build() + } + + def cube(groupingExprs: Expression*)(aggregateExprs: Expression*): Relation = { + val agg = Aggregate.newBuilder() + agg.setInput(logicalPlan) + agg.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_CUBE) + + for (groupingExpr <- groupingExprs) { + agg.addGroupingExpressions(groupingExpr) + } + for (aggregateExpr <- aggregateExprs) { + agg.addAggregateExpressions(aggregateExpr) + } + Relation.newBuilder().setAggregate(agg.build()).build() + } + + def pivot(groupingExprs: Expression*)( + pivotCol: Expression, + pivotValues: 
Seq[proto.Expression.Literal])(aggregateExprs: Expression*): Relation = { + val agg = Aggregate.newBuilder() + agg.setInput(logicalPlan) + agg.setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_PIVOT) + + for (groupingExpr <- groupingExprs) { + agg.addGroupingExpressions(groupingExpr) + } + for (aggregateExpr <- aggregateExprs) { + agg.addAggregateExpressions(aggregateExpr) + } + agg.setPivot( + Aggregate.Pivot.newBuilder().setCol(pivotCol).addAllValues(pivotValues.asJava).build()) + + Relation.newBuilder().setAggregate(agg.build()).build() + } + + def except(otherPlan: Relation, isAll: Boolean): Relation = { + Relation + .newBuilder() + .setSetOp( + createSetOperation(logicalPlan, otherPlan, SetOpType.SET_OP_TYPE_EXCEPT, isAll)) + .build() + } + + def intersect(otherPlan: Relation, isAll: Boolean): Relation = + Relation + .newBuilder() + .setSetOp( + createSetOperation(logicalPlan, otherPlan, SetOpType.SET_OP_TYPE_INTERSECT, isAll)) + .build() + + def union( + otherPlan: Relation, + isAll: Boolean = true, + byName: Boolean = false, + allowMissingColumns: Boolean = false): Relation = + Relation + .newBuilder() + .setSetOp( + createSetOperation( + logicalPlan, + otherPlan, + SetOpType.SET_OP_TYPE_UNION, + isAll, + byName, + allowMissingColumns)) + .build() + + def coalesce(num: Integer): Relation = + Relation + .newBuilder() + .setRepartition( + Repartition + .newBuilder() + .setInput(logicalPlan) + .setNumPartitions(num) + .setShuffle(false)) + .build() + + def repartition(num: Int): Relation = + Relation + .newBuilder() + .setRepartition( + Repartition.newBuilder().setInput(logicalPlan).setNumPartitions(num).setShuffle(true)) + .build() + + @scala.annotation.varargs + def repartition(partitionExprs: Expression*): Relation = { + repartition(None, partitionExprs) + } + + @scala.annotation.varargs + def repartition(num: Int, partitionExprs: Expression*): Relation = { + repartition(Some(num), partitionExprs) + } + + private def repartition(numOpt: Option[Int], partitionExprs: Seq[Expression]): Relation = { + val expressions = RepartitionByExpression + .newBuilder() + .setInput(logicalPlan) + numOpt.foreach(expressions.setNumPartitions) + for (expr <- partitionExprs) { + expressions.addPartitionExprs(expr) + } + Relation + .newBuilder() + .setRepartitionByExpression(expressions) + .build() + } + + @scala.annotation.varargs + def repartitionByRange(partitionExprs: Expression*): Relation = { + repartitionByRange(None, partitionExprs) + } + + @scala.annotation.varargs + def repartitionByRange(num: Int, partitionExprs: Expression*): Relation = { + repartitionByRange(Some(num), partitionExprs) + } + + private def repartitionByRange( + numOpt: Option[Int], + partitionExprs: Seq[Expression]): Relation = { + val expressions = RepartitionByExpression + .newBuilder() + .setInput(logicalPlan) + numOpt.foreach(expressions.setNumPartitions) + partitionExprs + .map(expr => + expr.getExprTypeCase match { + case Expression.ExprTypeCase.SORT_ORDER => expr + case _ => + Expression + .newBuilder() + .setSortOrder( + Expression.SortOrder + .newBuilder() + .setChild(expr) + .setDirectionValue( + proto.Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING_VALUE) + .setNullOrdering(proto.Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST)) + .build() + }) + .foreach(order => expressions.addPartitionExprs(order)) + Relation + .newBuilder() + .setRepartitionByExpression(expressions) + .build() + } + + def na: DslNAFunctions = new DslNAFunctions(logicalPlan) + + def stat: DslStatFunctions = new 
DslStatFunctions(logicalPlan) + + def summary(statistics: String*): Relation = { + Relation + .newBuilder() + .setSummary( + proto.StatSummary + .newBuilder() + .setInput(logicalPlan) + .addAllStatistics(statistics.toSeq.asJava) + .build()) + .build() + } + + def describe(cols: String*): Relation = { + Relation + .newBuilder() + .setDescribe( + proto.StatDescribe + .newBuilder() + .setInput(logicalPlan) + .addAllCols(cols.toSeq.asJava) + .build()) + .build() + } + + def to(schema: StructType): Relation = + Relation + .newBuilder() + .setToSchema( + ToSchema + .newBuilder() + .setInput(logicalPlan) + .setSchema(DataTypeProtoConverter.toConnectProtoType(schema)) + .build()) + .build() + + def toDF(columnNames: String*): Relation = + Relation + .newBuilder() + .setToDf( + ToDF + .newBuilder() + .setInput(logicalPlan) + .addAllColumnNames(columnNames.asJava)) + .build() + + def withColumnsRenamed(renameColumnsMap: Map[String, String]): Relation = { + Relation + .newBuilder() + .setWithColumnsRenamed( + WithColumnsRenamed + .newBuilder() + .setInput(logicalPlan) + .putAllRenameColumnsMap(renameColumnsMap.asJava)) + .build() + } + + def withColumns(colsMap: Map[String, Expression]): Relation = { + Relation + .newBuilder() + .setWithColumns( + WithColumns + .newBuilder() + .setInput(logicalPlan) + .addAllAliases(colsMap.map { case (k, v) => + Expression.Alias.newBuilder().addName(k).setExpr(v).build() + }.asJava)) + .build() + } + + def hint(name: String, parameters: Any*): Relation = { + val expressions = parameters.map { parameter => + proto.Expression.newBuilder().setLiteral(toLiteralProto(parameter)).build() + } + + Relation + .newBuilder() + .setHint( + Hint + .newBuilder() + .setInput(logicalPlan) + .setName(name) + .addAllParameters(expressions.asJava)) + .build() + } + + def unpivot( + ids: Seq[Expression], + values: Seq[Expression], + variableColumnName: String, + valueColumnName: String): Relation = { + Relation + .newBuilder() + .setUnpivot( + Unpivot + .newBuilder() + .setInput(logicalPlan) + .addAllIds(ids.asJava) + .setValues(Unpivot.Values + .newBuilder() + .addAllValues(values.asJava) + .build()) + .setVariableColumnName(variableColumnName) + .setValueColumnName(valueColumnName)) + .build() + } + + def unpivot( + ids: Seq[Expression], + variableColumnName: String, + valueColumnName: String): Relation = { + Relation + .newBuilder() + .setUnpivot( + Unpivot + .newBuilder() + .setInput(logicalPlan) + .addAllIds(ids.asJava) + .setVariableColumnName(variableColumnName) + .setValueColumnName(valueColumnName)) + .build() + } + + def melt( + ids: Seq[Expression], + values: Seq[Expression], + variableColumnName: String, + valueColumnName: String): Relation = + unpivot(ids, values, variableColumnName, valueColumnName) + + def melt( + ids: Seq[Expression], + variableColumnName: String, + valueColumnName: String): Relation = + unpivot(ids, variableColumnName, valueColumnName) + + def randomSplit(weights: Array[Double], seed: Long): Array[Relation] = { + require( + weights.forall(_ >= 0), + s"Weights must be nonnegative, but got ${weights.mkString("[", ",", "]")}") + require( + weights.sum > 0, + s"Sum of weights must be positive, but got ${weights.mkString("[", ",", "]")}") + + val sum = weights.toSeq.sum + val normalizedCumWeights = weights.map(_ / sum).scanLeft(0.0d)(_ + _) + normalizedCumWeights + .sliding(2) + .map { x => + Relation + .newBuilder() + .setSample( + Sample + .newBuilder() + .setInput(logicalPlan) + .setLowerBound(x(0)) + .setUpperBound(x(1)) + .setWithReplacement(false) + 
.setSeed(seed) + .setDeterministicOrder(true) + .build()) + .build() + } + .toArray + } + + def randomSplit(weights: Array[Double]): Array[Relation] = + randomSplit(weights, Utils.random.nextLong) + + def observe(name: String, expr: Expression, exprs: Expression*): Relation = { + Relation + .newBuilder() + .setCollectMetrics( + CollectMetrics + .newBuilder() + .setInput(logicalPlan) + .setName(name) + .addAllMetrics((expr +: exprs).asJava)) + .build() + } + + def observe(observation: Observation, expr: Expression, exprs: Expression*): Relation = { + Relation + .newBuilder() + .setCollectMetrics( + CollectMetrics + .newBuilder() + .setInput(logicalPlan) + .setName(observation.name) + .addAllMetrics((expr +: exprs).asJava)) + .build() + } + + private def createSetOperation( + left: Relation, + right: Relation, + t: SetOpType, + isAll: Boolean = true, + byName: Boolean = false, + allowMissingColumns: Boolean = false): SetOperation.Builder = { + val setOp = SetOperation + .newBuilder() + .setLeftInput(left) + .setRightInput(right) + .setSetOpType(t) + .setIsAll(isAll) + .setByName(byName) + .setAllowMissingColumns(allowMissingColumns) + setOp + } + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/LiteralExpressionProtoConverter.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/LiteralExpressionProtoConverter.scala new file mode 100644 index 0000000000000..9f2baea573764 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/LiteralExpressionProtoConverter.scala @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.planner + +import scala.collection.mutable +import scala.reflect.ClassTag + +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.expressions +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, InvalidPlanInput} +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.{CalendarInterval, UTF8String} + +object LiteralExpressionProtoConverter { + + /** + * Transforms the protocol buffers literals into the appropriate Catalyst literal expression. 
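+ * For example, a `STRING` literal is converted to
+ * `expressions.Literal(UTF8String.fromString(value), StringType)`.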
+ * + * @return + * Expression + */ + def toCatalystExpression(lit: proto.Expression.Literal): expressions.Literal = { + lit.getLiteralTypeCase match { + case proto.Expression.Literal.LiteralTypeCase.NULL => + expressions.Literal(null, DataTypeProtoConverter.toCatalystType(lit.getNull)) + + case proto.Expression.Literal.LiteralTypeCase.BINARY => + expressions.Literal(lit.getBinary.toByteArray, BinaryType) + + case proto.Expression.Literal.LiteralTypeCase.BOOLEAN => + expressions.Literal(lit.getBoolean, BooleanType) + + case proto.Expression.Literal.LiteralTypeCase.BYTE => + expressions.Literal(lit.getByte.toByte, ByteType) + + case proto.Expression.Literal.LiteralTypeCase.SHORT => + expressions.Literal(lit.getShort.toShort, ShortType) + + case proto.Expression.Literal.LiteralTypeCase.INTEGER => + expressions.Literal(lit.getInteger, IntegerType) + + case proto.Expression.Literal.LiteralTypeCase.LONG => + expressions.Literal(lit.getLong, LongType) + + case proto.Expression.Literal.LiteralTypeCase.FLOAT => + expressions.Literal(lit.getFloat, FloatType) + + case proto.Expression.Literal.LiteralTypeCase.DOUBLE => + expressions.Literal(lit.getDouble, DoubleType) + + case proto.Expression.Literal.LiteralTypeCase.DECIMAL => + val decimal = Decimal.apply(lit.getDecimal.getValue) + var precision = decimal.precision + if (lit.getDecimal.hasPrecision) { + precision = math.max(precision, lit.getDecimal.getPrecision) + } + var scale = decimal.scale + if (lit.getDecimal.hasScale) { + scale = math.max(scale, lit.getDecimal.getScale) + } + expressions.Literal(decimal, DecimalType(math.max(precision, scale), scale)) + + case proto.Expression.Literal.LiteralTypeCase.STRING => + expressions.Literal(UTF8String.fromString(lit.getString), StringType) + + case proto.Expression.Literal.LiteralTypeCase.DATE => + expressions.Literal(lit.getDate, DateType) + + case proto.Expression.Literal.LiteralTypeCase.TIMESTAMP => + expressions.Literal(lit.getTimestamp, TimestampType) + + case proto.Expression.Literal.LiteralTypeCase.TIMESTAMP_NTZ => + expressions.Literal(lit.getTimestampNtz, TimestampNTZType) + + case proto.Expression.Literal.LiteralTypeCase.CALENDAR_INTERVAL => + val interval = new CalendarInterval( + lit.getCalendarInterval.getMonths, + lit.getCalendarInterval.getDays, + lit.getCalendarInterval.getMicroseconds) + expressions.Literal(interval, CalendarIntervalType) + + case proto.Expression.Literal.LiteralTypeCase.YEAR_MONTH_INTERVAL => + expressions.Literal(lit.getYearMonthInterval, YearMonthIntervalType()) + + case proto.Expression.Literal.LiteralTypeCase.DAY_TIME_INTERVAL => + expressions.Literal(lit.getDayTimeInterval, DayTimeIntervalType()) + + case proto.Expression.Literal.LiteralTypeCase.ARRAY => + expressions.Literal.create( + toArrayData(lit.getArray), + ArrayType(DataTypeProtoConverter.toCatalystType(lit.getArray.getElementType))) + + case _ => + throw InvalidPlanInput( + s"Unsupported Literal Type: ${lit.getLiteralTypeCase.getNumber}" + + s"(${lit.getLiteralTypeCase.name})") + } + } + + def toCatalystValue(lit: proto.Expression.Literal): Any = { + lit.getLiteralTypeCase match { + case proto.Expression.Literal.LiteralTypeCase.STRING => lit.getString + + case _ => toCatalystExpression(lit).value + } + } + + private def toArrayData(array: proto.Expression.Literal.Array): Any = { + def makeArrayData[T](converter: proto.Expression.Literal => T)(implicit + tag: ClassTag[T]): Array[T] = { + val builder = mutable.ArrayBuilder.make[T] + val elementList = array.getElementsList + 
builder.sizeHint(elementList.size()) + val iter = elementList.iterator() + while (iter.hasNext) { + builder += converter(iter.next()) + } + builder.result() + } + + val elementType = array.getElementType + if (elementType.hasShort) { + makeArrayData(v => v.getShort.toShort) + } else if (elementType.hasInteger) { + makeArrayData(v => v.getInteger) + } else if (elementType.hasLong) { + makeArrayData(v => v.getLong) + } else if (elementType.hasDouble) { + makeArrayData(v => v.getDouble) + } else if (elementType.hasByte) { + makeArrayData(v => v.getByte.toByte) + } else if (elementType.hasFloat) { + makeArrayData(v => v.getFloat) + } else if (elementType.hasBoolean) { + makeArrayData(v => v.getBoolean) + } else if (elementType.hasString) { + makeArrayData(v => v.getString) + } else if (elementType.hasBinary) { + makeArrayData(v => v.getBinary.toByteArray) + } else if (elementType.hasDate) { + makeArrayData(v => DateTimeUtils.toJavaDate(v.getDate)) + } else if (elementType.hasTimestamp) { + makeArrayData(v => DateTimeUtils.toJavaTimestamp(v.getTimestamp)) + } else if (elementType.hasTimestampNtz) { + makeArrayData(v => DateTimeUtils.microsToLocalDateTime(v.getTimestampNtz)) + } else if (elementType.hasDayTimeInterval) { + makeArrayData(v => IntervalUtils.microsToDuration(v.getDayTimeInterval)) + } else if (elementType.hasYearMonthInterval) { + makeArrayData(v => IntervalUtils.monthsToPeriod(v.getYearMonthInterval)) + } else if (elementType.hasDecimal) { + makeArrayData(v => Decimal(v.getDecimal.getValue)) + } else if (elementType.hasCalendarInterval) { + makeArrayData(v => { + val interval = v.getCalendarInterval + new CalendarInterval(interval.getMonths, interval.getDays, interval.getMicroseconds) + }) + } else if (elementType.hasArray) { + makeArrayData(v => toArrayData(v.getArray)) + } else { + throw InvalidPlanInput(s"Unsupported Literal Type: $elementType)") + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SaveModeConverter.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SaveModeConverter.scala new file mode 100644 index 0000000000000..b2052f580b054 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SaveModeConverter.scala @@ -0,0 +1,49 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.planner + +import org.apache.spark.connect.proto +import org.apache.spark.sql.SaveMode + +/** + * Helper class for conversions between [[SaveMode]] and [[proto.WriteOperation.SaveMode]]. 
+ */ +object SaveModeConverter { + def toSaveMode(mode: proto.WriteOperation.SaveMode): SaveMode = { + mode match { + case proto.WriteOperation.SaveMode.SAVE_MODE_APPEND => SaveMode.Append + case proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE => SaveMode.Ignore + case proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE => SaveMode.Overwrite + case proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS => SaveMode.ErrorIfExists + case _ => + throw new IllegalArgumentException( + s"Cannot convert from WriteOperation.SaveMode to Spark SaveMode: ${mode.getNumber}") + } + } + + def toSaveModeProto(mode: SaveMode): proto.WriteOperation.SaveMode = { + mode match { + case SaveMode.Append => proto.WriteOperation.SaveMode.SAVE_MODE_APPEND + case SaveMode.Ignore => proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE + case SaveMode.Overwrite => proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE + case SaveMode.ErrorIfExists => proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS + case _ => + throw new IllegalArgumentException( + s"Cannot convert from SaveMode to WriteOperation.SaveMode: ${mode.name()}") + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala new file mode 100644 index 0000000000000..0f3189e601342 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/SparkConnectPlanner.scala @@ -0,0 +1,2145 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.planner + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import com.google.common.collect.{Lists, Maps} +import com.google.protobuf.{Any => ProtoAny, ByteString} +import io.grpc.stub.StreamObserver + +import org.apache.spark.{Partition, SparkEnv, TaskContext} +import org.apache.spark.api.python.{PythonEvalType, SimplePythonFunction} +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.{ExecutePlanResponse, SqlCommand} +import org.apache.spark.connect.proto.ExecutePlanResponse.SqlCommandResult +import org.apache.spark.connect.proto.Parse.ParseFormat +import org.apache.spark.sql.{Column, Dataset, Encoders, SparkSession} +import org.apache.spark.sql.catalyst.{expressions, AliasIdentifier, FunctionIdentifier} +import org.apache.spark.sql.catalyst.analysis.{GlobalTempView, LocalTempView, MultiAlias, ParameterizedQuery, UnresolvedAlias, UnresolvedAttribute, UnresolvedExtractValue, UnresolvedFunction, UnresolvedRegex, UnresolvedRelation, UnresolvedStar} +import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder +import org.apache.spark.sql.catalyst.expressions._ +import org.apache.spark.sql.catalyst.parser.{CatalystSqlParser, ParseException, ParserUtils} +import org.apache.spark.sql.catalyst.plans.{Cross, FullOuter, Inner, JoinType, LeftAnti, LeftOuter, LeftSemi, RightOuter, UsingJoin} +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.catalyst.plans.logical.{CollectMetrics, CommandResult, Deduplicate, Except, Intersect, LocalRelation, LogicalPlan, Project, Sample, Sort, SubqueryAlias, Union, Unpivot, UnresolvedHint} +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, CharVarcharUtils} +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, InvalidPlanInput, UdfPacket} +import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_SIZE +import org.apache.spark.sql.connect.planner.LiteralExpressionProtoConverter.{toCatalystExpression, toCatalystValue} +import org.apache.spark.sql.connect.plugin.SparkConnectPluginRegistry +import org.apache.spark.sql.connect.service.SparkConnectStreamHandler +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.execution.QueryExecution +import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.execution.command.CreateViewCommand +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.jdbc.{JDBCOptions, JDBCPartition, JDBCRelation} +import org.apache.spark.sql.execution.python.UserDefinedPythonFunction +import org.apache.spark.sql.internal.CatalogImpl +import org.apache.spark.sql.types._ +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.Utils + +final case class InvalidCommandInput( + private val message: String = "", + private val cause: Throwable = null) + extends Exception(message, cause) + +class SparkConnectPlanner(val session: SparkSession) { + private lazy val pythonExec = + sys.env.getOrElse("PYSPARK_PYTHON", sys.env.getOrElse("PYSPARK_DRIVER_PYTHON", "python3")) + + // The root of the query plan is a relation and we apply the transformations to it. 
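+  // For example, RANGE is dispatched to transformRange and becomes a logical.Range node,
+  // while LIMIT becomes a logical.Limit wrapping its transformed child. An unset rel_type
+  // raises IndexOutOfBoundsException; other unknown rel types raise InvalidPlanInput. When
+  // the relation carries a plan_id in its common metadata, it is attached to the resulting
+  // plan as LogicalPlan.PLAN_ID_TAG.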
+ def transformRelation(rel: proto.Relation): LogicalPlan = { + val plan = rel.getRelTypeCase match { + // DataFrame API + case proto.Relation.RelTypeCase.SHOW_STRING => transformShowString(rel.getShowString) + case proto.Relation.RelTypeCase.READ => transformReadRel(rel.getRead) + case proto.Relation.RelTypeCase.PROJECT => transformProject(rel.getProject) + case proto.Relation.RelTypeCase.FILTER => transformFilter(rel.getFilter) + case proto.Relation.RelTypeCase.LIMIT => transformLimit(rel.getLimit) + case proto.Relation.RelTypeCase.OFFSET => transformOffset(rel.getOffset) + case proto.Relation.RelTypeCase.TAIL => transformTail(rel.getTail) + case proto.Relation.RelTypeCase.JOIN => transformJoin(rel.getJoin) + case proto.Relation.RelTypeCase.DEDUPLICATE => transformDeduplicate(rel.getDeduplicate) + case proto.Relation.RelTypeCase.SET_OP => transformSetOperation(rel.getSetOp) + case proto.Relation.RelTypeCase.SORT => transformSort(rel.getSort) + case proto.Relation.RelTypeCase.DROP => transformDrop(rel.getDrop) + case proto.Relation.RelTypeCase.AGGREGATE => transformAggregate(rel.getAggregate) + case proto.Relation.RelTypeCase.SQL => transformSql(rel.getSql) + case proto.Relation.RelTypeCase.LOCAL_RELATION => + transformLocalRelation(rel.getLocalRelation) + case proto.Relation.RelTypeCase.SAMPLE => transformSample(rel.getSample) + case proto.Relation.RelTypeCase.RANGE => transformRange(rel.getRange) + case proto.Relation.RelTypeCase.SUBQUERY_ALIAS => + transformSubqueryAlias(rel.getSubqueryAlias) + case proto.Relation.RelTypeCase.REPARTITION => transformRepartition(rel.getRepartition) + case proto.Relation.RelTypeCase.FILL_NA => transformNAFill(rel.getFillNa) + case proto.Relation.RelTypeCase.DROP_NA => transformNADrop(rel.getDropNa) + case proto.Relation.RelTypeCase.REPLACE => transformReplace(rel.getReplace) + case proto.Relation.RelTypeCase.SUMMARY => transformStatSummary(rel.getSummary) + case proto.Relation.RelTypeCase.DESCRIBE => transformStatDescribe(rel.getDescribe) + case proto.Relation.RelTypeCase.COV => transformStatCov(rel.getCov) + case proto.Relation.RelTypeCase.CORR => transformStatCorr(rel.getCorr) + case proto.Relation.RelTypeCase.APPROX_QUANTILE => + transformStatApproxQuantile(rel.getApproxQuantile) + case proto.Relation.RelTypeCase.CROSSTAB => + transformStatCrosstab(rel.getCrosstab) + case proto.Relation.RelTypeCase.FREQ_ITEMS => transformStatFreqItems(rel.getFreqItems) + case proto.Relation.RelTypeCase.SAMPLE_BY => + transformStatSampleBy(rel.getSampleBy) + case proto.Relation.RelTypeCase.TO_SCHEMA => transformToSchema(rel.getToSchema) + case proto.Relation.RelTypeCase.TO_DF => + transformToDF(rel.getToDf) + case proto.Relation.RelTypeCase.WITH_COLUMNS_RENAMED => + transformWithColumnsRenamed(rel.getWithColumnsRenamed) + case proto.Relation.RelTypeCase.WITH_COLUMNS => transformWithColumns(rel.getWithColumns) + case proto.Relation.RelTypeCase.HINT => transformHint(rel.getHint) + case proto.Relation.RelTypeCase.UNPIVOT => transformUnpivot(rel.getUnpivot) + case proto.Relation.RelTypeCase.REPARTITION_BY_EXPRESSION => + transformRepartitionByExpression(rel.getRepartitionByExpression) + case proto.Relation.RelTypeCase.MAP_PARTITIONS => + transformMapPartitions(rel.getMapPartitions) + case proto.Relation.RelTypeCase.GROUP_MAP => + transformGroupMap(rel.getGroupMap) + case proto.Relation.RelTypeCase.CO_GROUP_MAP => + transformCoGroupMap(rel.getCoGroupMap) + case proto.Relation.RelTypeCase.COLLECT_METRICS => + transformCollectMetrics(rel.getCollectMetrics) + case 
proto.Relation.RelTypeCase.PARSE => transformParse(rel.getParse) + case proto.Relation.RelTypeCase.RELTYPE_NOT_SET => + throw new IndexOutOfBoundsException("Expected Relation to be set, but is empty.") + + // Catalog API (internal-only) + case proto.Relation.RelTypeCase.CATALOG => transformCatalog(rel.getCatalog) + + // Handle plugins for Spark Connect Relation types. + case proto.Relation.RelTypeCase.EXTENSION => + transformRelationPlugin(rel.getExtension) + case _ => throw InvalidPlanInput(s"${rel.getUnknown} not supported.") + } + + if (rel.hasCommon && rel.getCommon.hasPlanId) { + plan.setTagValue(LogicalPlan.PLAN_ID_TAG, rel.getCommon.getPlanId) + } + plan + } + + private def transformRelationPlugin(extension: ProtoAny): LogicalPlan = { + SparkConnectPluginRegistry.relationRegistry + // Lazily traverse the collection. + .view + // Apply the transformation. + .map(p => p.transform(extension, this)) + // Find the first non-empty transformation or throw. + .find(_.nonEmpty) + .flatten + .getOrElse(throw InvalidPlanInput("No handler found for extension")) + } + + private def transformCatalog(catalog: proto.Catalog): LogicalPlan = { + catalog.getCatTypeCase match { + case proto.Catalog.CatTypeCase.CURRENT_DATABASE => + transformCurrentDatabase(catalog.getCurrentDatabase) + case proto.Catalog.CatTypeCase.SET_CURRENT_DATABASE => + transformSetCurrentDatabase(catalog.getSetCurrentDatabase) + case proto.Catalog.CatTypeCase.LIST_DATABASES => + transformListDatabases(catalog.getListDatabases) + case proto.Catalog.CatTypeCase.LIST_TABLES => transformListTables(catalog.getListTables) + case proto.Catalog.CatTypeCase.LIST_FUNCTIONS => + transformListFunctions(catalog.getListFunctions) + case proto.Catalog.CatTypeCase.LIST_COLUMNS => transformListColumns(catalog.getListColumns) + case proto.Catalog.CatTypeCase.GET_DATABASE => transformGetDatabase(catalog.getGetDatabase) + case proto.Catalog.CatTypeCase.GET_TABLE => transformGetTable(catalog.getGetTable) + case proto.Catalog.CatTypeCase.GET_FUNCTION => transformGetFunction(catalog.getGetFunction) + case proto.Catalog.CatTypeCase.DATABASE_EXISTS => + transformDatabaseExists(catalog.getDatabaseExists) + case proto.Catalog.CatTypeCase.TABLE_EXISTS => transformTableExists(catalog.getTableExists) + case proto.Catalog.CatTypeCase.FUNCTION_EXISTS => + transformFunctionExists(catalog.getFunctionExists) + case proto.Catalog.CatTypeCase.CREATE_EXTERNAL_TABLE => + transformCreateExternalTable(catalog.getCreateExternalTable) + case proto.Catalog.CatTypeCase.CREATE_TABLE => transformCreateTable(catalog.getCreateTable) + case proto.Catalog.CatTypeCase.DROP_TEMP_VIEW => + transformDropTempView(catalog.getDropTempView) + case proto.Catalog.CatTypeCase.DROP_GLOBAL_TEMP_VIEW => + transformDropGlobalTempView(catalog.getDropGlobalTempView) + case proto.Catalog.CatTypeCase.RECOVER_PARTITIONS => + transformRecoverPartitions(catalog.getRecoverPartitions) + case proto.Catalog.CatTypeCase.IS_CACHED => transformIsCached(catalog.getIsCached) + case proto.Catalog.CatTypeCase.CACHE_TABLE => transformCacheTable(catalog.getCacheTable) + case proto.Catalog.CatTypeCase.UNCACHE_TABLE => + transformUncacheTable(catalog.getUncacheTable) + case proto.Catalog.CatTypeCase.CLEAR_CACHE => transformClearCache(catalog.getClearCache) + case proto.Catalog.CatTypeCase.REFRESH_TABLE => + transformRefreshTable(catalog.getRefreshTable) + case proto.Catalog.CatTypeCase.REFRESH_BY_PATH => + transformRefreshByPath(catalog.getRefreshByPath) + case proto.Catalog.CatTypeCase.CURRENT_CATALOG => + 
transformCurrentCatalog(catalog.getCurrentCatalog) + case proto.Catalog.CatTypeCase.SET_CURRENT_CATALOG => + transformSetCurrentCatalog(catalog.getSetCurrentCatalog) + case proto.Catalog.CatTypeCase.LIST_CATALOGS => + transformListCatalogs(catalog.getListCatalogs) + case other => throw InvalidPlanInput(s"$other not supported.") + } + } + + private def transformShowString(rel: proto.ShowString): LogicalPlan = { + val showString = Dataset + .ofRows(session, transformRelation(rel.getInput)) + .showString(rel.getNumRows, rel.getTruncate, rel.getVertical) + LocalRelation.fromProduct( + output = AttributeReference("show_string", StringType, false)() :: Nil, + data = Tuple1.apply(showString) :: Nil) + } + + private def transformSql(sql: proto.SQL): LogicalPlan = { + val args = sql.getArgsMap + val parser = session.sessionState.sqlParser + val parsedPlan = parser.parsePlan(sql.getQuery) + if (!args.isEmpty) { + ParameterizedQuery(parsedPlan, args.asScala.mapValues(transformLiteral).toMap) + } else { + parsedPlan + } + } + + private def transformSubqueryAlias(alias: proto.SubqueryAlias): LogicalPlan = { + val aliasIdentifier = + if (alias.getQualifierCount > 0) { + AliasIdentifier.apply(alias.getAlias, alias.getQualifierList.asScala.toSeq) + } else { + AliasIdentifier.apply(alias.getAlias) + } + SubqueryAlias(aliasIdentifier, transformRelation(alias.getInput)) + } + + /** + * All fields of [[proto.Sample]] are optional. However, given those are proto primitive types, + * we cannot differentiate if the field is not or set when the field's value equals to the type + * default value. In the future if this ever become a problem, one solution could be that to + * wrap such fields into proto messages. + */ + private def transformSample(rel: proto.Sample): LogicalPlan = { + val plan = if (rel.getDeterministicOrder) { + val input = Dataset.ofRows(session, transformRelation(rel.getInput)) + + // It is possible that the underlying dataframe doesn't guarantee the ordering of rows in its + // constituent partitions each time a split is materialized which could result in + // overlapping splits. To prevent this, we explicitly sort each input partition to make the + // ordering deterministic. Note that MapTypes cannot be sorted and are explicitly pruned out + // from the sort order. 
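+      // If no column is orderable at all, the input is cached instead, so that repeated
+      // materializations of the split read the same rows.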
+ val sortOrder = input.logicalPlan.output + .filter(attr => RowOrdering.isOrderable(attr.dataType)) + .map(SortOrder(_, Ascending)) + if (sortOrder.nonEmpty) { + Sort(sortOrder, global = false, input.logicalPlan) + } else { + input.cache() + input.logicalPlan + } + } else { + transformRelation(rel.getInput) + } + + Sample( + rel.getLowerBound, + rel.getUpperBound, + rel.getWithReplacement, + if (rel.hasSeed) rel.getSeed else Utils.random.nextLong, + plan) + } + + private def transformRepartition(rel: proto.Repartition): LogicalPlan = { + logical.Repartition(rel.getNumPartitions, rel.getShuffle, transformRelation(rel.getInput)) + } + + private def transformRange(rel: proto.Range): LogicalPlan = { + val start = rel.getStart + val end = rel.getEnd + val step = rel.getStep + val numPartitions = if (rel.hasNumPartitions) { + rel.getNumPartitions + } else { + session.leafNodeDefaultParallelism + } + logical.Range(start, end, step, numPartitions) + } + + private def transformNAFill(rel: proto.NAFill): LogicalPlan = { + if (rel.getValuesCount == 0) { + throw InvalidPlanInput(s"values must contains at least 1 item!") + } + if (rel.getValuesCount > 1 && rel.getValuesCount != rel.getColsCount) { + throw InvalidPlanInput( + s"When values contains more than 1 items, " + + s"values and cols should have the same length!") + } + + val dataset = Dataset.ofRows(session, transformRelation(rel.getInput)) + + val cols = rel.getColsList.asScala.toArray + val values = rel.getValuesList.asScala.toArray + if (values.length == 1) { + val value = values.head + value.getLiteralTypeCase match { + case proto.Expression.Literal.LiteralTypeCase.BOOLEAN => + if (cols.nonEmpty) { + dataset.na.fill(value = value.getBoolean, cols = cols).logicalPlan + } else { + dataset.na.fill(value = value.getBoolean).logicalPlan + } + case proto.Expression.Literal.LiteralTypeCase.LONG => + if (cols.nonEmpty) { + dataset.na.fill(value = value.getLong, cols = cols).logicalPlan + } else { + dataset.na.fill(value = value.getLong).logicalPlan + } + case proto.Expression.Literal.LiteralTypeCase.DOUBLE => + if (cols.nonEmpty) { + dataset.na.fill(value = value.getDouble, cols = cols).logicalPlan + } else { + dataset.na.fill(value = value.getDouble).logicalPlan + } + case proto.Expression.Literal.LiteralTypeCase.STRING => + if (cols.nonEmpty) { + dataset.na.fill(value = value.getString, cols = cols).logicalPlan + } else { + dataset.na.fill(value = value.getString).logicalPlan + } + case other => throw InvalidPlanInput(s"Unsupported value type: $other") + } + } else { + val valueMap = mutable.Map.empty[String, Any] + cols.zip(values).foreach { case (col, value) => + valueMap.update(col, toCatalystValue(value)) + } + dataset.na.fill(valueMap = valueMap.toMap).logicalPlan + } + } + + private def transformNADrop(rel: proto.NADrop): LogicalPlan = { + val dataset = Dataset.ofRows(session, transformRelation(rel.getInput)) + + val cols = rel.getColsList.asScala.toArray + + (cols.nonEmpty, rel.hasMinNonNulls) match { + case (true, true) => + dataset.na.drop(minNonNulls = rel.getMinNonNulls, cols = cols).logicalPlan + case (true, false) => + dataset.na.drop(cols = cols).logicalPlan + case (false, true) => + dataset.na.drop(minNonNulls = rel.getMinNonNulls).logicalPlan + case (false, false) => + dataset.na.drop().logicalPlan + } + } + + private def transformReplace(rel: proto.NAReplace): LogicalPlan = { + val replacement = mutable.Map.empty[Any, Any] + rel.getReplacementsList.asScala.foreach { replace => + replacement.update( + 
toCatalystValue(replace.getOldValue), + toCatalystValue(replace.getNewValue)) + } + + if (rel.getColsCount == 0) { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .na + .replace("*", replacement.toMap) + .logicalPlan + } else { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .na + .replace(rel.getColsList.asScala.toSeq, replacement.toMap) + .logicalPlan + } + } + + private def transformStatSummary(rel: proto.StatSummary): LogicalPlan = { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .summary(rel.getStatisticsList.asScala.toSeq: _*) + .logicalPlan + } + + private def transformStatDescribe(rel: proto.StatDescribe): LogicalPlan = { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .describe(rel.getColsList.asScala.toSeq: _*) + .logicalPlan + } + + private def transformStatCov(rel: proto.StatCov): LogicalPlan = { + val cov = Dataset + .ofRows(session, transformRelation(rel.getInput)) + .stat + .cov(rel.getCol1, rel.getCol2) + LocalRelation.fromProduct( + output = AttributeReference("cov", DoubleType, false)() :: Nil, + data = Tuple1.apply(cov) :: Nil) + } + + private def transformStatCorr(rel: proto.StatCorr): LogicalPlan = { + val df = Dataset.ofRows(session, transformRelation(rel.getInput)) + val corr = if (rel.hasMethod) { + df.stat.corr(rel.getCol1, rel.getCol2, rel.getMethod) + } else { + df.stat.corr(rel.getCol1, rel.getCol2) + } + + LocalRelation.fromProduct( + output = AttributeReference("corr", DoubleType, false)() :: Nil, + data = Tuple1.apply(corr) :: Nil) + } + + private def transformStatApproxQuantile(rel: proto.StatApproxQuantile): LogicalPlan = { + val cols = rel.getColsList.asScala.toArray + val probabilities = rel.getProbabilitiesList.asScala.map(_.doubleValue()).toArray + val approxQuantile = Dataset + .ofRows(session, transformRelation(rel.getInput)) + .stat + .approxQuantile(cols, probabilities, rel.getRelativeError) + LocalRelation.fromProduct( + output = + AttributeReference("approx_quantile", ArrayType(ArrayType(DoubleType)), false)() :: Nil, + data = Tuple1.apply(approxQuantile) :: Nil) + } + + private def transformStatCrosstab(rel: proto.StatCrosstab): LogicalPlan = { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .stat + .crosstab(rel.getCol1, rel.getCol2) + .logicalPlan + } + + private def transformStatFreqItems(rel: proto.StatFreqItems): LogicalPlan = { + val cols = rel.getColsList.asScala.toSeq + val df = Dataset.ofRows(session, transformRelation(rel.getInput)) + if (rel.hasSupport) { + df.stat.freqItems(cols, rel.getSupport).logicalPlan + } else { + df.stat.freqItems(cols).logicalPlan + } + } + + private def transformStatSampleBy(rel: proto.StatSampleBy): LogicalPlan = { + val fractions = rel.getFractionsList.asScala.toSeq.map { protoFraction => + val stratum = transformLiteral(protoFraction.getStratum) match { + case Literal(s, StringType) if s != null => s.toString + case literal => literal.value + } + (stratum, protoFraction.getFraction) + } + + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .stat + .sampleBy( + col = Column(transformExpression(rel.getCol)), + fractions = fractions.toMap, + seed = if (rel.hasSeed) rel.getSeed else Utils.random.nextLong) + .logicalPlan + } + + private def transformToSchema(rel: proto.ToSchema): LogicalPlan = { + val schema = transformDataType(rel.getSchema) + assert(schema.isInstanceOf[StructType]) + + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .to(schema.asInstanceOf[StructType]) + .logicalPlan + } + + private def 
transformToDF(rel: proto.ToDF): LogicalPlan = { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .toDF(rel.getColumnNamesList.asScala.toSeq: _*) + .logicalPlan + } + + private def transformMapPartitions(rel: proto.MapPartitions): LogicalPlan = { + val commonUdf = rel.getFunc + val pythonUdf = transformPythonUDF(commonUdf) + pythonUdf.evalType match { + case PythonEvalType.SQL_MAP_PANDAS_ITER_UDF => + logical.MapInPandas( + pythonUdf, + pythonUdf.dataType.asInstanceOf[StructType].toAttributes, + transformRelation(rel.getInput)) + case PythonEvalType.SQL_MAP_ARROW_ITER_UDF => + logical.PythonMapInArrow( + pythonUdf, + pythonUdf.dataType.asInstanceOf[StructType].toAttributes, + transformRelation(rel.getInput)) + case _ => + throw InvalidPlanInput(s"Function with EvalType: ${pythonUdf.evalType} is not supported") + } + } + + private def transformGroupMap(rel: proto.GroupMap): LogicalPlan = { + val pythonUdf = transformPythonUDF(rel.getFunc) + val cols = + rel.getGroupingExpressionsList.asScala.toSeq.map(expr => Column(transformExpression(expr))) + + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .groupBy(cols: _*) + .flatMapGroupsInPandas(pythonUdf) + .logicalPlan + } + + private def transformCoGroupMap(rel: proto.CoGroupMap): LogicalPlan = { + val pythonUdf = transformPythonUDF(rel.getFunc) + + val inputCols = + rel.getInputGroupingExpressionsList.asScala.toSeq.map(expr => + Column(transformExpression(expr))) + val otherCols = + rel.getOtherGroupingExpressionsList.asScala.toSeq.map(expr => + Column(transformExpression(expr))) + + val input = Dataset + .ofRows(session, transformRelation(rel.getInput)) + .groupBy(inputCols: _*) + val other = Dataset + .ofRows(session, transformRelation(rel.getOther)) + .groupBy(otherCols: _*) + + input.flatMapCoGroupsInPandas(other, pythonUdf).logicalPlan + } + + private def transformWithColumnsRenamed(rel: proto.WithColumnsRenamed): LogicalPlan = { + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .withColumnsRenamed(rel.getRenameColumnsMapMap) + .logicalPlan + } + + private def transformWithColumns(rel: proto.WithColumns): LogicalPlan = { + val (colNames, cols, metadata) = + rel.getAliasesList.asScala.toSeq.map { alias => + if (alias.getNameCount != 1) { + throw InvalidPlanInput(s"""WithColumns require column name only contains one name part, + |but got ${alias.getNameList.toString}""".stripMargin) + } + + val metadata = if (alias.hasMetadata && alias.getMetadata.nonEmpty) { + Metadata.fromJson(alias.getMetadata) + } else { + Metadata.empty + } + + (alias.getName(0), Column(transformExpression(alias.getExpr)), metadata) + }.unzip3 + + Dataset + .ofRows(session, transformRelation(rel.getInput)) + .withColumns(colNames, cols, metadata) + .logicalPlan + } + + private def transformHint(rel: proto.Hint): LogicalPlan = { + + def extractValue(expr: Expression): Any = { + expr match { + case Literal(s, StringType) if s != null => + UnresolvedAttribute.quotedString(s.toString) + case literal: Literal => literal.value + case UnresolvedFunction(Seq("array"), arguments, _, _, _) => + arguments.map(extractValue).toArray + case other => + throw InvalidPlanInput( + s"Expression should be a Literal or CreateMap or CreateArray, " + + s"but got ${other.getClass} $other") + } + } + + val params = rel.getParametersList.asScala.toSeq.map(transformExpression).map(extractValue) + UnresolvedHint(rel.getName, params, transformRelation(rel.getInput)) + } + + private def transformUnpivot(rel: proto.Unpivot): LogicalPlan = { + val ids = 
rel.getIdsList.asScala.toArray.map { expr => + Column(transformExpression(expr)) + } + + if (!rel.hasValues) { + Unpivot( + Some(ids.map(_.named)), + None, + None, + rel.getVariableColumnName, + Seq(rel.getValueColumnName), + transformRelation(rel.getInput)) + } else { + val values = rel.getValues.getValuesList.asScala.toArray.map { expr => + Column(transformExpression(expr)) + } + + Unpivot( + Some(ids.map(_.named)), + Some(values.map(v => Seq(v.named))), + None, + rel.getVariableColumnName, + Seq(rel.getValueColumnName), + transformRelation(rel.getInput)) + } + } + + private def transformRepartitionByExpression( + rel: proto.RepartitionByExpression): LogicalPlan = { + val numPartitionsOpt = if (rel.hasNumPartitions) { + Some(rel.getNumPartitions) + } else { + None + } + val partitionExpressions = rel.getPartitionExprsList.asScala.map(transformExpression).toSeq + logical.RepartitionByExpression( + partitionExpressions, + transformRelation(rel.getInput), + numPartitionsOpt) + } + + private def transformCollectMetrics(rel: proto.CollectMetrics): LogicalPlan = { + val metrics = rel.getMetricsList.asScala.toSeq.map { expr => + Column(transformExpression(expr)) + } + + CollectMetrics(rel.getName, metrics.map(_.named), transformRelation(rel.getInput)) + } + + private def transformDeduplicate(rel: proto.Deduplicate): LogicalPlan = { + if (!rel.hasInput) { + throw InvalidPlanInput("Deduplicate needs a plan input") + } + if (rel.getAllColumnsAsKeys && rel.getColumnNamesCount > 0) { + throw InvalidPlanInput("Cannot deduplicate on both all columns and a subset of columns") + } + if (!rel.getAllColumnsAsKeys && rel.getColumnNamesCount == 0) { + throw InvalidPlanInput( + "Deduplicate requires to either deduplicate on all columns or a subset of columns") + } + val queryExecution = new QueryExecution(session, transformRelation(rel.getInput)) + val resolver = session.sessionState.analyzer.resolver + val allColumns = queryExecution.analyzed.output + if (rel.getAllColumnsAsKeys) { + Deduplicate(allColumns, queryExecution.analyzed) + } else { + val toGroupColumnNames = rel.getColumnNamesList.asScala.toSeq + val groupCols = toGroupColumnNames.flatMap { (colName: String) => + // It is possibly there are more than one columns with the same name, + // so we call filter instead of find. 
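+        // Every matching column is kept as a deduplication key (duplicate names can occur,
+        // for example, after a join).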
+ val cols = allColumns.filter(col => resolver(col.name, colName)) + if (cols.isEmpty) { + throw InvalidPlanInput(s"Invalid deduplicate column ${colName}") + } + cols + } + Deduplicate(groupCols, queryExecution.analyzed) + } + } + + private def transformDataType(t: proto.DataType): DataType = { + t.getKindCase match { + case proto.DataType.KindCase.UNPARSED => + parseDatatypeString(t.getUnparsed.getDataTypeString) + case _ => DataTypeProtoConverter.toCatalystType(t) + } + } + + private[connect] def parseDatatypeString(sqlText: String): DataType = { + val parser = session.sessionState.sqlParser + try { + parser.parseTableSchema(sqlText) + } catch { + case e: ParseException => + try { + parser.parseDataType(sqlText) + } catch { + case _: ParseException => + try { + parser.parseDataType(s"struct<${sqlText.trim}>") + } catch { + case _: ParseException => + throw e + } + } + } + } + + private def transformLocalRelation(rel: proto.LocalRelation): LogicalPlan = { + var schema: StructType = null + if (rel.hasSchema) { + val schemaType = DataType.parseTypeWithFallback( + rel.getSchema, + parseDatatypeString, + fallbackParser = DataType.fromJson) + schema = schemaType match { + case s: StructType => s + case d => StructType(Seq(StructField("value", d))) + } + } + + if (rel.hasData) { + val (rows, structType) = ArrowConverters.fromBatchWithSchemaIterator( + Iterator(rel.getData.toByteArray), + TaskContext.get()) + if (structType == null) { + throw InvalidPlanInput(s"Input data for LocalRelation does not produce a schema.") + } + val attributes = structType.toAttributes + val proj = UnsafeProjection.create(attributes, attributes) + val data = rows.map(proj) + + if (schema == null) { + logical.LocalRelation(attributes, data.map(_.copy()).toSeq) + } else { + def normalize(dt: DataType): DataType = dt match { + case udt: UserDefinedType[_] => normalize(udt.sqlType) + case StructType(fields) => + val newFields = fields.zipWithIndex.map { + case (StructField(_, dataType, nullable, metadata), i) => + StructField(s"col_$i", normalize(dataType), nullable, metadata) + } + StructType(newFields) + case ArrayType(elementType, containsNull) => + ArrayType(normalize(elementType), containsNull) + case MapType(keyType, valueType, valueContainsNull) => + MapType(normalize(keyType), normalize(valueType), valueContainsNull) + case _ => dt + } + + val normalized = normalize(schema).asInstanceOf[StructType] + + val project = Dataset + .ofRows( + session, + logicalPlan = + logical.LocalRelation(normalize(structType).asInstanceOf[StructType].toAttributes)) + .toDF(normalized.names: _*) + .to(normalized) + .logicalPlan + .asInstanceOf[Project] + + val proj = UnsafeProjection.create(project.projectList, project.child.output) + logical.LocalRelation(schema.toAttributes, data.map(proj).map(_.copy()).toSeq) + } + } else { + if (schema == null) { + throw InvalidPlanInput( + s"Schema for LocalRelation is required when the input data is not provided.") + } + LocalRelation(schema.toAttributes, data = Seq.empty) + } + } + + private def transformReadRel(rel: proto.Read): LogicalPlan = { + rel.getReadTypeCase match { + case proto.Read.ReadTypeCase.NAMED_TABLE => + val multipartIdentifier = + CatalystSqlParser.parseMultipartIdentifier(rel.getNamedTable.getUnparsedIdentifier) + UnresolvedRelation( + multipartIdentifier, + new CaseInsensitiveStringMap(rel.getNamedTable.getOptionsMap)) + + case proto.Read.ReadTypeCase.DATA_SOURCE => + val localMap = CaseInsensitiveMap[String](rel.getDataSource.getOptionsMap.asScala.toMap) + val reader = 
session.read + if (rel.getDataSource.hasFormat) { + reader.format(rel.getDataSource.getFormat) + } + localMap.foreach { case (key, value) => reader.option(key, value) } + + if (rel.getDataSource.getFormat == "jdbc" && rel.getDataSource.getPredicatesCount > 0) { + if (!localMap.contains(JDBCOptions.JDBC_URL) || + !localMap.contains(JDBCOptions.JDBC_TABLE_NAME)) { + throw InvalidPlanInput(s"Invalid jdbc params, please specify jdbc url and table.") + } + + val url = rel.getDataSource.getOptionsMap.get(JDBCOptions.JDBC_URL) + val table = rel.getDataSource.getOptionsMap.get(JDBCOptions.JDBC_TABLE_NAME) + val options = new JDBCOptions(url, table, localMap) + val predicates = rel.getDataSource.getPredicatesList.asScala.toArray + val parts: Array[Partition] = predicates.zipWithIndex.map { case (part, i) => + JDBCPartition(part, i): Partition + } + val relation = JDBCRelation(parts, options)(session) + LogicalRelation(relation) + } else if (rel.getDataSource.getPredicatesCount == 0) { + if (rel.getDataSource.hasSchema && rel.getDataSource.getSchema.nonEmpty) { + + DataType.parseTypeWithFallback( + rel.getDataSource.getSchema, + StructType.fromDDL, + fallbackParser = DataType.fromJson) match { + case s: StructType => reader.schema(s) + case other => throw InvalidPlanInput(s"Invalid schema $other") + } + } + if (rel.getDataSource.getPathsCount == 0) { + reader.load().queryExecution.analyzed + } else if (rel.getDataSource.getPathsCount == 1) { + reader.load(rel.getDataSource.getPaths(0)).queryExecution.analyzed + } else { + reader.load(rel.getDataSource.getPathsList.asScala.toSeq: _*).queryExecution.analyzed + } + } else { + throw InvalidPlanInput( + s"Predicates are not supported for ${rel.getDataSource.getFormat} data sources.") + } + + case _ => throw InvalidPlanInput(s"Does not support ${rel.getReadTypeCase.name()}") + } + } + + private def transformParse(rel: proto.Parse): LogicalPlan = { + def dataFrameReader = { + val localMap = CaseInsensitiveMap[String](rel.getOptionsMap.asScala.toMap) + val reader = session.read + if (rel.hasSchema) { + DataTypeProtoConverter.toCatalystType(rel.getSchema) match { + case s: StructType => reader.schema(s) + case other => throw InvalidPlanInput(s"Invalid schema dataType $other") + } + } + localMap.foreach { case (key, value) => reader.option(key, value) } + reader + } + def ds: Dataset[String] = Dataset(session, transformRelation(rel.getInput))(Encoders.STRING) + + rel.getFormat match { + case ParseFormat.PARSE_FORMAT_CSV => + dataFrameReader.csv(ds).queryExecution.analyzed + case ParseFormat.PARSE_FORMAT_JSON => + dataFrameReader.json(ds).queryExecution.analyzed + case _ => throw InvalidPlanInput("Does not support " + rel.getFormat.name()) + } + } + + private def transformFilter(rel: proto.Filter): LogicalPlan = { + assert(rel.hasInput) + val baseRel = transformRelation(rel.getInput) + logical.Filter(condition = transformExpression(rel.getCondition), child = baseRel) + } + + private def transformProject(rel: proto.Project): LogicalPlan = { + val baseRel = if (rel.hasInput) { + transformRelation(rel.getInput) + } else { + logical.OneRowRelation() + } + + val projection = rel.getExpressionsList.asScala.toSeq + .map(transformExpression) + .map(toNamedExpression) + + logical.Project(projectList = projection, child = baseRel) + } + + /** + * Transforms an input protobuf expression into the Catalyst expression. This is usually not + * called directly. 
Typically the planner will traverse the expressions automatically, only + * plugins are expected to manually perform expression transformations. + * + * @param exp + * the input expression + * @return + * Catalyst expression + */ + def transformExpression(exp: proto.Expression): Expression = { + exp.getExprTypeCase match { + case proto.Expression.ExprTypeCase.LITERAL => transformLiteral(exp.getLiteral) + case proto.Expression.ExprTypeCase.UNRESOLVED_ATTRIBUTE => + transformUnresolvedAttribute(exp.getUnresolvedAttribute) + case proto.Expression.ExprTypeCase.UNRESOLVED_FUNCTION => + transformUnregisteredFunction(exp.getUnresolvedFunction) + .getOrElse(transformUnresolvedFunction(exp.getUnresolvedFunction)) + case proto.Expression.ExprTypeCase.ALIAS => transformAlias(exp.getAlias) + case proto.Expression.ExprTypeCase.EXPRESSION_STRING => + transformExpressionString(exp.getExpressionString) + case proto.Expression.ExprTypeCase.UNRESOLVED_STAR => + transformUnresolvedStar(exp.getUnresolvedStar) + case proto.Expression.ExprTypeCase.CAST => transformCast(exp.getCast) + case proto.Expression.ExprTypeCase.UNRESOLVED_REGEX => + transformUnresolvedRegex(exp.getUnresolvedRegex) + case proto.Expression.ExprTypeCase.UNRESOLVED_EXTRACT_VALUE => + transformUnresolvedExtractValue(exp.getUnresolvedExtractValue) + case proto.Expression.ExprTypeCase.UPDATE_FIELDS => + transformUpdateFields(exp.getUpdateFields) + case proto.Expression.ExprTypeCase.SORT_ORDER => transformSortOrder(exp.getSortOrder) + case proto.Expression.ExprTypeCase.LAMBDA_FUNCTION => + transformLambdaFunction(exp.getLambdaFunction) + case proto.Expression.ExprTypeCase.UNRESOLVED_NAMED_LAMBDA_VARIABLE => + transformUnresolvedNamedLambdaVariable(exp.getUnresolvedNamedLambdaVariable) + case proto.Expression.ExprTypeCase.WINDOW => + transformWindowExpression(exp.getWindow) + case proto.Expression.ExprTypeCase.EXTENSION => + transformExpressionPlugin(exp.getExtension) + case proto.Expression.ExprTypeCase.COMMON_INLINE_USER_DEFINED_FUNCTION => + transformCommonInlineUserDefinedFunction(exp.getCommonInlineUserDefinedFunction) + case _ => + throw InvalidPlanInput( + s"Expression with ID: ${exp.getExprTypeCase.getNumber} is not supported") + } + } + + private def toNamedExpression(expr: Expression): NamedExpression = expr match { + case named: NamedExpression => named + case expr => UnresolvedAlias(expr) + } + + private def transformUnresolvedAttribute( + attr: proto.Expression.UnresolvedAttribute): UnresolvedAttribute = { + val expr = UnresolvedAttribute.quotedString(attr.getUnparsedIdentifier) + if (attr.hasPlanId) { + expr.setTagValue(LogicalPlan.PLAN_ID_TAG, attr.getPlanId) + } + expr + } + + private def transformExpressionPlugin(extension: ProtoAny): Expression = { + SparkConnectPluginRegistry.expressionRegistry + // Lazily traverse the collection. + .view + // Apply the transformation. + .map(p => p.transform(extension, this)) + // Find the first non-empty transformation or throw. + .find(_.nonEmpty) + .flatten + .getOrElse(throw InvalidPlanInput("No handler found for extension")) + } + + /** + * Transforms the protocol buffers literals into the appropriate Catalyst literal expression. 
+ * @return + * Expression + */ + private def transformLiteral(lit: proto.Expression.Literal): Literal = { + toCatalystExpression(lit) + } + + private def transformLimit(limit: proto.Limit): LogicalPlan = { + logical.Limit( + limitExpr = expressions.Literal(limit.getLimit, IntegerType), + transformRelation(limit.getInput)) + } + + private def transformTail(tail: proto.Tail): LogicalPlan = { + logical.Tail( + limitExpr = expressions.Literal(tail.getLimit, IntegerType), + transformRelation(tail.getInput)) + } + + private def transformOffset(offset: proto.Offset): LogicalPlan = { + logical.Offset( + offsetExpr = expressions.Literal(offset.getOffset, IntegerType), + transformRelation(offset.getInput)) + } + + /** + * Translates a scalar function from proto to the Catalyst expression. + * + * TODO(SPARK-40546) We need to homogenize the function names for binary operators. + * + * @param fun + * Proto representation of the function call. + * @return + */ + private def transformUnresolvedFunction( + fun: proto.Expression.UnresolvedFunction): Expression = { + if (fun.getIsUserDefinedFunction) { + UnresolvedFunction( + session.sessionState.sqlParser.parseFunctionIdentifier(fun.getFunctionName), + fun.getArgumentsList.asScala.map(transformExpression).toSeq, + isDistinct = fun.getIsDistinct) + } else { + UnresolvedFunction( + FunctionIdentifier(fun.getFunctionName), + fun.getArgumentsList.asScala.map(transformExpression).toSeq, + isDistinct = fun.getIsDistinct) + } + } + + /** + * Translates a user-defined function from proto to the Catalyst expression. + * + * @param fun + * Proto representation of the function call. + * @return + * Expression. + */ + private def transformCommonInlineUserDefinedFunction( + fun: proto.CommonInlineUserDefinedFunction): Expression = { + fun.getFunctionCase match { + case proto.CommonInlineUserDefinedFunction.FunctionCase.PYTHON_UDF => + transformPythonUDF(fun) + case proto.CommonInlineUserDefinedFunction.FunctionCase.SCALAR_SCALA_UDF => + transformScalarScalaUDF(fun) + case _ => + throw InvalidPlanInput( + s"Function with ID: ${fun.getFunctionCase.getNumber} is not supported") + } + } + + /** + * Translates a Scalar Scala user-defined function from proto to the Catalyst expression. + * + * @param fun + * Proto representation of the Scalar Scalar user-defined function. + * @return + * ScalaUDF. + */ + private def transformScalarScalaUDF(fun: proto.CommonInlineUserDefinedFunction): ScalaUDF = { + val udf = fun.getScalarScalaUdf + val udfPacket = + Utils.deserialize[UdfPacket](udf.getPayload.toByteArray, Utils.getContextOrSparkClassLoader) + ScalaUDF( + function = udfPacket.function, + dataType = udfPacket.outputEncoder.dataType, + children = fun.getArgumentsList.asScala.map(transformExpression).toSeq, + inputEncoders = udfPacket.inputEncoders.map(e => Option(ExpressionEncoder(e))), + outputEncoder = Option(ExpressionEncoder(udfPacket.outputEncoder)), + udfName = Option(fun.getFunctionName), + nullable = udf.getNullable, + udfDeterministic = fun.getDeterministic) + } + + /** + * Translates a Python user-defined function from proto to the Catalyst expression. + * + * @param fun + * Proto representation of the Python user-defined function. + * @return + * PythonUDF. 
+ */ + private def transformPythonUDF(fun: proto.CommonInlineUserDefinedFunction): PythonUDF = { + val udf = fun.getPythonUdf + PythonUDF( + name = fun.getFunctionName, + func = transformPythonFunction(udf), + dataType = transformDataType(udf.getOutputType), + children = fun.getArgumentsList.asScala.map(transformExpression).toSeq, + evalType = udf.getEvalType, + udfDeterministic = fun.getDeterministic) + } + + private def transformPythonFunction(fun: proto.PythonUDF): SimplePythonFunction = { + SimplePythonFunction( + command = fun.getCommand.toByteArray, + // Empty environment variables + envVars = Maps.newHashMap(), + // No imported Python libraries + pythonIncludes = Lists.newArrayList(), + pythonExec = pythonExec, + pythonVer = fun.getPythonVer, + // Empty broadcast variables + broadcastVars = Lists.newArrayList(), + // Null accumulator + accumulator = null) + } + + /** + * Translates a LambdaFunction from proto to the Catalyst expression. + */ + private def transformLambdaFunction(lambda: proto.Expression.LambdaFunction): LambdaFunction = { + if (lambda.getArgumentsCount == 0 || lambda.getArgumentsCount > 3) { + throw InvalidPlanInput( + "LambdaFunction requires 1 ~ 3 arguments, " + + s"but got ${lambda.getArgumentsCount} ones!") + } + + LambdaFunction( + function = transformExpression(lambda.getFunction), + arguments = lambda.getArgumentsList.asScala.toSeq + .map(transformUnresolvedNamedLambdaVariable)) + } + + private def transformUnresolvedNamedLambdaVariable( + variable: proto.Expression.UnresolvedNamedLambdaVariable): UnresolvedNamedLambdaVariable = { + if (variable.getNamePartsCount == 0) { + throw InvalidPlanInput("UnresolvedNamedLambdaVariable requires at least one name part!") + } + + UnresolvedNamedLambdaVariable(variable.getNamePartsList.asScala.toSeq) + } + + /** + * For some reason, not all functions are registered in 'FunctionRegistry'. For a unregistered + * function, we can still wrap it under the proto 'UnresolvedFunction', and then resolve it in + * this method. 
+ */ + private def transformUnregisteredFunction( + fun: proto.Expression.UnresolvedFunction): Option[Expression] = { + fun.getFunctionName match { + case "product" => + if (fun.getArgumentsCount != 1) { + throw InvalidPlanInput("Product requires single child expression") + } + Some( + aggregate + .Product(transformExpression(fun.getArgumentsList.asScala.head)) + .toAggregateExpression()) + + case "when" => + if (fun.getArgumentsCount == 0) { + throw InvalidPlanInput("CaseWhen requires at least one child expression") + } + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + Some(CaseWhen.createFromParser(children)) + + case "in" => + if (fun.getArgumentsCount == 0) { + throw InvalidPlanInput("In requires at least one child expression") + } + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + Some(In(children.head, children.tail)) + + case "nth_value" if fun.getArgumentsCount == 3 => + // NthValue does not have a constructor which accepts Expression typed 'ignoreNulls' + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + val ignoreNulls = children.last match { + case Literal(bool: Boolean, BooleanType) => bool + case other => + throw InvalidPlanInput(s"ignoreNulls should be a literal boolean, but got $other") + } + Some(NthValue(children(0), children(1), ignoreNulls)) + + case "lag" if fun.getArgumentsCount == 4 => + // Lag does not have a constructor which accepts Expression typed 'ignoreNulls' + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + val ignoreNulls = children.last match { + case Literal(bool: Boolean, BooleanType) => bool + case other => + throw InvalidPlanInput(s"ignoreNulls should be a literal boolean, but got $other") + } + Some(Lag(children.head, children(1), children(2), ignoreNulls)) + + case "lead" if fun.getArgumentsCount == 4 => + // Lead does not have a constructor which accepts Expression typed 'ignoreNulls' + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + val ignoreNulls = children.last match { + case Literal(bool: Boolean, BooleanType) => bool + case other => + throw InvalidPlanInput(s"ignoreNulls should be a literal boolean, but got $other") + } + Some(Lead(children.head, children(1), children(2), ignoreNulls)) + + case "window" if 2 <= fun.getArgumentsCount && fun.getArgumentsCount <= 4 => + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + val timeCol = children.head + val args = children.tail.map { + case Literal(s, StringType) if s != null => s.toString + case other => + throw InvalidPlanInput( + s"windowDuration,slideDuration,startTime should be literal strings, but got $other") + } + var windowDuration: String = null + var slideDuration: String = null + var startTime: String = null + if (args.length == 3) { + windowDuration = args(0) + slideDuration = args(1) + startTime = args(2) + } else if (args.length == 2) { + windowDuration = args(0) + slideDuration = args(1) + startTime = "0 second" + } else { + windowDuration = args(0) + slideDuration = args(0) + startTime = "0 second" + } + Some( + Alias(TimeWindow(timeCol, windowDuration, slideDuration, startTime), "window")( + nonInheritableMetadataKeys = Seq(Dataset.DATASET_ID_KEY, Dataset.COL_POS_KEY))) + + case "session_window" if fun.getArgumentsCount == 2 => + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + val timeCol = children.head + val sessionWindow = children.last match { + case Literal(s, StringType) if s != null 
=> SessionWindow(timeCol, s.toString) + case other => SessionWindow(timeCol, other) + } + Some( + Alias(sessionWindow, "session_window")(nonInheritableMetadataKeys = + Seq(Dataset.DATASET_ID_KEY, Dataset.COL_POS_KEY))) + + case "bucket" if fun.getArgumentsCount == 2 => + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + (children.head, children.last) match { + case (numBuckets: Literal, child) if numBuckets.dataType == IntegerType => + Some(Bucket(numBuckets, child)) + case (other, _) => + throw InvalidPlanInput(s"numBuckets should be a literal integer, but got $other") + } + + case "years" if fun.getArgumentsCount == 1 => + Some(Years(transformExpression(fun.getArguments(0)))) + + case "months" if fun.getArgumentsCount == 1 => + Some(Months(transformExpression(fun.getArguments(0)))) + + case "days" if fun.getArgumentsCount == 1 => + Some(Days(transformExpression(fun.getArguments(0)))) + + case "hours" if fun.getArgumentsCount == 1 => + Some(Hours(transformExpression(fun.getArguments(0)))) + + case "unwrap_udt" if fun.getArgumentsCount == 1 => + Some(UnwrapUDT(transformExpression(fun.getArguments(0)))) + + case "from_json" if Seq(2, 3).contains(fun.getArgumentsCount) => + // JsonToStructs constructor doesn't accept JSON-formatted schema. + val children = fun.getArgumentsList.asScala.toSeq.map(transformExpression) + + var schema: DataType = null + children(1) match { + case Literal(s, StringType) if s != null => + try { + schema = DataType.fromJson(s.toString) + } catch { + case _: Exception => + } + case _ => + } + + if (schema != null) { + val options = if (children.length == 3) { + // ExprUtils.convertToMapData requires the options to be resolved CreateMap, + // but the options here is not resolved yet: UnresolvedFunction("map", ...) 
+ children(2) match { + case UnresolvedFunction(Seq("map"), arguments, _, _, _) => + ExprUtils.convertToMapData(CreateMap(arguments)) + case other => + throw InvalidPlanInput( + s"Options in from_json should be created by map, but got $other") + } + } else { + Map.empty[String, String] + } + + Some( + JsonToStructs( + schema = CharVarcharUtils.failIfHasCharVarchar(schema), + options = options, + child = children.head)) + } else { + None + } + + case _ => None + } + } + + private def transformAlias(alias: proto.Expression.Alias): NamedExpression = { + if (alias.getNameCount == 1) { + val metadata = if (alias.hasMetadata() && alias.getMetadata.nonEmpty) { + Some(Metadata.fromJson(alias.getMetadata)) + } else { + None + } + Alias(transformExpression(alias.getExpr), alias.getName(0))(explicitMetadata = metadata) + } else { + if (alias.hasMetadata) { + throw InvalidPlanInput( + "Alias expressions with more than 1 identifier must not use optional metadata.") + } + MultiAlias(transformExpression(alias.getExpr), alias.getNameList.asScala.toSeq) + } + } + + private def transformExpressionString(expr: proto.Expression.ExpressionString): Expression = { + session.sessionState.sqlParser.parseExpression(expr.getExpression) + } + + private def transformUnresolvedStar(star: proto.Expression.UnresolvedStar): UnresolvedStar = { + if (star.hasUnparsedTarget) { + val target = star.getUnparsedTarget + if (!target.endsWith(".*")) { + throw InvalidPlanInput( + s"UnresolvedStar requires a unparsed target ending with '.*', " + + s"but got $target.") + } + + UnresolvedStar( + Some(UnresolvedAttribute.parseAttributeName(target.substring(0, target.length - 2)))) + } else { + UnresolvedStar(None) + } + } + + private def transformCast(cast: proto.Expression.Cast): Expression = { + cast.getCastToTypeCase match { + case proto.Expression.Cast.CastToTypeCase.TYPE => + Cast(transformExpression(cast.getExpr), transformDataType(cast.getType)) + case _ => + Cast( + transformExpression(cast.getExpr), + session.sessionState.sqlParser.parseDataType(cast.getTypeStr)) + } + } + + private def transformUnresolvedRegex(regex: proto.Expression.UnresolvedRegex): Expression = { + val caseSensitive = session.sessionState.conf.caseSensitiveAnalysis + regex.getColName match { + case ParserUtils.escapedIdentifier(columnNameRegex) => + UnresolvedRegex(columnNameRegex, None, caseSensitive) + case ParserUtils.qualifiedEscapedIdentifier(nameParts, columnNameRegex) => + UnresolvedRegex(columnNameRegex, Some(nameParts), caseSensitive) + case _ => + val expr = UnresolvedAttribute.quotedString(regex.getColName) + if (regex.hasPlanId) { + expr.setTagValue(LogicalPlan.PLAN_ID_TAG, regex.getPlanId) + } + expr + } + } + + private def transformUnresolvedExtractValue( + extract: proto.Expression.UnresolvedExtractValue): UnresolvedExtractValue = { + UnresolvedExtractValue( + transformExpression(extract.getChild), + transformExpression(extract.getExtraction)) + } + + private def transformUpdateFields(update: proto.Expression.UpdateFields): UpdateFields = { + if (update.hasValueExpression) { + // add or replace a field + UpdateFields.apply( + col = transformExpression(update.getStructExpression), + fieldName = update.getFieldName, + expr = transformExpression(update.getValueExpression)) + } else { + // drop a field + UpdateFields.apply( + col = transformExpression(update.getStructExpression), + fieldName = update.getFieldName) + } + } + + private def transformWindowExpression(window: proto.Expression.Window) = { + if (!window.hasWindowFunction) { + throw 
InvalidPlanInput(s"WindowFunction is required in WindowExpression") + } + + val frameSpec = if (window.hasFrameSpec) { + val protoFrameSpec = window.getFrameSpec + + val frameType = protoFrameSpec.getFrameType match { + case proto.Expression.Window.WindowFrame.FrameType.FRAME_TYPE_ROW => RowFrame + + case proto.Expression.Window.WindowFrame.FrameType.FRAME_TYPE_RANGE => RangeFrame + + case other => throw InvalidPlanInput(s"Unknown FrameType $other") + } + + if (!protoFrameSpec.hasLower) { + throw InvalidPlanInput(s"LowerBound is required in WindowFrame") + } + val lower = protoFrameSpec.getLower.getBoundaryCase match { + case proto.Expression.Window.WindowFrame.FrameBoundary.BoundaryCase.CURRENT_ROW => + CurrentRow + + case proto.Expression.Window.WindowFrame.FrameBoundary.BoundaryCase.UNBOUNDED => + UnboundedPreceding + + case proto.Expression.Window.WindowFrame.FrameBoundary.BoundaryCase.VALUE => + transformExpression(protoFrameSpec.getLower.getValue) + + case other => throw InvalidPlanInput(s"Unknown FrameBoundary $other") + } + + if (!protoFrameSpec.hasUpper) { + throw InvalidPlanInput(s"UpperBound is required in WindowFrame") + } + val upper = protoFrameSpec.getUpper.getBoundaryCase match { + case proto.Expression.Window.WindowFrame.FrameBoundary.BoundaryCase.CURRENT_ROW => + CurrentRow + + case proto.Expression.Window.WindowFrame.FrameBoundary.BoundaryCase.UNBOUNDED => + UnboundedFollowing + + case proto.Expression.Window.WindowFrame.FrameBoundary.BoundaryCase.VALUE => + transformExpression(protoFrameSpec.getUpper.getValue) + + case other => throw InvalidPlanInput(s"Unknown FrameBoundary $other") + } + + SpecifiedWindowFrame(frameType = frameType, lower = lower, upper = upper) + + } else { + UnspecifiedFrame + } + + val windowSpec = WindowSpecDefinition( + partitionSpec = window.getPartitionSpecList.asScala.toSeq.map(transformExpression), + orderSpec = window.getOrderSpecList.asScala.toSeq.map(transformSortOrder), + frameSpecification = frameSpec) + + WindowExpression( + windowFunction = transformExpression(window.getWindowFunction), + windowSpec = windowSpec) + } + + private def transformSetOperation(u: proto.SetOperation): LogicalPlan = { + if (!u.hasLeftInput || !u.hasRightInput) { + throw InvalidPlanInput("Set operation must have 2 inputs") + } + val leftPlan = transformRelation(u.getLeftInput) + val rightPlan = transformRelation(u.getRightInput) + val isAll = if (u.hasIsAll) u.getIsAll else false + + u.getSetOpType match { + case proto.SetOperation.SetOpType.SET_OP_TYPE_EXCEPT => + if (u.getByName) { + throw InvalidPlanInput("Except does not support union_by_name") + } + Except(leftPlan, rightPlan, isAll) + case proto.SetOperation.SetOpType.SET_OP_TYPE_INTERSECT => + if (u.getByName) { + throw InvalidPlanInput("Intersect does not support union_by_name") + } + Intersect(leftPlan, rightPlan, isAll) + case proto.SetOperation.SetOpType.SET_OP_TYPE_UNION => + if (!u.getByName && u.getAllowMissingColumns) { + throw InvalidPlanInput( + "UnionByName `allowMissingCol` can be true only if `byName` is true.") + } + val union = Union(Seq(leftPlan, rightPlan), u.getByName, u.getAllowMissingColumns) + if (isAll) { + union + } else { + logical.Distinct(union) + } + + case _ => + throw InvalidPlanInput(s"Unsupported set operation ${u.getSetOpTypeValue}") + } + } + + private def transformJoin(rel: proto.Join): LogicalPlan = { + assert(rel.hasLeft && rel.hasRight, "Both join sides must be present") + if (rel.hasJoinCondition && rel.getUsingColumnsCount > 0) { + throw InvalidPlanInput( + s"Using 
columns or join conditions cannot be set at the same time in Join") + } + val joinCondition = + if (rel.hasJoinCondition) Some(transformExpression(rel.getJoinCondition)) else None + val catalystJointype = transformJoinType( + if (rel.getJoinType != null) rel.getJoinType else proto.Join.JoinType.JOIN_TYPE_INNER) + val joinType = if (rel.getUsingColumnsCount > 0) { + UsingJoin(catalystJointype, rel.getUsingColumnsList.asScala.toSeq) + } else { + catalystJointype + } + logical.Join( + left = transformRelation(rel.getLeft), + right = transformRelation(rel.getRight), + joinType = joinType, + condition = joinCondition, + hint = logical.JoinHint.NONE) + } + + private def transformJoinType(t: proto.Join.JoinType): JoinType = { + t match { + case proto.Join.JoinType.JOIN_TYPE_INNER => Inner + case proto.Join.JoinType.JOIN_TYPE_LEFT_ANTI => LeftAnti + case proto.Join.JoinType.JOIN_TYPE_FULL_OUTER => FullOuter + case proto.Join.JoinType.JOIN_TYPE_LEFT_OUTER => LeftOuter + case proto.Join.JoinType.JOIN_TYPE_RIGHT_OUTER => RightOuter + case proto.Join.JoinType.JOIN_TYPE_LEFT_SEMI => LeftSemi + case proto.Join.JoinType.JOIN_TYPE_CROSS => Cross + case _ => throw InvalidPlanInput(s"Join type ${t} is not supported") + } + } + + private def transformSort(sort: proto.Sort): LogicalPlan = { + assert(sort.getOrderCount > 0, "'order' must be present and contain elements.") + logical.Sort( + child = transformRelation(sort.getInput), + global = sort.getIsGlobal, + order = sort.getOrderList.asScala.toSeq.map(transformSortOrder)) + } + + private def transformSortOrder(order: proto.Expression.SortOrder) = { + expressions.SortOrder( + child = transformExpression(order.getChild), + direction = order.getDirection match { + case proto.Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING => + expressions.Ascending + case _ => expressions.Descending + }, + nullOrdering = order.getNullOrdering match { + case proto.Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST => + expressions.NullsFirst + case _ => expressions.NullsLast + }, + sameOrderExpressions = Seq.empty) + } + + private def transformDrop(rel: proto.Drop): LogicalPlan = { + var output = Dataset.ofRows(session, transformRelation(rel.getInput)) + if (rel.getColumnsCount > 0) { + val cols = rel.getColumnsList.asScala.toSeq.map(expr => Column(transformExpression(expr))) + output = output.drop(cols.head, cols.tail: _*) + } + if (rel.getColumnNamesCount > 0) { + val colNames = rel.getColumnNamesList.asScala.toSeq + output = output.drop(colNames: _*) + } + output.logicalPlan + } + + private def transformAggregate(rel: proto.Aggregate): LogicalPlan = { + if (!rel.hasInput) { + throw InvalidPlanInput("Aggregate needs a plan input") + } + val input = transformRelation(rel.getInput) + + val groupingExprs = rel.getGroupingExpressionsList.asScala.toSeq.map(transformExpression) + val aggExprs = rel.getAggregateExpressionsList.asScala.toSeq.map(transformExpression) + val aliasedAgg = (groupingExprs ++ aggExprs).map(toNamedExpression) + + rel.getGroupType match { + case proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY => + logical.Aggregate( + groupingExpressions = groupingExprs, + aggregateExpressions = aliasedAgg, + child = input) + + case proto.Aggregate.GroupType.GROUP_TYPE_ROLLUP => + logical.Aggregate( + groupingExpressions = Seq(Rollup(groupingExprs.map(Seq(_)))), + aggregateExpressions = aliasedAgg, + child = input) + + case proto.Aggregate.GroupType.GROUP_TYPE_CUBE => + logical.Aggregate( + groupingExpressions = Seq(Cube(groupingExprs.map(Seq(_)))), + 
aggregateExpressions = aliasedAgg, + child = input) + + case proto.Aggregate.GroupType.GROUP_TYPE_PIVOT => + if (!rel.hasPivot) { + throw InvalidPlanInput("Aggregate with GROUP_TYPE_PIVOT requires a Pivot") + } + + val pivotExpr = transformExpression(rel.getPivot.getCol) + + var valueExprs = rel.getPivot.getValuesList.asScala.toSeq.map(transformLiteral) + if (valueExprs.isEmpty) { + // This is to prevent unintended OOM errors when the number of distinct values is large + val maxValues = session.sessionState.conf.dataFramePivotMaxValues + // Get the distinct values of the column and sort them so its consistent + val pivotCol = Column(pivotExpr) + valueExprs = Dataset + .ofRows(session, input) + .select(pivotCol) + .distinct() + .limit(maxValues + 1) + .sort(pivotCol) // ensure that the output columns are in a consistent logical order + .collect() + .map(_.get(0)) + .toSeq + .map(expressions.Literal.apply) + } + + logical.Pivot( + groupByExprsOpt = Some(groupingExprs.map(toNamedExpression)), + pivotColumn = pivotExpr, + pivotValues = valueExprs, + aggregates = aggExprs, + child = input) + + case other => throw InvalidPlanInput(s"Unknown Group Type $other") + } + } + + def process( + command: proto.Command, + sessionId: String, + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { + command.getCommandTypeCase match { + case proto.Command.CommandTypeCase.REGISTER_FUNCTION => + handleRegisterUserDefinedFunction(command.getRegisterFunction) + case proto.Command.CommandTypeCase.WRITE_OPERATION => + handleWriteOperation(command.getWriteOperation) + case proto.Command.CommandTypeCase.CREATE_DATAFRAME_VIEW => + handleCreateViewCommand(command.getCreateDataframeView) + case proto.Command.CommandTypeCase.WRITE_OPERATION_V2 => + handleWriteOperationV2(command.getWriteOperationV2) + case proto.Command.CommandTypeCase.EXTENSION => + handleCommandPlugin(command.getExtension) + case proto.Command.CommandTypeCase.SQL_COMMAND => + handleSqlCommand(command.getSqlCommand, sessionId, responseObserver) + case _ => throw new UnsupportedOperationException(s"$command not supported.") + } + } + + def handleSqlCommand( + getSqlCommand: SqlCommand, + sessionId: String, + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { + // Eagerly execute commands of the provided SQL string. + val df = session.sql( + getSqlCommand.getSql, + getSqlCommand.getArgsMap.asScala.mapValues(transformLiteral).toMap) + // Check if commands have been executed. + val isCommand = df.queryExecution.commandExecuted.isInstanceOf[CommandResult] + val rows = df.logicalPlan match { + case lr: LocalRelation => lr.data + case cr: CommandResult => cr.rows + case _ => Seq.empty + } + + // Convert the results to Arrow. + val schema = df.schema + val maxRecordsPerBatch = session.sessionState.conf.arrowMaxRecordsPerBatch + val maxBatchSize = (SparkEnv.get.conf.get(CONNECT_GRPC_ARROW_MAX_BATCH_SIZE) * 0.7).toLong + val timeZoneId = session.sessionState.conf.sessionLocalTimeZone + + // Convert the data. + val bytes = if (rows.isEmpty) { + ArrowConverters.createEmptyArrowBatch(schema, timeZoneId) + } else { + val batches = ArrowConverters.toBatchWithSchemaIterator( + rows.iterator, + schema, + maxRecordsPerBatch, + maxBatchSize, + timeZoneId) + assert(batches.hasNext) + val bytes = batches.next() + assert(!batches.hasNext, s"remaining batches: ${batches.size}") + bytes + } + + // To avoid explicit handling of the result on the client, we build the expected input + // of the relation on the server. 
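+ // For commands, the eagerly collected rows are returned as an Arrow-encoded LocalRelation; for plain
+ // queries, only the original SQL text and arguments are echoed back so the client can re-issue the
+ // query as a regular, lazily planned SQL relation.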
The client has to simply forward the result. + val result = SqlCommandResult.newBuilder() + if (isCommand) { + result.setRelation( + proto.Relation + .newBuilder() + .setLocalRelation( + proto.LocalRelation + .newBuilder() + .setData(ByteString.copyFrom(bytes)))) + } else { + result.setRelation( + proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery(getSqlCommand.getSql) + .putAllArgs(getSqlCommand.getArgsMap))) + } + // Exactly one SQL Command Result Batch + responseObserver.onNext( + ExecutePlanResponse + .newBuilder() + .setSessionId(sessionId) + .setSqlCommandResult(result) + .build()) + + // Send Metrics + responseObserver.onNext(SparkConnectStreamHandler.createMetricsResponse(sessionId, df)) + } + + private def handleRegisterUserDefinedFunction( + fun: proto.CommonInlineUserDefinedFunction): Unit = { + fun.getFunctionCase match { + case proto.CommonInlineUserDefinedFunction.FunctionCase.PYTHON_UDF => + handleRegisterPythonUDF(fun) + case proto.CommonInlineUserDefinedFunction.FunctionCase.JAVA_UDF => + handleRegisterJavaUDF(fun) + case _ => + throw InvalidPlanInput( + s"Function with ID: ${fun.getFunctionCase.getNumber} is not supported") + } + } + + private def handleRegisterPythonUDF(fun: proto.CommonInlineUserDefinedFunction): Unit = { + val udf = fun.getPythonUdf + val function = transformPythonFunction(udf) + val udpf = UserDefinedPythonFunction( + name = fun.getFunctionName, + func = function, + dataType = transformDataType(udf.getOutputType), + pythonEvalType = udf.getEvalType, + udfDeterministic = fun.getDeterministic) + + session.udf.registerPython(fun.getFunctionName, udpf) + } + + private def handleRegisterJavaUDF(fun: proto.CommonInlineUserDefinedFunction): Unit = { + val udf = fun.getJavaUdf + val dataType = if (udf.hasOutputType) { + transformDataType(udf.getOutputType) + } else { + null + } + if (udf.getAggregate) { + session.udf.registerJavaUDAF(fun.getFunctionName, udf.getClassName) + } else { + session.udf.registerJava(fun.getFunctionName, udf.getClassName, dataType) + } + } + + private def handleCommandPlugin(extension: ProtoAny): Unit = { + SparkConnectPluginRegistry.commandRegistry + // Lazily traverse the collection. + .view + // Apply the transformation. + .map(p => p.process(extension, this)) + // Find the first non-empty transformation or throw. + .find(_.nonEmpty) + .flatten + .getOrElse(throw InvalidPlanInput("No handler found for extension")) + } + + private def handleCreateViewCommand(createView: proto.CreateDataFrameViewCommand): Unit = { + val viewType = if (createView.getIsGlobal) GlobalTempView else LocalTempView + + val tableIdentifier = + try { + session.sessionState.sqlParser.parseTableIdentifier(createView.getName) + } catch { + case _: ParseException => + throw QueryCompilationErrors.invalidViewNameError(createView.getName) + } + + val plan = CreateViewCommand( + name = tableIdentifier, + userSpecifiedColumns = Nil, + comment = None, + properties = Map.empty, + originalText = None, + plan = transformRelation(createView.getInput), + allowExisting = false, + replace = createView.getReplace, + viewType = viewType) + + Dataset.ofRows(session, plan).queryExecution.commandExecuted + } + + /** + * Transforms the write operation and executes it. + * + * The input write operation contains a reference to the input plan and transforms it to the + * corresponding logical plan. Afterwards, creates the DataFrameWriter and translates the + * parameters of the WriteOperation into the corresponding methods calls. 
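+ * For example, a WriteOperation carrying source "parquet", mode SAVE_MODE_OVERWRITE and a path is
+ * translated into df.write.format("parquet").mode(SaveMode.Overwrite).save(path).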
+ * + * @param writeOperation + */ + private def handleWriteOperation(writeOperation: proto.WriteOperation): Unit = { + // Transform the input plan into the logical plan. + val planner = new SparkConnectPlanner(session) + val plan = planner.transformRelation(writeOperation.getInput) + // And create a Dataset from the plan. + val dataset = Dataset.ofRows(session, logicalPlan = plan) + + val w = dataset.write + if (writeOperation.getMode != proto.WriteOperation.SaveMode.SAVE_MODE_UNSPECIFIED) { + w.mode(SaveModeConverter.toSaveMode(writeOperation.getMode)) + } + + if (writeOperation.getOptionsCount > 0) { + writeOperation.getOptionsMap.asScala.foreach { case (key, value) => w.option(key, value) } + } + + if (writeOperation.getSortColumnNamesCount > 0) { + val names = writeOperation.getSortColumnNamesList.asScala + w.sortBy(names.head, names.tail.toSeq: _*) + } + + if (writeOperation.hasBucketBy) { + val op = writeOperation.getBucketBy + val cols = op.getBucketColumnNamesList.asScala + if (op.getNumBuckets <= 0) { + throw InvalidCommandInput( + s"BucketBy must specify a bucket count > 0, received ${op.getNumBuckets} instead.") + } + w.bucketBy(op.getNumBuckets, cols.head, cols.tail.toSeq: _*) + } + + if (writeOperation.getPartitioningColumnsCount > 0) { + val names = writeOperation.getPartitioningColumnsList.asScala + w.partitionBy(names.toSeq: _*) + } + + if (writeOperation.hasSource) { + w.format(writeOperation.getSource) + } + + writeOperation.getSaveTypeCase match { + case proto.WriteOperation.SaveTypeCase.SAVETYPE_NOT_SET => w.save() + case proto.WriteOperation.SaveTypeCase.PATH => w.save(writeOperation.getPath) + case proto.WriteOperation.SaveTypeCase.TABLE => + val tableName = writeOperation.getTable.getTableName + writeOperation.getTable.getSaveMethod match { + case proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE => + w.saveAsTable(tableName) + case proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO => + w.insertInto(tableName) + case _ => + throw new UnsupportedOperationException( + "WriteOperation:SaveTable:TableSaveMethod not supported " + + s"${writeOperation.getTable.getSaveMethodValue}") + } + case _ => + throw new UnsupportedOperationException( + "WriteOperation:SaveTypeCase not supported " + + s"${writeOperation.getSaveTypeCase.getNumber}") + } + } + + /** + * Transforms the write operation and executes it. + * + * The input write operation contains a reference to the input plan and transforms it to the + * corresponding logical plan. Afterwards, creates the DataFrameWriter and translates the + * parameters of the WriteOperation into the corresponding methods calls. + * + * @param writeOperation + */ + def handleWriteOperationV2(writeOperation: proto.WriteOperationV2): Unit = { + // Transform the input plan into the logical plan. + val planner = new SparkConnectPlanner(session) + val plan = planner.transformRelation(writeOperation.getInput) + // And create a Dataset from the plan. 
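+ // The Dataset is only used to obtain a DataFrameWriterV2; no job runs until one of the
+ // mode-specific calls below (create, append, overwrite, etc.) is invoked.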
+ val dataset = Dataset.ofRows(session, logicalPlan = plan) + + val w = dataset.writeTo(table = writeOperation.getTableName) + + if (writeOperation.getOptionsCount > 0) { + writeOperation.getOptionsMap.asScala.foreach { case (key, value) => w.option(key, value) } + } + + if (writeOperation.getTablePropertiesCount > 0) { + writeOperation.getTablePropertiesMap.asScala.foreach { case (key, value) => + w.tableProperty(key, value) + } + } + + if (writeOperation.getPartitioningColumnsCount > 0) { + val names = writeOperation.getPartitioningColumnsList.asScala + .map(transformExpression) + .map(Column(_)) + .toSeq + w.partitionedBy(names.head, names.tail.toSeq: _*) + } + + writeOperation.getMode match { + case proto.WriteOperationV2.Mode.MODE_CREATE => + if (writeOperation.hasProvider) { + w.using(writeOperation.getProvider).create() + } else { + w.create() + } + case proto.WriteOperationV2.Mode.MODE_OVERWRITE => + w.overwrite(Column(transformExpression(writeOperation.getOverwriteCondition))) + case proto.WriteOperationV2.Mode.MODE_OVERWRITE_PARTITIONS => + w.overwritePartitions() + case proto.WriteOperationV2.Mode.MODE_APPEND => + w.append() + case proto.WriteOperationV2.Mode.MODE_REPLACE => + if (writeOperation.hasProvider) { + w.using(writeOperation.getProvider).replace() + } else { + w.replace() + } + case proto.WriteOperationV2.Mode.MODE_CREATE_OR_REPLACE => + if (writeOperation.hasProvider) { + w.using(writeOperation.getProvider).createOrReplace() + } else { + w.createOrReplace() + } + case _ => + throw new UnsupportedOperationException( + s"WriteOperationV2:ModeValue not supported ${writeOperation.getModeValue}") + } + } + + private val emptyLocalRelation = LocalRelation( + output = AttributeReference("value", StringType, false)() :: Nil, + data = Seq.empty) + + private def transformCurrentDatabase(getCurrentDatabase: proto.CurrentDatabase): LogicalPlan = { + session.createDataset(session.catalog.currentDatabase :: Nil)(Encoders.STRING).logicalPlan + } + + private def transformSetCurrentDatabase( + getSetCurrentDatabase: proto.SetCurrentDatabase): LogicalPlan = { + session.catalog.setCurrentDatabase(getSetCurrentDatabase.getDbName) + emptyLocalRelation + } + + private def transformListDatabases(getListDatabases: proto.ListDatabases): LogicalPlan = { + session.catalog.listDatabases().logicalPlan + } + + private def transformListTables(getListTables: proto.ListTables): LogicalPlan = { + if (getListTables.hasDbName) { + session.catalog.listTables(getListTables.getDbName).logicalPlan + } else { + session.catalog.listTables().logicalPlan + } + } + + private def transformListFunctions(getListFunctions: proto.ListFunctions): LogicalPlan = { + if (getListFunctions.hasDbName) { + session.catalog.listFunctions(getListFunctions.getDbName).logicalPlan + } else { + session.catalog.listFunctions().logicalPlan + } + } + + private def transformListColumns(getListColumns: proto.ListColumns): LogicalPlan = { + if (getListColumns.hasDbName) { + session.catalog + .listColumns(dbName = getListColumns.getDbName, tableName = getListColumns.getTableName) + .logicalPlan + } else { + session.catalog.listColumns(getListColumns.getTableName).logicalPlan + } + } + + private def transformGetDatabase(getGetDatabase: proto.GetDatabase): LogicalPlan = { + CatalogImpl + .makeDataset(session.catalog.getDatabase(getGetDatabase.getDbName) :: Nil, session) + .logicalPlan + } + + private def transformGetTable(getGetTable: proto.GetTable): LogicalPlan = { + if (getGetTable.hasDbName) { + CatalogImpl + .makeDataset( + 
session.catalog.getTable( + dbName = getGetTable.getDbName, + tableName = getGetTable.getTableName) :: Nil, + session) + .logicalPlan + } else { + CatalogImpl + .makeDataset(session.catalog.getTable(getGetTable.getTableName) :: Nil, session) + .logicalPlan + } + } + + private def transformGetFunction(getGetFunction: proto.GetFunction): LogicalPlan = { + if (getGetFunction.hasDbName) { + CatalogImpl + .makeDataset( + session.catalog.getFunction( + dbName = getGetFunction.getDbName, + functionName = getGetFunction.getFunctionName) :: Nil, + session) + .logicalPlan + } else { + CatalogImpl + .makeDataset(session.catalog.getFunction(getGetFunction.getFunctionName) :: Nil, session) + .logicalPlan + } + } + + private def transformDatabaseExists(getDatabaseExists: proto.DatabaseExists): LogicalPlan = { + session + .createDataset(session.catalog.databaseExists(getDatabaseExists.getDbName) :: Nil)( + Encoders.scalaBoolean) + .logicalPlan + } + + private def transformTableExists(getTableExists: proto.TableExists): LogicalPlan = { + if (getTableExists.hasDbName) { + session + .createDataset( + session.catalog.tableExists( + dbName = getTableExists.getDbName, + tableName = getTableExists.getTableName) :: Nil)(Encoders.scalaBoolean) + .logicalPlan + } else { + session + .createDataset(session.catalog.tableExists(getTableExists.getTableName) :: Nil)( + Encoders.scalaBoolean) + .logicalPlan + } + } + + private def transformFunctionExists(getFunctionExists: proto.FunctionExists): LogicalPlan = { + if (getFunctionExists.hasDbName) { + session + .createDataset( + session.catalog.functionExists( + dbName = getFunctionExists.getDbName, + functionName = getFunctionExists.getFunctionName) :: Nil)(Encoders.scalaBoolean) + .logicalPlan + } else { + session + .createDataset(session.catalog.functionExists(getFunctionExists.getFunctionName) :: Nil)( + Encoders.scalaBoolean) + .logicalPlan + } + } + + private def transformCreateExternalTable( + getCreateExternalTable: proto.CreateExternalTable): LogicalPlan = { + val schema = if (getCreateExternalTable.hasSchema) { + val struct = transformDataType(getCreateExternalTable.getSchema) + assert(struct.isInstanceOf[StructType]) + struct.asInstanceOf[StructType] + } else { + new StructType + } + + val source = if (getCreateExternalTable.hasSource) { + getCreateExternalTable.getSource + } else { + session.sessionState.conf.defaultDataSourceName + } + + val options = if (getCreateExternalTable.hasPath) { + (getCreateExternalTable.getOptionsMap.asScala ++ + Map("path" -> getCreateExternalTable.getPath)).asJava + } else { + getCreateExternalTable.getOptionsMap + } + session.catalog + .createTable( + tableName = getCreateExternalTable.getTableName, + source = source, + schema = schema, + options = options) + .logicalPlan + } + + private def transformCreateTable(getCreateTable: proto.CreateTable): LogicalPlan = { + val schema = if (getCreateTable.hasSchema) { + val struct = transformDataType(getCreateTable.getSchema) + assert(struct.isInstanceOf[StructType]) + struct.asInstanceOf[StructType] + } else { + new StructType + } + + val source = if (getCreateTable.hasSource) { + getCreateTable.getSource + } else { + session.sessionState.conf.defaultDataSourceName + } + + val description = if (getCreateTable.hasDescription) { + getCreateTable.getDescription + } else { + "" + } + + val options = if (getCreateTable.hasPath) { + (getCreateTable.getOptionsMap.asScala ++ + Map("path" -> getCreateTable.getPath)).asJava + } else { + getCreateTable.getOptionsMap + } + + session.catalog + 
.createTable( + tableName = getCreateTable.getTableName, + source = source, + schema = schema, + description = description, + options = options) + .logicalPlan + } + + private def transformDropTempView(getDropTempView: proto.DropTempView): LogicalPlan = { + session + .createDataset(session.catalog.dropTempView(getDropTempView.getViewName) :: Nil)( + Encoders.scalaBoolean) + .logicalPlan + } + + private def transformDropGlobalTempView( + getDropGlobalTempView: proto.DropGlobalTempView): LogicalPlan = { + session + .createDataset( + session.catalog.dropGlobalTempView(getDropGlobalTempView.getViewName) :: Nil)( + Encoders.scalaBoolean) + .logicalPlan + } + + private def transformRecoverPartitions( + getRecoverPartitions: proto.RecoverPartitions): LogicalPlan = { + session.catalog.recoverPartitions(getRecoverPartitions.getTableName) + emptyLocalRelation + } + + private def transformIsCached(getIsCached: proto.IsCached): LogicalPlan = { + session + .createDataset(session.catalog.isCached(getIsCached.getTableName) :: Nil)( + Encoders.scalaBoolean) + .logicalPlan + } + + private def transformCacheTable(getCacheTable: proto.CacheTable): LogicalPlan = { + session.catalog.cacheTable(getCacheTable.getTableName) + emptyLocalRelation + } + + private def transformUncacheTable(getUncacheTable: proto.UncacheTable): LogicalPlan = { + session.catalog.uncacheTable(getUncacheTable.getTableName) + emptyLocalRelation + } + + private def transformClearCache(getClearCache: proto.ClearCache): LogicalPlan = { + session.catalog.clearCache() + emptyLocalRelation + } + + private def transformRefreshTable(getRefreshTable: proto.RefreshTable): LogicalPlan = { + session.catalog.refreshTable(getRefreshTable.getTableName) + emptyLocalRelation + } + + private def transformRefreshByPath(getRefreshByPath: proto.RefreshByPath): LogicalPlan = { + session.catalog.refreshByPath(getRefreshByPath.getPath) + emptyLocalRelation + } + + private def transformCurrentCatalog(getCurrentCatalog: proto.CurrentCatalog): LogicalPlan = { + session.createDataset(session.catalog.currentCatalog() :: Nil)(Encoders.STRING).logicalPlan + } + + private def transformSetCurrentCatalog( + getSetCurrentCatalog: proto.SetCurrentCatalog): LogicalPlan = { + session.catalog.setCurrentCatalog(getSetCurrentCatalog.getCatalogName) + emptyLocalRelation + } + + private def transformListCatalogs(getListCatalogs: proto.ListCatalogs): LogicalPlan = { + session.catalog.listCatalogs().logicalPlan + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/TableSaveMethodConverter.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/TableSaveMethodConverter.scala new file mode 100644 index 0000000000000..d3dfee405eae9 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/planner/TableSaveMethodConverter.scala @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.planner + +import java.util.Locale + +import org.apache.spark.connect.proto + +/** + * Helper class for conversions between save table method string and + * [[proto.WriteOperation.SaveTable.TableSaveMethod]]. + */ +object TableSaveMethodConverter { + def toTableSaveMethodProto(method: String): proto.WriteOperation.SaveTable.TableSaveMethod = { + method.toLowerCase(Locale.ROOT) match { + case "save_as_table" => + proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE + case "insert_into" => + proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO + case _ => + throw new IllegalArgumentException( + "Cannot convert from TableSaveMethod to WriteOperation.SaveTable.TableSaveMethod: " + + s"${method}") + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/CommandPlugin.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/CommandPlugin.scala new file mode 100644 index 0000000000000..839a774062f07 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/CommandPlugin.scala @@ -0,0 +1,34 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.plugin + +import com.google.protobuf + +import org.apache.spark.sql.connect.planner.SparkConnectPlanner + +/** + * Behavior trait for supporting extension mechanisms for the Spark Connect planner. + * + * Classes implementing the trait must be trivially constructable and should not rely on internal + * state. Every registered extension will be passed the Any instance. If the plugin supports + * handling this type it is responsible of constructing the logical expression from this object + * and if necessary traverse it's children. 
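+ * A plugin that handles the command returns Some(()); a plugin that does not recognize the message
+ * type returns None so that the next registered plugin can be tried.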
+ */ +trait CommandPlugin { + def process(command: protobuf.Any, planner: SparkConnectPlanner): Option[Unit] +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/ExpressionPlugin.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/ExpressionPlugin.scala new file mode 100644 index 0000000000000..7847312265673 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/ExpressionPlugin.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.plugin + +import com.google.protobuf + +import org.apache.spark.sql.catalyst.expressions.Expression +import org.apache.spark.sql.connect.planner.SparkConnectPlanner + +/** + * Behavior trait for supporting extension mechanisms for the Spark Connect planner. + * + * Classes implementing the trait must be trivially constructable and should not rely on internal + * state. Every registered extension will be passed the Any instance. If the plugin supports + * handling this type it is responsible of constructing the logical expression from this object + * and if necessary traverse it's children. + */ +trait ExpressionPlugin { + def transform(relation: protobuf.Any, planner: SparkConnectPlanner): Option[Expression] +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/RelationPlugin.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/RelationPlugin.scala new file mode 100644 index 0000000000000..b583c6456d2fa --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/RelationPlugin.scala @@ -0,0 +1,35 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.plugin + +import com.google.protobuf + +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connect.planner.SparkConnectPlanner + +/** + * Behavior trait for supporting extension mechanisms for the Spark Connect planner. + * + * Classes implementing the trait must be trivially constructable and should not rely on internal + * state. Every registered extension will be passed the Any instance. If the plugin supports + * handling this type it is responsible of constructing the logical catalyst plan from this object + * and if necessary traverse it's children. + */ +trait RelationPlugin { + def transform(relation: protobuf.Any, planner: SparkConnectPlanner): Option[LogicalPlan] +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistry.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistry.scala new file mode 100644 index 0000000000000..d6f5f01a5e07e --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistry.scala @@ -0,0 +1,179 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.plugin + +import java.lang.reflect.InvocationTargetException + +import org.apache.spark.{SparkEnv, SparkException} +import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.util.Utils + +/** + * This object provides a global list of configured relation, expression and command plugins for + * Spark Connect. The plugins are used to handle custom message types. + */ +object SparkConnectPluginRegistry { + + // Contains the list of configured interceptors. + private lazy val relationPluginChain: Seq[relationPluginBuilder] = Seq( + // Adding a new plugin at compile time works like the example below: + // relation[DummyRelationPlugin](classOf[DummyRelationPlugin]) + ) + + private lazy val expressionPluginChain: Seq[expressionPluginBuilder] = Seq( + // Adding a new plugin at compile time works like the example below: + // expression[DummyExpressionPlugin](classOf[DummyExpressionPlugin]) + ) + + private lazy val commandPluginChain: Seq[commandPluginBuilder] = Seq( + // Adding a new plugin at compile time works like the example below: + // expression[DummyExpressionPlugin](classOf[DummyExpressionPlugin]) + ) + + private var initialized = false + private var relationRegistryCache: Seq[RelationPlugin] = Seq.empty + private var expressionRegistryCache: Seq[ExpressionPlugin] = Seq.empty + private var commandRegistryCache: Seq[CommandPlugin] = Seq.empty + + // Type used to identify the closure responsible to instantiate a ServerInterceptor. 
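// (In this registry the closures instantiate the configured RelationPlugin, ExpressionPlugin and
// CommandPlugin implementations, not a ServerInterceptor.)
// Illustrative sketch, not part of this patch: a relation plugin loaded through
// Connect.CONNECT_EXTENSIONS_RELATION_CLASSES only needs a public no-argument constructor.
// `MyCustomRelation` and its `getChild` relation field are hypothetical, extension-defined protobuf members.
//
//   class MyRelationPlugin extends RelationPlugin {
//     override def transform(raw: protobuf.Any, planner: SparkConnectPlanner): Option[LogicalPlan] = {
//       if (!raw.is(classOf[MyCustomRelation])) {
//         None // not our message type; let the next registered plugin try
//       } else {
//         val rel = raw.unpack(classOf[MyCustomRelation])
//         // Build the catalyst plan, delegating the embedded child relation back to the planner.
//         Some(planner.transformRelation(rel.getChild))
//       }
//     }
//   }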
+ type relationPluginBuilder = () => RelationPlugin + type expressionPluginBuilder = () => ExpressionPlugin + type commandPluginBuilder = () => CommandPlugin + + def relationRegistry: Seq[RelationPlugin] = withInitialize { + relationRegistryCache + } + def expressionRegistry: Seq[ExpressionPlugin] = withInitialize { + expressionRegistryCache + } + def commandRegistry: Seq[CommandPlugin] = withInitialize { + commandRegistryCache + } + + private def withInitialize[T](f: => Seq[T]): Seq[T] = { + synchronized { + if (!initialized) { + relationRegistryCache = loadRelationPlugins() + expressionRegistryCache = loadExpressionPlugins() + commandRegistryCache = loadCommandPlugins() + initialized = true + } + } + f + } + + /** + * Only visible for testing. Should not be called from any other code path. + */ + def reset(): Unit = { + synchronized { + initialized = false + } + } + + /** + * Only visible for testing + */ + private[connect] def loadRelationPlugins(): Seq[RelationPlugin] = { + relationPluginChain.map(x => x()) ++ createConfiguredPlugins[RelationPlugin]( + SparkEnv.get.conf.get(Connect.CONNECT_EXTENSIONS_RELATION_CLASSES)) + } + + /** + * Only visible for testing + */ + private[connect] def loadExpressionPlugins(): Seq[ExpressionPlugin] = { + expressionPluginChain.map(x => x()) ++ createConfiguredPlugins( + SparkEnv.get.conf.get(Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES)) + } + + private[connect] def loadCommandPlugins(): Seq[CommandPlugin] = { + commandPluginChain.map(x => x()) ++ createConfiguredPlugins( + SparkEnv.get.conf.get(Connect.CONNECT_EXTENSIONS_COMMAND_CLASSES)) + } + + /** + * Exposed for testing only. + */ + def createConfiguredPlugins[T](values: Seq[String]): Seq[T] = { + // Check all values from the Spark conf. + if (values.nonEmpty) { + values + .map(_.trim) + .filter(_.nonEmpty) + .map(Utils.classForName[T](_)) + .map(createInstance(_)) + } else { + Seq.empty + } + } + + /** + * Creates a new instance of T using the default constructor. + * @param cls + * @tparam T + * @return + */ + private def createInstance[B, T <: B](cls: Class[T]): B = { + val ctorOpt = cls.getConstructors.find(_.getParameterCount == 0) + if (ctorOpt.isEmpty) { + throw new SparkException( + errorClass = "CONNECT.PLUGIN_CTOR_MISSING", + messageParameters = Map("cls" -> cls.getName), + cause = null) + } + try { + ctorOpt.get.newInstance().asInstanceOf[T] + } catch { + case e: InvocationTargetException => + throw new SparkException( + errorClass = "CONNECT.PLUGIN_RUNTIME_ERROR", + messageParameters = Map("msg" -> e.getTargetException.getMessage), + cause = e) + case e: Exception => + throw new SparkException( + errorClass = "CONNECT.PLUGIN_RUNTIME_ERROR", + messageParameters = Map("msg" -> e.getMessage), + cause = e) + } + } + + /** + * Creates a callable expression that instantiates the configured Relation plugin. + * + * Visible for testing only. + */ + def relation[T <: RelationPlugin](cls: Class[T]): relationPluginBuilder = + () => createInstance[RelationPlugin, T](cls) + + /** + * Creates a callable expression that instantiates the configured Expression plugin. + * + * Visible for testing only. + */ + def expression[T <: ExpressionPlugin](cls: Class[T]): expressionPluginBuilder = + () => createInstance[ExpressionPlugin, T](cls) + + /** + * Creates a callable expression that instantiates the configured Command plugin. + * + * Visible for testing only. 
+ */ + def command[T <: CommandPlugin](cls: Class[T]): commandPluginBuilder = + () => createInstance[CommandPlugin, T](cls) +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala new file mode 100644 index 0000000000000..2d848d3c8400a --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/LoggingInterceptor.scala @@ -0,0 +1,75 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.service + +import scala.util.Random + +import com.google.protobuf.Message +import com.google.protobuf.util.JsonFormat +import io.grpc.ForwardingServerCall.SimpleForwardingServerCall +import io.grpc.ForwardingServerCallListener.SimpleForwardingServerCallListener +import io.grpc.Metadata +import io.grpc.ServerCall +import io.grpc.ServerCallHandler +import io.grpc.ServerInterceptor + +import org.apache.spark.internal.Logging + +/** + * A gRPC interceptor to log RPC requests and responses. It logs the protobufs as JSON. Useful for + * local development. An ID is logged for each RPC so that requests and corresponding responses + * can be exactly matched. + */ +class LoggingInterceptor extends ServerInterceptor with Logging { + + private val jsonPrinter = JsonFormat.printer().preservingProtoFieldNames() + + private def logProto[T](description: String, message: T): Unit = { + message match { + case m: Message => + logInfo(s"$description:\n${jsonPrinter.print(m)}") + case other => + logInfo(s"$description: (Unknown message type) $other") + } + } + + override def interceptCall[ReqT, RespT]( + call: ServerCall[ReqT, RespT], + headers: Metadata, + next: ServerCallHandler[ReqT, RespT]): ServerCall.Listener[ReqT] = { + + val id = Random.nextInt(Int.MaxValue) // Assign a random id for this RPC. 
+ val desc = s"${call.getMethodDescriptor.getFullMethodName} (id $id)" + + val respLoggingCall = new SimpleForwardingServerCall[ReqT, RespT](call) { + override def sendMessage(message: RespT): Unit = { + logProto(s"Responding to RPC $desc", message) + super.sendMessage(message) + } + } + + val listener = next.startCall(respLoggingCall, headers) + + new SimpleForwardingServerCallListener[ReqT](listener) { + override def onMessage(message: ReqT): Unit = { + logProto(s"Received RPC request $desc", message) + super.onMessage(message) + } + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala new file mode 100644 index 0000000000000..a03b827b60e35 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectAnalyzeHandler.scala @@ -0,0 +1,201 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.service + +import scala.collection.JavaConverters._ + +import io.grpc.stub.StreamObserver + +import org.apache.spark.connect.proto +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{Dataset, SparkSession} +import org.apache.spark.sql.connect.common.{DataTypeProtoConverter, InvalidPlanInput, StorageLevelProtoConverter} +import org.apache.spark.sql.connect.planner.SparkConnectPlanner +import org.apache.spark.sql.execution.{CodegenMode, CostMode, ExtendedMode, FormattedMode, SimpleMode} + +private[connect] class SparkConnectAnalyzeHandler( + responseObserver: StreamObserver[proto.AnalyzePlanResponse]) + extends Logging { + + def handle(request: proto.AnalyzePlanRequest): Unit = { + val session = + SparkConnectService + .getOrCreateIsolatedSession(request.getUserContext.getUserId, request.getSessionId) + .session + session.withActive { + val response = process(request, session) + responseObserver.onNext(response) + responseObserver.onCompleted() + } + } + + def process( + request: proto.AnalyzePlanRequest, + session: SparkSession): proto.AnalyzePlanResponse = { + lazy val planner = new SparkConnectPlanner(session) + val builder = proto.AnalyzePlanResponse.newBuilder() + + request.getAnalyzeCase match { + case proto.AnalyzePlanRequest.AnalyzeCase.SCHEMA => + val schema = Dataset + .ofRows(session, planner.transformRelation(request.getSchema.getPlan.getRoot)) + .schema + builder.setSchema( + proto.AnalyzePlanResponse.Schema + .newBuilder() + .setSchema(DataTypeProtoConverter.toConnectProtoType(schema)) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.EXPLAIN => + val queryExecution = Dataset + .ofRows(session, planner.transformRelation(request.getExplain.getPlan.getRoot)) + .queryExecution + val explainString = request.getExplain.getExplainMode match { + case proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_SIMPLE => + queryExecution.explainString(SimpleMode) + case proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED => + queryExecution.explainString(ExtendedMode) + case proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_CODEGEN => + queryExecution.explainString(CodegenMode) + case proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_COST => + queryExecution.explainString(CostMode) + case proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_FORMATTED => + queryExecution.explainString(FormattedMode) + case other => throw new UnsupportedOperationException(s"Unknown Explain Mode $other!") + } + builder.setExplain( + proto.AnalyzePlanResponse.Explain + .newBuilder() + .setExplainString(explainString) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.TREE_STRING => + val treeString = Dataset + .ofRows(session, planner.transformRelation(request.getTreeString.getPlan.getRoot)) + .schema + .treeString + builder.setTreeString( + proto.AnalyzePlanResponse.TreeString + .newBuilder() + .setTreeString(treeString) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.IS_LOCAL => + val isLocal = Dataset + .ofRows(session, planner.transformRelation(request.getIsLocal.getPlan.getRoot)) + .isLocal + builder.setIsLocal( + proto.AnalyzePlanResponse.IsLocal + .newBuilder() + .setIsLocal(isLocal) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.IS_STREAMING => + val isStreaming = Dataset + .ofRows(session, planner.transformRelation(request.getIsStreaming.getPlan.getRoot)) + .isStreaming + builder.setIsStreaming( + proto.AnalyzePlanResponse.IsStreaming + .newBuilder() + 
.setIsStreaming(isStreaming) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.INPUT_FILES => + val inputFiles = Dataset + .ofRows(session, planner.transformRelation(request.getInputFiles.getPlan.getRoot)) + .inputFiles + builder.setInputFiles( + proto.AnalyzePlanResponse.InputFiles + .newBuilder() + .addAllFiles(inputFiles.toSeq.asJava) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.SPARK_VERSION => + builder.setSparkVersion( + proto.AnalyzePlanResponse.SparkVersion + .newBuilder() + .setVersion(session.version) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.DDL_PARSE => + val schema = planner.parseDatatypeString(request.getDdlParse.getDdlString) + builder.setDdlParse( + proto.AnalyzePlanResponse.DDLParse + .newBuilder() + .setParsed(DataTypeProtoConverter.toConnectProtoType(schema)) + .build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.SAME_SEMANTICS => + val target = Dataset.ofRows( + session, + planner.transformRelation(request.getSameSemantics.getTargetPlan.getRoot)) + val other = Dataset.ofRows( + session, + planner.transformRelation(request.getSameSemantics.getOtherPlan.getRoot)) + builder.setSameSemantics( + proto.AnalyzePlanResponse.SameSemantics + .newBuilder() + .setResult(target.sameSemantics(other))) + + case proto.AnalyzePlanRequest.AnalyzeCase.SEMANTIC_HASH => + val semanticHash = Dataset + .ofRows(session, planner.transformRelation(request.getSemanticHash.getPlan.getRoot)) + .semanticHash() + builder.setSemanticHash( + proto.AnalyzePlanResponse.SemanticHash + .newBuilder() + .setResult(semanticHash)) + + case proto.AnalyzePlanRequest.AnalyzeCase.PERSIST => + val target = Dataset + .ofRows(session, planner.transformRelation(request.getPersist.getRelation)) + if (request.getPersist.hasStorageLevel) { + target.persist( + StorageLevelProtoConverter.toStorageLevel(request.getPersist.getStorageLevel)) + } else { + target.persist() + } + builder.setPersist(proto.AnalyzePlanResponse.Persist.newBuilder().build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.UNPERSIST => + val target = Dataset + .ofRows(session, planner.transformRelation(request.getUnpersist.getRelation)) + if (request.getUnpersist.hasBlocking) { + target.unpersist(request.getUnpersist.getBlocking) + } else { + target.unpersist() + } + builder.setUnpersist(proto.AnalyzePlanResponse.Unpersist.newBuilder().build()) + + case proto.AnalyzePlanRequest.AnalyzeCase.GET_STORAGE_LEVEL => + val target = Dataset + .ofRows(session, planner.transformRelation(request.getGetStorageLevel.getRelation)) + val storageLevel = target.storageLevel + builder.setGetStorageLevel( + proto.AnalyzePlanResponse.GetStorageLevel + .newBuilder() + .setStorageLevel(StorageLevelProtoConverter.toConnectProtoType(storageLevel)) + .build()) + + case other => throw InvalidPlanInput(s"Unknown Analyze Method $other!") + } + + builder.setSessionId(request.getSessionId) + builder.build() + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala new file mode 100644 index 0000000000000..38fd88297f354 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectConfigHandler.scala @@ -0,0 +1,181 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. 
See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.service + +import scala.collection.JavaConverters._ + +import io.grpc.stub.StreamObserver + +import org.apache.spark.connect.proto +import org.apache.spark.internal.Logging +import org.apache.spark.sql.RuntimeConfig +import org.apache.spark.sql.internal.SQLConf + +class SparkConnectConfigHandler(responseObserver: StreamObserver[proto.ConfigResponse]) + extends Logging { + + def handle(request: proto.ConfigRequest): Unit = { + val session = + SparkConnectService + .getOrCreateIsolatedSession(request.getUserContext.getUserId, request.getSessionId) + .session + + val builder = request.getOperation.getOpTypeCase match { + case proto.ConfigRequest.Operation.OpTypeCase.SET => + handleSet(request.getOperation.getSet, session.conf) + case proto.ConfigRequest.Operation.OpTypeCase.GET => + handleGet(request.getOperation.getGet, session.conf) + case proto.ConfigRequest.Operation.OpTypeCase.GET_WITH_DEFAULT => + handleGetWithDefault(request.getOperation.getGetWithDefault, session.conf) + case proto.ConfigRequest.Operation.OpTypeCase.GET_OPTION => + handleGetOption(request.getOperation.getGetOption, session.conf) + case proto.ConfigRequest.Operation.OpTypeCase.GET_ALL => + handleGetAll(request.getOperation.getGetAll, session.conf) + case proto.ConfigRequest.Operation.OpTypeCase.UNSET => + handleUnset(request.getOperation.getUnset, session.conf) + case proto.ConfigRequest.Operation.OpTypeCase.IS_MODIFIABLE => + handleIsModifiable(request.getOperation.getIsModifiable, session.conf) + case _ => throw new UnsupportedOperationException(s"${request.getOperation} not supported.") + } + + builder.setSessionId(request.getSessionId) + responseObserver.onNext(builder.build()) + responseObserver.onCompleted() + } + + private def handleSet( + operation: proto.ConfigRequest.Set, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + operation.getPairsList.asScala.iterator.foreach { pair => + val (key, value) = SparkConnectConfigHandler.toKeyValue(pair) + conf.set(key, value.orNull) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def handleGet( + operation: proto.ConfigRequest.Get, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + operation.getKeysList.asScala.iterator.foreach { key => + val value = conf.get(key) + builder.addPairs(SparkConnectConfigHandler.toProtoKeyValue(key, Option(value))) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def handleGetWithDefault( + operation: proto.ConfigRequest.GetWithDefault, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + operation.getPairsList.asScala.iterator.foreach { pair => + val (key, default) = 
SparkConnectConfigHandler.toKeyValue(pair) + val value = conf.get(key, default.orNull) + builder.addPairs(SparkConnectConfigHandler.toProtoKeyValue(key, Option(value))) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def handleGetOption( + operation: proto.ConfigRequest.GetOption, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + operation.getKeysList.asScala.iterator.foreach { key => + val value = conf.getOption(key) + builder.addPairs(SparkConnectConfigHandler.toProtoKeyValue(key, value)) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def handleGetAll( + operation: proto.ConfigRequest.GetAll, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + val results = if (operation.hasPrefix) { + val prefix = operation.getPrefix + conf.getAll.iterator + .filter { case (key, _) => key.startsWith(prefix) } + .map { case (key, value) => (key.substring(prefix.length), value) } + } else { + conf.getAll.iterator + } + results.foreach { case (key, value) => + builder.addPairs(SparkConnectConfigHandler.toProtoKeyValue(key, Option(value))) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def handleUnset( + operation: proto.ConfigRequest.Unset, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + operation.getKeysList.asScala.iterator.foreach { key => + conf.unset(key) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def handleIsModifiable( + operation: proto.ConfigRequest.IsModifiable, + conf: RuntimeConfig): proto.ConfigResponse.Builder = { + val builder = proto.ConfigResponse.newBuilder() + operation.getKeysList.asScala.iterator.foreach { key => + val value = conf.isModifiable(key) + builder.addPairs(SparkConnectConfigHandler.toProtoKeyValue(key, Option(value.toString))) + getWarning(key).foreach(builder.addWarnings) + } + builder + } + + private def getWarning(key: String): Option[String] = { + if (SparkConnectConfigHandler.unsupportedConfigurations.contains(key)) { + Some(s"The SQL config '$key' is NOT supported in Spark Connect") + } else { + SQLConf.deprecatedSQLConfigs.get(key).map(_.toDeprecationString) + } + } +} + +object SparkConnectConfigHandler { + + private[connect] val unsupportedConfigurations = Set("spark.sql.execution.arrow.enabled") + + def toKeyValue(pair: proto.KeyValue): (String, Option[String]) = { + val key = pair.getKey + val value = if (pair.hasValue) { + Some(pair.getValue) + } else { + None + } + (key, value) + } + + def toProtoKeyValue(key: String, value: Option[String]): proto.KeyValue = { + val builder = proto.KeyValue.newBuilder() + builder.setKey(key) + value.foreach(builder.setValue) + builder.build() + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala new file mode 100644 index 0000000000000..cddd4b976638d --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectInterceptorRegistry.scala @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.service + +import java.lang.reflect.InvocationTargetException + +import io.grpc.ServerInterceptor +import io.grpc.netty.NettyServerBuilder + +import org.apache.spark.{SparkEnv, SparkException} +import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.util.Utils + +/** + * This object provides a global list of configured interceptors for GRPC. The interceptors are + * added to the GRPC server in order of their position in the list. Once the statically compiled + * interceptors are added, dynamically configured interceptors are added. + */ +object SparkConnectInterceptorRegistry { + + // Contains the list of configured interceptors. + private lazy val interceptorChain: Seq[InterceptorBuilder] = Seq( + // Adding a new interceptor at compile time works like the eaxmple below with the dummy + // interceptor: + // interceptor[DummyInterceptor](classOf[DummyInterceptor]) + ) + + /** + * Given a NettyServerBuilder instance, will chain all interceptors to it in reverse order. + * @param sb + */ + def chainInterceptors(sb: NettyServerBuilder): Unit = { + interceptorChain.foreach(i => sb.intercept(i())) + createConfiguredInterceptors().foreach(sb.intercept(_)) + } + + // Type used to identify the closure responsible to instantiate a ServerInterceptor. + type InterceptorBuilder = () => ServerInterceptor + + /** + * Exposed for testing only. + */ + def createConfiguredInterceptors(): Seq[ServerInterceptor] = { + // Check all values from the Spark conf. + val classes = SparkEnv.get.conf.get(Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES) + if (classes.nonEmpty) { + classes.get + .split(",") + .map(_.trim) + .filter(_.nonEmpty) + .map(Utils.classForName[ServerInterceptor](_)) + .map(createInstance(_)) + } else { + Seq.empty + } + } + + /** + * Creates a new instance of T using the default constructor. + * @param cls + * @tparam T + * @return + */ + private def createInstance[T <: ServerInterceptor](cls: Class[T]): ServerInterceptor = { + val ctorOpt = cls.getConstructors.find(_.getParameterCount == 0) + if (ctorOpt.isEmpty) { + throw new SparkException( + errorClass = "CONNECT.INTERCEPTOR_CTOR_MISSING", + messageParameters = Map("cls" -> cls.getName), + cause = null) + } + try { + ctorOpt.get.newInstance().asInstanceOf[T] + } catch { + case e: InvocationTargetException => + throw new SparkException( + errorClass = "CONNECT.INTERCEPTOR_RUNTIME_ERROR", + messageParameters = Map("msg" -> e.getTargetException.getMessage), + cause = e) + case e: Exception => + throw new SparkException( + errorClass = "CONNECT.INTERCEPTOR_RUNTIME_ERROR", + messageParameters = Map("msg" -> e.getMessage), + cause = e) + } + } + + /** + * Creates a callable expression that instantiates the configured GPRC interceptor + * implementation. 
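+ * This helper is meant for interceptors that are compiled into the server (see interceptorChain
+ * above); dynamically configured interceptors are instead created from
+ * Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES by createConfiguredInterceptors.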
+ */ + private def interceptor[T <: ServerInterceptor](cls: Class[T]): InterceptorBuilder = + () => createInstance(cls) +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala new file mode 100644 index 0000000000000..df28df59fa2ac --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectServer.scala @@ -0,0 +1,45 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.service + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession + +/** + * The Spark Connect server + */ +object SparkConnectServer extends Logging { + def main(args: Array[String]): Unit = { + // Set the active Spark Session, and starts SparkEnv instance (via Spark Context) + logInfo("Starting Spark session.") + val session = SparkSession.builder.getOrCreate() + try { + try { + SparkConnectService.start() + logInfo("Spark Connect server started.") + } catch { + case e: Exception => + logError("Error starting Spark Connect server", e) + System.exit(-1) + } + SparkConnectService.server.awaitTermination() + } finally { + session.stop() + } + } +} diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala new file mode 100755 index 0000000000000..726809ecb19a4 --- /dev/null +++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectService.scala @@ -0,0 +1,310 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.service + +import java.util.concurrent.TimeUnit + +import scala.annotation.tailrec +import scala.collection.mutable.ArrayBuffer +import scala.util.control.NonFatal + +import com.google.common.base.Ticker +import com.google.common.cache.CacheBuilder +import com.google.protobuf.{Any => ProtoAny} +import com.google.rpc.{Code => RPCCode, ErrorInfo, Status => RPCStatus} +import io.grpc.{Server, Status} +import io.grpc.netty.NettyServerBuilder +import io.grpc.protobuf.StatusProto +import io.grpc.protobuf.services.ProtoReflectionService +import io.grpc.stub.StreamObserver +import org.apache.commons.lang3.StringUtils +import org.json4s.JsonDSL._ +import org.json4s.jackson.JsonMethods.{compact, render} + +import org.apache.spark.{SparkEnv, SparkException, SparkThrowable} +import org.apache.spark.api.python.PythonException +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.{AddArtifactsRequest, AddArtifactsResponse} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.connect.config.Connect.{CONNECT_GRPC_BINDING_PORT, CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE} + +/** + * The SparkConnectService implementation. + * + * This class implements the service stub from the generated code of GRPC. + * + * @param debug + * delegates debug behavior to the handlers. + */ +class SparkConnectService(debug: Boolean) + extends proto.SparkConnectServiceGrpc.SparkConnectServiceImplBase + with Logging { + + private def allClasses(cl: Class[_]): Seq[Class[_]] = { + val classes = ArrayBuffer.empty[Class[_]] + if (cl != null && !cl.equals(classOf[java.lang.Object])) { + classes.append(cl) // Includes itself. + } + + @tailrec + def appendSuperClasses(clazz: Class[_]): Unit = { + if (clazz == null || clazz.equals(classOf[java.lang.Object])) return + classes.append(clazz.getSuperclass) + appendSuperClasses(clazz.getSuperclass) + } + + appendSuperClasses(cl) + classes.toSeq + } + + private def buildStatusFromThrowable(st: Throwable): RPCStatus = { + val message = StringUtils.abbreviate(st.getMessage, 2048) + RPCStatus + .newBuilder() + .setCode(RPCCode.INTERNAL_VALUE) + .addDetails( + ProtoAny.pack( + ErrorInfo + .newBuilder() + .setReason(st.getClass.getName) + .setDomain("org.apache.spark") + .putMetadata("classes", compact(render(allClasses(st.getClass).map(_.getName)))) + .build())) + .setMessage(if (message != null) message else "") + .build() + } + + private def isPythonExecutionException(se: SparkException): Boolean = { + // See also pyspark.errors.exceptions.captured.convert_exception in PySpark. + se.getCause != null && se.getCause + .isInstanceOf[PythonException] && se.getCause.getStackTrace + .exists(_.toString.contains("org.apache.spark.sql.execution.python")) + } + + /** + * Common exception handling function for the Analysis and Execution methods. Closes the stream + * after the error has been sent. + * + * @param opType + * String value indicating the operation type (analysis, execution) + * @param observer + * The GRPC response observer. 
+ * @tparam V + * @return + */ + private def handleError[V]( + opType: String, + observer: StreamObserver[V]): PartialFunction[Throwable, Unit] = { + case se: SparkException if isPythonExecutionException(se) => + logError(s"Error during: $opType", se) + observer.onError( + StatusProto.toStatusRuntimeException(buildStatusFromThrowable(se.getCause))) + + case e: Throwable if e.isInstanceOf[SparkThrowable] || NonFatal.apply(e) => + logError(s"Error during: $opType", e) + observer.onError(StatusProto.toStatusRuntimeException(buildStatusFromThrowable(e))) + + case e: Throwable => + logError(s"Error during: $opType", e) + observer.onError( + Status.UNKNOWN + .withCause(e) + .withDescription(StringUtils.abbreviate(e.getMessage, 2048)) + .asRuntimeException()) + } + + /** + * This is the main entry method for Spark Connect and all calls to execute a plan. + * + * The plan execution is delegated to the [[SparkConnectStreamHandler]]. All error handling + * should be directly implemented in the deferred implementation. But this method catches + * generic errors. + * + * @param request + * @param responseObserver + */ + override def executePlan( + request: proto.ExecutePlanRequest, + responseObserver: StreamObserver[proto.ExecutePlanResponse]): Unit = { + try { + new SparkConnectStreamHandler(responseObserver).handle(request) + } catch handleError("execute", observer = responseObserver) + } + + /** + * Analyze a plan to provide metadata and debugging information. + * + * This method is called to generate the explain plan for a SparkConnect plan. In its simplest + * implementation, the plan that is generated by the [[SparkConnectPlanner]] is used to build a + * [[Dataset]] and derive the explain string from the query execution details. + * + * Errors during planning are returned via the [[StreamObserver]] interface. + * + * @param request + * @param responseObserver + */ + override def analyzePlan( + request: proto.AnalyzePlanRequest, + responseObserver: StreamObserver[proto.AnalyzePlanResponse]): Unit = { + try { + new SparkConnectAnalyzeHandler(responseObserver).handle(request) + } catch handleError("analyze", observer = responseObserver) + } + + /** + * This is the main entry method for Spark Connect and all calls to update or fetch + * configuration.. + * + * @param request + * @param responseObserver + */ + override def config( + request: proto.ConfigRequest, + responseObserver: StreamObserver[proto.ConfigResponse]): Unit = { + try { + new SparkConnectConfigHandler(responseObserver).handle(request) + } catch handleError("config", observer = responseObserver) + } + + /** + * This is the main entry method for all calls to add/transfer artifacts. + * + * @param responseObserver + * @return + */ + override def addArtifacts(responseObserver: StreamObserver[AddArtifactsResponse]) + : StreamObserver[AddArtifactsRequest] = { + // TODO: Handle artifact files + // No-Op StreamObserver + new StreamObserver[AddArtifactsRequest] { + override def onNext(v: AddArtifactsRequest): Unit = {} + + override def onError(throwable: Throwable): Unit = responseObserver.onError(throwable) + + override def onCompleted(): Unit = { + responseObserver.onNext(proto.AddArtifactsResponse.newBuilder().build()) + responseObserver.onCompleted() + } + } + } +} + +/** + * Object used for referring to SparkSessions in the SessionCache. + * + * @param userId + * @param session + */ +case class SessionHolder(userId: String, sessionId: String, session: SparkSession) + +/** + * Static instance of the SparkConnectService. 
+ *
+ * Used to start the overall SparkConnect service and to provide the global state needed to
+ * manage the SparkSessions of the different users connecting to the cluster.
+ */
+object SparkConnectService {
+
+  private val CACHE_SIZE = 100
+
+  private val CACHE_TIMEOUT_SECONDS = 3600
+
+  // Type alias for the SessionCacheKey. Right now this is a tuple of strings, but the alias
+  // allows us to switch to a different or more complex type easily.
+  private type SessionCacheKey = (String, String)
+
+  private[connect] var server: Server = _
+
+  // For testing purposes, it's package level private.
+  private[connect] def localPort: Int = {
+    assert(server != null)
+    // Return the actual local port being used. This can be different from the configured port
+    // when the server binds to port 0, for example.
+    server.getPort
+  }
+
+  private val userSessionMapping =
+    cacheBuilder(CACHE_SIZE, CACHE_TIMEOUT_SECONDS).build[SessionCacheKey, SessionHolder]()
+
+  // Simple builder for creating the cache of Sessions.
+  private def cacheBuilder(cacheSize: Int, timeoutSeconds: Int): CacheBuilder[Object, Object] = {
+    var cacheBuilder = CacheBuilder.newBuilder().ticker(Ticker.systemTicker())
+    if (cacheSize >= 0) {
+      cacheBuilder = cacheBuilder.maximumSize(cacheSize)
+    }
+    if (timeoutSeconds >= 0) {
+      cacheBuilder.expireAfterAccess(timeoutSeconds, TimeUnit.SECONDS)
+    }
+    cacheBuilder
+  }
+
+  /**
+   * Based on the `key`, find or create a new SparkSession.
+   */
+  def getOrCreateIsolatedSession(userId: String, sessionId: String): SessionHolder = {
+    userSessionMapping.get(
+      (userId, sessionId),
+      () => {
+        SessionHolder(userId, sessionId, newIsolatedSession())
+      })
+  }
+
+  private def newIsolatedSession(): SparkSession = {
+    SparkSession.active.newSession()
+  }
+
+  /**
+   * Starts the GRPC Service.
+   */
+  private def startGRPCService(): Unit = {
+    val debugMode = SparkEnv.get.conf.getBoolean("spark.connect.grpc.debug.enabled", true)
+    val port = SparkEnv.get.conf.get(CONNECT_GRPC_BINDING_PORT)
+    val sb = NettyServerBuilder
+      .forPort(port)
+      .maxInboundMessageSize(SparkEnv.get.conf.get(CONNECT_GRPC_MAX_INBOUND_MESSAGE_SIZE).toInt)
+      .addService(new SparkConnectService(debugMode))
+
+    // Add all registered interceptors to the server builder.
+    SparkConnectInterceptorRegistry.chainInterceptors(sb)
+
+    // If debug mode is configured, load the ProtoReflection service so that tools like
+    // grpcurl can introspect the API for debugging.
+    if (debugMode) {
+      sb.addService(ProtoReflectionService.newInstance())
+    }
+    server = sb.build
+    server.start()
+  }
+
+  // Starts the service
+  def start(): Unit = {
+    startGRPCService()
+  }
+
+  def stop(timeout: Option[Long] = None, unit: Option[TimeUnit] = None): Unit = {
+    if (server != null) {
+      if (timeout.isDefined && unit.isDefined) {
+        server.shutdown()
+        server.awaitTermination(timeout.get, unit.get)
+      } else {
+        server.shutdownNow()
+      }
+    }
+  }
+}
diff --git a/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala
new file mode 100644
index 0000000000000..760ff8a64b4f7
--- /dev/null
+++ b/connector/connect/server/src/main/scala/org/apache/spark/sql/connect/service/SparkConnectStreamHandler.scala
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.service + +import scala.collection.JavaConverters._ + +import com.google.protobuf.ByteString +import io.grpc.stub.StreamObserver + +import org.apache.spark.SparkEnv +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.{ExecutePlanRequest, ExecutePlanResponse} +import org.apache.spark.internal.Logging +import org.apache.spark.sql.{DataFrame, Dataset, SparkSession} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto +import org.apache.spark.sql.connect.config.Connect.CONNECT_GRPC_ARROW_MAX_BATCH_SIZE +import org.apache.spark.sql.connect.planner.SparkConnectPlanner +import org.apache.spark.sql.connect.service.SparkConnectStreamHandler.processAsArrowBatches +import org.apache.spark.sql.execution.{SparkPlan, SQLExecution} +import org.apache.spark.sql.execution.adaptive.{AdaptiveSparkPlanExec, AdaptiveSparkPlanHelper, QueryStageExec} +import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.types.StructType +import org.apache.spark.util.ThreadUtils + +class SparkConnectStreamHandler(responseObserver: StreamObserver[ExecutePlanResponse]) + extends Logging { + + def handle(v: ExecutePlanRequest): Unit = { + val session = + SparkConnectService + .getOrCreateIsolatedSession(v.getUserContext.getUserId, v.getSessionId) + .session + session.withActive { + v.getPlan.getOpTypeCase match { + case proto.Plan.OpTypeCase.COMMAND => handleCommand(session, v) + case proto.Plan.OpTypeCase.ROOT => handlePlan(session, v) + case _ => + throw new UnsupportedOperationException(s"${v.getPlan.getOpTypeCase} not supported.") + } + } + } + + private def handlePlan(session: SparkSession, request: ExecutePlanRequest): Unit = { + // Extract the plan from the request and convert it to a logical plan + val planner = new SparkConnectPlanner(session) + val dataframe = Dataset.ofRows(session, planner.transformRelation(request.getPlan.getRoot)) + responseObserver.onNext( + SparkConnectStreamHandler.sendSchemaToResponse(request.getSessionId, dataframe.schema)) + processAsArrowBatches(request.getSessionId, dataframe, responseObserver) + responseObserver.onNext( + SparkConnectStreamHandler.createMetricsResponse(request.getSessionId, dataframe)) + if (dataframe.queryExecution.observedMetrics.nonEmpty) { + responseObserver.onNext( + SparkConnectStreamHandler.sendObservedMetricsToResponse(request.getSessionId, dataframe)) + } + responseObserver.onCompleted() + } + + private def handleCommand(session: SparkSession, request: ExecutePlanRequest): Unit = { + val command = request.getPlan.getCommand + val planner = new SparkConnectPlanner(session) + planner.process(command, request.getSessionId, responseObserver) + responseObserver.onCompleted() + } +} + +object SparkConnectStreamHandler { + type Batch = 
(Array[Byte], Long) + + def rowToArrowConverter( + schema: StructType, + maxRecordsPerBatch: Int, + maxBatchSize: Long, + timeZoneId: String): Iterator[InternalRow] => Iterator[Batch] = { rows => + val batches = ArrowConverters.toBatchWithSchemaIterator( + rows, + schema, + maxRecordsPerBatch, + maxBatchSize, + timeZoneId) + batches.map(b => b -> batches.rowCountInLastBatch) + } + + def processAsArrowBatches( + sessionId: String, + dataframe: DataFrame, + responseObserver: StreamObserver[ExecutePlanResponse]): Unit = { + val spark = dataframe.sparkSession + val schema = dataframe.schema + val maxRecordsPerBatch = spark.sessionState.conf.arrowMaxRecordsPerBatch + val timeZoneId = spark.sessionState.conf.sessionLocalTimeZone + // Conservatively sets it 70% because the size is not accurate but estimated. + val maxBatchSize = (SparkEnv.get.conf.get(CONNECT_GRPC_ARROW_MAX_BATCH_SIZE) * 0.7).toLong + + SQLExecution.withNewExecutionId(dataframe.queryExecution, Some("collectArrow")) { + val rows = dataframe.queryExecution.executedPlan.execute() + val numPartitions = rows.getNumPartitions + var numSent = 0 + + if (numPartitions > 0) { + type Batch = (Array[Byte], Long) + + val batches = rows.mapPartitionsInternal( + SparkConnectStreamHandler + .rowToArrowConverter(schema, maxRecordsPerBatch, maxBatchSize, timeZoneId)) + + val signal = new Object + val partitions = new Array[Array[Batch]](numPartitions) + var error: Option[Throwable] = None + + // This callback is executed by the DAGScheduler thread. + // After fetching a partition, it inserts the partition into the Map, and then + // wakes up the main thread. + val resultHandler = (partitionId: Int, partition: Array[Batch]) => { + signal.synchronized { + partitions(partitionId) = partition + signal.notify() + } + () + } + + val future = spark.sparkContext.submitJob( + rdd = batches, + processPartition = (iter: Iterator[Batch]) => iter.toArray, + partitions = Seq.range(0, numPartitions), + resultHandler = resultHandler, + resultFunc = () => ()) + + // Collect errors and propagate them to the main thread. + future.onComplete { result => + result.failed.foreach { throwable => + signal.synchronized { + error = Some(throwable) + signal.notify() + } + } + }(ThreadUtils.sameThread) + + // The main thread will wait until 0-th partition is available, + // then send it to client and wait for the next partition. + // Different from the implementation of [[Dataset#collectAsArrowToPython]], it sends + // the arrow batches in main thread to avoid DAGScheduler thread been blocked for + // tasks not related to scheduling. This is particularly important if there are + // multiple users or clients running code at the same time. + var currentPartitionId = 0 + while (currentPartitionId < numPartitions) { + val partition = signal.synchronized { + var part = partitions(currentPartitionId) + while (part == null && error.isEmpty) { + signal.wait() + part = partitions(currentPartitionId) + } + partitions(currentPartitionId) = null + + error.foreach { case other => + throw other + } + part + } + + partition.foreach { case (bytes, count) => + val response = proto.ExecutePlanResponse.newBuilder().setSessionId(sessionId) + val batch = proto.ExecutePlanResponse.ArrowBatch + .newBuilder() + .setRowCount(count) + .setData(ByteString.copyFrom(bytes)) + .build() + response.setArrowBatch(batch) + responseObserver.onNext(response.build()) + numSent += 1 + } + + currentPartitionId += 1 + } + } + + // Make sure at least 1 batch will be sent. 
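+      // (For example, a query that returns zero rows never enters the loop above, so an empty
+      // Arrow batch is emitted to guarantee the client receives at least one ArrowBatch message.)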
+ if (numSent == 0) { + val bytes = ArrowConverters.createEmptyArrowBatch(schema, timeZoneId) + val response = proto.ExecutePlanResponse.newBuilder().setSessionId(sessionId) + val batch = proto.ExecutePlanResponse.ArrowBatch + .newBuilder() + .setRowCount(0L) + .setData(ByteString.copyFrom(bytes)) + .build() + response.setArrowBatch(batch) + responseObserver.onNext(response.build()) + } + } + } + + def sendSchemaToResponse(sessionId: String, schema: StructType): ExecutePlanResponse = { + // Send the Spark data type + ExecutePlanResponse + .newBuilder() + .setSessionId(sessionId) + .setSchema(DataTypeProtoConverter.toConnectProtoType(schema)) + .build() + } + + def createMetricsResponse(sessionId: String, rows: DataFrame): ExecutePlanResponse = { + // Send a last batch with the metrics + ExecutePlanResponse + .newBuilder() + .setSessionId(sessionId) + .setMetrics(MetricGenerator.buildMetrics(rows.queryExecution.executedPlan)) + .build() + } + + def sendObservedMetricsToResponse( + sessionId: String, + dataframe: DataFrame): ExecutePlanResponse = { + val observedMetrics = dataframe.queryExecution.observedMetrics.map { case (name, row) => + val cols = (0 until row.length).map(i => toLiteralProto(row(i))) + ExecutePlanResponse.ObservedMetrics + .newBuilder() + .setName(name) + .addAllValues(cols.asJava) + .build() + } + // Prepare a response with the observed metrics. + ExecutePlanResponse + .newBuilder() + .setSessionId(sessionId) + .addAllObservedMetrics(observedMetrics.asJava) + .build() + } +} + +object MetricGenerator extends AdaptiveSparkPlanHelper { + def buildMetrics(p: SparkPlan): ExecutePlanResponse.Metrics = { + val b = ExecutePlanResponse.Metrics.newBuilder + b.addAllMetrics(transformPlan(p, p.id).asJava) + b.build() + } + + private def transformChildren(p: SparkPlan): Seq[ExecutePlanResponse.Metrics.MetricObject] = { + allChildren(p).flatMap(c => transformPlan(c, p.id)) + } + + private def allChildren(p: SparkPlan): Seq[SparkPlan] = p match { + case a: AdaptiveSparkPlanExec => Seq(a.executedPlan) + case s: QueryStageExec => Seq(s.plan) + case _ => p.children + } + + private def transformPlan( + p: SparkPlan, + parentId: Int): Seq[ExecutePlanResponse.Metrics.MetricObject] = { + val mv = p.metrics.map(m => + m._1 -> ExecutePlanResponse.Metrics.MetricValue.newBuilder + .setName(m._2.name.getOrElse("")) + .setValue(m._2.value) + .setMetricType(m._2.metricType) + .build()) + val mo = ExecutePlanResponse.Metrics.MetricObject + .newBuilder() + .setName(p.nodeName) + .setPlanId(p.id) + .putAllExecutionMetrics(mv.asJava) + .build() + Seq(mo) ++ transformChildren(p) + } +} diff --git a/connector/connect/server/src/test/resources/log4j2.properties b/connector/connect/server/src/test/resources/log4j2.properties new file mode 100644 index 0000000000000..ab02104c69697 --- /dev/null +++ b/connector/connect/server/src/test/resources/log4j2.properties @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file target/unit-tests.log +rootLogger.level = info +rootLogger.appenderRef.file.ref = ${sys:test.appender:-File} + +appender.file.type = File +appender.file.name = File +appender.file.fileName = target/unit-tests.log +appender.file.layout.type = PatternLayout +appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n%ex + +# Tests that launch java subprocesses can set the "test.appender" system property to +# "console" to avoid having the child process's logs overwrite the unit test's +# log file. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %t: %m%n%ex + +# Ignore messages below warning level from Jetty, because it's a bit verbose +logger.jetty.name = org.sparkproject.jetty +logger.jetty.level = warn diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala new file mode 100644 index 0000000000000..e20a6159cc8a5 --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/ProtoToParsedPlanTestSuite.scala @@ -0,0 +1,217 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.connect + +import java.nio.charset.StandardCharsets +import java.nio.file.{Files, FileVisitResult, Path, SimpleFileVisitor} +import java.nio.file.attribute.BasicFileAttributes +import java.sql.DriverManager +import java.util + +import scala.util.{Failure, Success, Try} + +import org.apache.spark.{SparkConf, SparkFunSuite} +import org.apache.spark.connect.proto +import org.apache.spark.sql.catalyst.{catalog, QueryPlanningTracker} +import org.apache.spark.sql.catalyst.analysis.{caseSensitiveResolution, Analyzer, FunctionRegistry, Resolver, TableFunctionRegistry} +import org.apache.spark.sql.catalyst.catalog.SessionCatalog +import org.apache.spark.sql.catalyst.optimizer.ReplaceExpressions +import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.sql.connect.planner.SparkConnectPlanner +import org.apache.spark.sql.connector.catalog.{CatalogManager, Identifier, InMemoryCatalog} +import org.apache.spark.sql.connector.expressions.Transform +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.StructType +import org.apache.spark.sql.util.CaseInsensitiveStringMap +import org.apache.spark.util.Utils + +// scalastyle:off +/** + * This test uses a corpus of queries ([[proto.Relation]] relations) and transforms each query + * into its catalyst representation. The resulting catalyst plan is compared with a golden file. + * + * The objective of this test is to make sure the JVM client and potentially others produce valid + * plans, and that these plans are transformed into their expected shape. Additionally this test + * should capture breaking proto changes to a degree. + * + * The corpus of queries is generated by the `PlanGenerationTestSuite` in the connect/client/jvm + * module. + * + * If you need to re-generate the golden files, you need to set the SPARK_GENERATE_GOLDEN_FILES=1 + * environment variable before running this test, e.g.: + * {{{ + * SPARK_GENERATE_GOLDEN_FILES=1 build/sbt "connect/testOnly org.apache.spark.sql.connect.ProtoToParsedPlanTestSuite" + * }}} + */ +// scalastyle:on +class ProtoToParsedPlanTestSuite extends SparkFunSuite with SharedSparkSession { + val url = "jdbc:h2:mem:testdb0" + var conn: java.sql.Connection = null + + override def beforeAll(): Unit = { + super.beforeAll() + + Utils.classForName("org.h2.Driver") + // Extra properties that will be specified for our database. We need these to test + // usage of parameters from OPTIONS clause in queries. 
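+    // (The user/password values below are arbitrary test credentials; `jdbc:h2:mem:testdb0` is
+    // an in-memory database that only lives for the duration of this suite.)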
+ val properties = new util.Properties() + properties.setProperty("user", "testUser") + properties.setProperty("password", "testPass") + + conn = DriverManager.getConnection(url, properties) + conn.prepareStatement("create schema test").executeUpdate() + conn + .prepareStatement( + "create table test.people (name TEXT(32) NOT NULL, theid INTEGER NOT NULL)") + .executeUpdate() + conn + .prepareStatement("create table test.timetypes (a TIME, b DATE, c TIMESTAMP(7))") + .executeUpdate() + conn + .prepareStatement( + "create table test.emp(name TEXT(32) NOT NULL," + + " theid INTEGER, \"Dept\" INTEGER)") + .executeUpdate() + conn.commit() + } + + override def afterAll(): Unit = { + conn.close() + super.afterAll() + } + + override def sparkConf: SparkConf = { + super.sparkConf + .set( + Connect.CONNECT_EXTENSIONS_RELATION_CLASSES.key, + "org.apache.spark.sql.connect.plugin.ExampleRelationPlugin") + .set( + Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES.key, + "org.apache.spark.sql.connect.plugin.ExampleExpressionPlugin") + .set(org.apache.spark.sql.internal.SQLConf.ANSI_ENABLED.key, false.toString) + } + + protected val baseResourcePath: Path = { + getWorkspaceFilePath( + "connector", + "connect", + "common", + "src", + "test", + "resources", + "query-tests").toAbsolutePath + } + + protected val inputFilePath: Path = baseResourcePath.resolve("queries") + protected val goldenFilePath: Path = baseResourcePath.resolve("explain-results") + private val emptyProps: util.Map[String, String] = util.Collections.emptyMap() + + private val analyzer = { + val inMemoryCatalog = new InMemoryCatalog + inMemoryCatalog.initialize("primary", CaseInsensitiveStringMap.empty()) + inMemoryCatalog.createNamespace(Array("tempdb"), emptyProps) + inMemoryCatalog.createTable( + Identifier.of(Array("tempdb"), "myTable"), + new StructType().add("id", "long"), + Array.empty[Transform], + emptyProps) + + val catalogManager = new CatalogManager( + inMemoryCatalog, + new SessionCatalog( + new catalog.InMemoryCatalog(), + FunctionRegistry.builtin, + TableFunctionRegistry.builtin)) + catalogManager.setCurrentCatalog("primary") + catalogManager.setCurrentNamespace(Array("tempdb")) + + new Analyzer(catalogManager) { + override def resolver: Resolver = caseSensitiveResolution + } + } + + // Create the tests. + Files.walkFileTree( + inputFilePath, + new SimpleFileVisitor[Path] { + override def visitFile(file: Path, attrs: BasicFileAttributes): FileVisitResult = { + createTest(file) + FileVisitResult.CONTINUE + } + }) + + private def createTest(file: Path): Unit = { + val relativePath = inputFilePath.relativize(file) + val fileName = relativePath.getFileName.toString + if (!fileName.endsWith(".proto.bin")) { + logError(s"Skipping $fileName") + return + } + val name = fileName.stripSuffix(".proto.bin") + test(name) { + val relation = readRelation(file) + val planner = new SparkConnectPlanner(spark) + val catalystPlan = + analyzer.executeAndCheck(planner.transformRelation(relation), new QueryPlanningTracker) + val actual = normalizeExprIds(ReplaceExpressions(catalystPlan)).treeString + val goldenFile = goldenFilePath.resolve(relativePath).getParent.resolve(name + ".explain") + Try(readGoldenFile(goldenFile)) match { + case Success(expected) if expected == actual => // Test passes. 
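+        // The generated plan differs from the golden file, but regeneration was requested, so
+        // overwrite the golden file with the new plan string.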
+ case Success(_) if regenerateGoldenFiles => + logInfo("Overwriting golden file.") + writeGoldenFile(goldenFile, actual) + case Success(expected) => + fail(s""" + |Expected and actual plans do not match: + | + |=== Expected Plan === + |$expected + | + |=== Actual Plan === + |$actual + |""".stripMargin) + case Failure(_) if regenerateGoldenFiles => + logInfo("Writing golden file.") + writeGoldenFile(goldenFile, actual) + case Failure(_) => + fail( + "No golden file found. Please re-run this test with the " + + "SPARK_GENERATE_GOLDEN_FILES=1 environment variable set") + } + } + } + + private def readRelation(path: Path): proto.Relation = { + val input = Files.newInputStream(path) + try proto.Relation.parseFrom(input) + finally { + input.close() + } + } + + private def readGoldenFile(path: Path): String = { + new String(Files.readAllBytes(path), StandardCharsets.UTF_8) + } + + private def writeGoldenFile(path: Path, value: String): Unit = { + val writer = Files.newBufferedWriter(path) + try writer.write(value) + finally { + writer.close() + } + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/messages/ConnectProtoMessagesSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/messages/ConnectProtoMessagesSuite.scala new file mode 100644 index 0000000000000..65c03a3c2e291 --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/messages/ConnectProtoMessagesSuite.scala @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.messages + +import com.google.protobuf.ByteString + +import org.apache.spark.SparkFunSuite +import org.apache.spark.connect.proto +import org.apache.spark.sql.connect.common.DataTypeProtoConverter +import org.apache.spark.sql.types.IntegerType + +class ConnectProtoMessagesSuite extends SparkFunSuite { + test("UserContext can deal with extensions") { + // Create the builder. + val builder = proto.UserContext.newBuilder().setUserId("1").setUserName("Martin") + + // Create the extension value. + val lit = proto.Expression + .newBuilder() + .setLiteral(proto.Expression.Literal.newBuilder().setInteger(32).build()) + // Pack the extension into Any. + val aval = com.google.protobuf.Any.pack(lit.build()) + // Add Any to the repeated field list. + builder.addExtensions(aval) + // Create serialized value. + val serialized = builder.build().toByteArray + + // Now, read the serialized value. 
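+    // parseFrom deserializes the UserContext together with the Any-packed extension added
+    // above; the assertions below unpack it again to recover the literal expression.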
+ val result = proto.UserContext.parseFrom(serialized) + assert(result.getUserId.equals("1")) + assert(result.getUserName.equals("Martin")) + assert(result.getExtensionsCount == 1) + + val ext = result.getExtensions(0) + assert(ext.is(classOf[proto.Expression])) + val extLit = ext.unpack(classOf[proto.Expression]) + assert(extLit.hasLiteral) + assert(extLit.getLiteral.hasInteger) + assert(extLit.getLiteral.getInteger == 32) + } + + test("CommonInlineUserDefinedFunction") { + val arguments = proto.Expression + .newBuilder() + .setUnresolvedAttribute( + proto.Expression.UnresolvedAttribute.newBuilder().setUnparsedIdentifier("id")) + .build() + + val pythonUdf = proto.PythonUDF + .newBuilder() + .setEvalType(100) + .setOutputType(DataTypeProtoConverter.toConnectProtoType(IntegerType)) + .setCommand(ByteString.copyFrom("command".getBytes())) + .setPythonVer("3.10") + .build() + + val commonInlineUserDefinedFunctionExpr = proto.Expression + .newBuilder() + .setCommonInlineUserDefinedFunction( + proto.CommonInlineUserDefinedFunction + .newBuilder() + .setFunctionName("f") + .setDeterministic(true) + .addArguments(arguments) + .setPythonUdf(pythonUdf)) + .build() + + val fun = commonInlineUserDefinedFunctionExpr.getCommonInlineUserDefinedFunction() + assert(fun.getFunctionName == "f") + assert(fun.getDeterministic == true) + assert(fun.getArgumentsCount == 1) + assert(fun.hasPythonUdf == true) + assert(pythonUdf.getPythonVer == "3.10") + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralExpressionProtoConverterSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralExpressionProtoConverterSuite.scala new file mode 100644 index 0000000000000..c3479456617ca --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/LiteralExpressionProtoConverterSuite.scala @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.planner + +import org.scalatest.funsuite.AnyFunSuite // scalastyle:ignore funsuite + +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto +import org.apache.spark.sql.connect.planner.LiteralExpressionProtoConverter.toCatalystValue + +class LiteralExpressionProtoConverterSuite extends AnyFunSuite { // scalastyle:ignore funsuite + + test("basic proto value and catalyst value conversion") { + val values = Array(null, true, 1.toByte, 1.toShort, 1, 1L, 1.1d, 1.1f, "spark") + for (v <- values) { + assertResult(v)(toCatalystValue(toLiteralProto(v))) + } + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala new file mode 100644 index 0000000000000..ec2362d5a56b3 --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectPlannerSuite.scala @@ -0,0 +1,847 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.planner + +import scala.collection.JavaConverters._ + +import com.google.protobuf.ByteString +import io.grpc.stub.StreamObserver + +import org.apache.spark.SparkFunSuite +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.ExecutePlanResponse +import org.apache.spark.connect.proto.Expression.{Alias, ExpressionString, UnresolvedStar} +import org.apache.spark.sql.{AnalysisException, Dataset, Row} +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.analysis.UnresolvedRelation +import org.apache.spark.sql.catalyst.expressions.{AttributeReference, UnsafeProjection} +import org.apache.spark.sql.catalyst.plans.logical +import org.apache.spark.sql.connect.common.InvalidPlanInput +import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto +import org.apache.spark.sql.execution.arrow.ArrowConverters +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType} +import org.apache.spark.unsafe.types.UTF8String + +/** + * Testing trait for SparkConnect tests with some helper methods to make it easier to create new + * test cases. 
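+ *
+ * For example (illustrative usage only, not part of this patch), a relation proto can be turned
+ * into a Catalyst plan with the `transform` helper defined below:
+ * {{{
+ *   val plan = transform(readRel)  // produces an UnresolvedRelation for the table named "table"
+ * }}}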
+ */ +trait SparkConnectPlanTest extends SharedSparkSession { + + class MockObserver extends StreamObserver[proto.ExecutePlanResponse] { + override def onNext(value: ExecutePlanResponse): Unit = {} + override def onError(t: Throwable): Unit = {} + override def onCompleted(): Unit = {} + } + + def transform(rel: proto.Relation): logical.LogicalPlan = { + new SparkConnectPlanner(spark).transformRelation(rel) + } + + def transform(cmd: proto.Command): Unit = { + new SparkConnectPlanner(spark).process(cmd, "clientId", new MockObserver()) + } + + def readRel: proto.Relation = + proto.Relation + .newBuilder() + .setRead( + proto.Read + .newBuilder() + .setNamedTable(proto.Read.NamedTable.newBuilder().setUnparsedIdentifier("table")) + .build()) + .build() + + /** + * Creates a local relation for testing purposes. The local relation is mapped to it's + * equivalent in Catalyst and can be easily used for planner testing. + * + * @param attrs + * the attributes of LocalRelation + * @param data + * the data of LocalRelation + * @return + */ + def createLocalRelationProto( + attrs: Seq[AttributeReference], + data: Seq[InternalRow]): proto.Relation = { + val localRelationBuilder = proto.LocalRelation.newBuilder() + + val bytes = ArrowConverters + .toBatchWithSchemaIterator( + data.iterator, + StructType.fromAttributes(attrs.map(_.toAttribute)), + Long.MaxValue, + Long.MaxValue, + null) + .next() + + localRelationBuilder.setData(ByteString.copyFrom(bytes)) + proto.Relation.newBuilder().setLocalRelation(localRelationBuilder.build()).build() + } +} + +/** + * This is a rudimentary test class for SparkConnect. The main goal of these basic tests is to + * ensure that the transformation from Proto to LogicalPlan works and that the right nodes are + * generated. + */ +class SparkConnectPlannerSuite extends SparkFunSuite with SparkConnectPlanTest { + + test("Simple Limit") { + assertThrows[IndexOutOfBoundsException] { + new SparkConnectPlanner(None.orNull) + .transformRelation( + proto.Relation.newBuilder + .setLimit(proto.Limit.newBuilder.setLimit(10)) + .build()) + } + } + + test("InvalidInputs") { + // No Relation Set + intercept[IndexOutOfBoundsException]( + new SparkConnectPlanner(None.orNull).transformRelation(proto.Relation.newBuilder().build())) + + intercept[InvalidPlanInput]( + new SparkConnectPlanner(None.orNull) + .transformRelation( + proto.Relation.newBuilder.setUnknown(proto.Unknown.newBuilder().build()).build())) + } + + test("Simple Read") { + val read = proto.Read.newBuilder().build() + // Invalid read without Table name. + intercept[InvalidPlanInput](transform(proto.Relation.newBuilder.setRead(read).build())) + val readWithTable = read.toBuilder + .setNamedTable(proto.Read.NamedTable.newBuilder.setUnparsedIdentifier("name").build()) + .build() + val res = transform(proto.Relation.newBuilder.setRead(readWithTable).build()) + assert(res !== null) + assert(res.nodeName == "UnresolvedRelation") + } + + test("Simple Table with options") { + val read = proto.Read.newBuilder().build() + // Invalid read without Table name. 
+ intercept[InvalidPlanInput](transform(proto.Relation.newBuilder.setRead(read).build())) + val readWithTable = read.toBuilder + .setNamedTable( + proto.Read.NamedTable.newBuilder + .setUnparsedIdentifier("name") + .putOptions("p1", "v1") + .build()) + .build() + val res = transform(proto.Relation.newBuilder.setRead(readWithTable).build()) + res match { + case e: UnresolvedRelation => assert(e.options.get("p1") == "v1") + case _ => assert(false, "Do not have expected options") + } + } + + test("Simple Project") { + val readWithTable = proto.Read + .newBuilder() + .setNamedTable(proto.Read.NamedTable.newBuilder.setUnparsedIdentifier("name").build()) + .build() + val project = + proto.Project + .newBuilder() + .setInput(proto.Relation.newBuilder().setRead(readWithTable).build()) + .addExpressions( + proto.Expression + .newBuilder() + .setUnresolvedStar(UnresolvedStar.newBuilder().build()) + .build()) + .build() + val res = transform(proto.Relation.newBuilder.setProject(project).build()) + assert(res !== null) + assert(res.nodeName == "Project") + } + + test("Simple Sort") { + val sort = proto.Sort.newBuilder + .addAllOrder(Seq(proto.Expression.SortOrder.newBuilder().build()).asJava) + .build() + intercept[IndexOutOfBoundsException]( + transform(proto.Relation.newBuilder().setSort(sort).build()), + "No Input set.") + + val f = proto.Expression.SortOrder + .newBuilder() + .setNullOrdering(proto.Expression.SortOrder.NullOrdering.SORT_NULLS_LAST) + .setDirection(proto.Expression.SortOrder.SortDirection.SORT_DIRECTION_DESCENDING) + .setChild( + proto.Expression.newBuilder + .setUnresolvedAttribute( + proto.Expression.UnresolvedAttribute.newBuilder.setUnparsedIdentifier("col").build()) + .build()) + .build() + + val res = transform( + proto.Relation.newBuilder + .setSort( + proto.Sort.newBuilder + .addAllOrder(Seq(f).asJava) + .setInput(readRel) + .setIsGlobal(true)) + .build()) + assert(res.nodeName == "Sort") + assert(res.asInstanceOf[logical.Sort].global) + + val res2 = transform( + proto.Relation.newBuilder + .setSort( + proto.Sort.newBuilder + .addAllOrder(Seq(f).asJava) + .setInput(readRel) + .setIsGlobal(false)) + .build()) + assert(res2.nodeName == "Sort") + assert(!res2.asInstanceOf[logical.Sort].global) + } + + test("Simple Union") { + intercept[InvalidPlanInput]( + transform(proto.Relation.newBuilder.setSetOp(proto.SetOperation.newBuilder.build()).build)) + val union = proto.Relation.newBuilder + .setSetOp( + proto.SetOperation.newBuilder.setLeftInput(readRel).setRightInput(readRel).build()) + .build() + val msg = intercept[InvalidPlanInput] { + transform(union) + } + assert(msg.getMessage.contains("Unsupported set operation")) + + val res = transform( + proto.Relation.newBuilder + .setSetOp( + proto.SetOperation.newBuilder + .setLeftInput(readRel) + .setRightInput(readRel) + .setSetOpType(proto.SetOperation.SetOpType.SET_OP_TYPE_UNION) + .setIsAll(true) + .build()) + .build()) + assert(res.nodeName == "Union") + } + + test("Union By Name") { + val union = proto.Relation.newBuilder + .setSetOp( + proto.SetOperation.newBuilder + .setLeftInput(readRel) + .setRightInput(readRel) + .setSetOpType(proto.SetOperation.SetOpType.SET_OP_TYPE_UNION) + .setByName(false) + .setAllowMissingColumns(true) + .build()) + .build() + val msg = intercept[InvalidPlanInput] { + transform(union) + } + assert( + msg.getMessage.contains( + "UnionByName `allowMissingCol` can be true only if `byName` is true.")) + } + + test("Simple Join") { + val incompleteJoin = + 
proto.Relation.newBuilder.setJoin(proto.Join.newBuilder.setLeft(readRel)).build() + intercept[AssertionError](transform(incompleteJoin)) + + // Join type JOIN_TYPE_UNSPECIFIED is not supported. + intercept[InvalidPlanInput] { + val simpleJoin = proto.Relation.newBuilder + .setJoin(proto.Join.newBuilder.setLeft(readRel).setRight(readRel)) + .build() + transform(simpleJoin) + } + + // Construct a simple Join. + val unresolvedAttribute = proto.Expression + .newBuilder() + .setUnresolvedAttribute( + proto.Expression.UnresolvedAttribute.newBuilder().setUnparsedIdentifier("left").build()) + .build() + + val joinCondition = proto.Expression.newBuilder.setUnresolvedFunction( + proto.Expression.UnresolvedFunction.newBuilder + .setFunctionName("==") + .addArguments(unresolvedAttribute) + .addArguments(unresolvedAttribute) + .build()) + + val simpleJoin = proto.Relation.newBuilder + .setJoin( + proto.Join.newBuilder + .setLeft(readRel) + .setRight(readRel) + .setJoinType(proto.Join.JoinType.JOIN_TYPE_INNER) + .setJoinCondition(joinCondition) + .build()) + .build() + + val res = transform(simpleJoin) + assert(res.nodeName == "Join") + assert(res != null) + + val e = intercept[InvalidPlanInput] { + val simpleJoin = proto.Relation.newBuilder + .setJoin( + proto.Join.newBuilder + .setLeft(readRel) + .setRight(readRel) + .addUsingColumns("test_col") + .setJoinCondition(joinCondition)) + .build() + transform(simpleJoin) + } + assert( + e.getMessage.contains( + "Using columns or join conditions cannot be set at the same time in Join")) + } + + test("Simple Projection") { + val project = proto.Project.newBuilder + .setInput(readRel) + .addExpressions( + proto.Expression.newBuilder + .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32)) + .build()) + .build() + + val res = transform(proto.Relation.newBuilder.setProject(project).build()) + assert(res.nodeName == "Project") + } + + test("Simple Aggregation") { + val unresolvedAttribute = proto.Expression + .newBuilder() + .setUnresolvedAttribute( + proto.Expression.UnresolvedAttribute.newBuilder().setUnparsedIdentifier("left").build()) + .build() + + val sum = + proto.Expression + .newBuilder() + .setUnresolvedFunction( + proto.Expression.UnresolvedFunction + .newBuilder() + .setFunctionName("sum") + .addArguments(unresolvedAttribute)) + .build() + + val agg = proto.Aggregate.newBuilder + .setInput(readRel) + .addAggregateExpressions(sum) + .addGroupingExpressions(unresolvedAttribute) + .setGroupType(proto.Aggregate.GroupType.GROUP_TYPE_GROUPBY) + .build() + + val res = transform(proto.Relation.newBuilder.setAggregate(agg).build()) + assert(res.nodeName == "Aggregate") + } + + test("Test invalid deduplicate") { + val deduplicate = proto.Deduplicate + .newBuilder() + .setInput(readRel) + .setAllColumnsAsKeys(true) + .addColumnNames("test") + + val e = intercept[InvalidPlanInput] { + transform(proto.Relation.newBuilder.setDeduplicate(deduplicate).build()) + } + assert( + e.getMessage.contains("Cannot deduplicate on both all columns and a subset of columns")) + + val deduplicate2 = proto.Deduplicate + .newBuilder() + .setInput(readRel) + val e2 = intercept[InvalidPlanInput] { + transform(proto.Relation.newBuilder.setDeduplicate(deduplicate2).build()) + } + assert(e2.getMessage.contains("either deduplicate on all columns or a subset of columns")) + } + + test("Test invalid intersect, except") { + // Except with union_by_name=true + val except = proto.SetOperation + .newBuilder() + .setLeftInput(readRel) + .setRightInput(readRel) + .setByName(true) + 
.setSetOpType(proto.SetOperation.SetOpType.SET_OP_TYPE_EXCEPT) + val e = + intercept[InvalidPlanInput](transform(proto.Relation.newBuilder.setSetOp(except).build())) + assert(e.getMessage.contains("Except does not support union_by_name")) + + // Intersect with union_by_name=true + val intersect = proto.SetOperation + .newBuilder() + .setLeftInput(readRel) + .setRightInput(readRel) + .setByName(true) + .setSetOpType(proto.SetOperation.SetOpType.SET_OP_TYPE_INTERSECT) + val e2 = intercept[InvalidPlanInput]( + transform(proto.Relation.newBuilder.setSetOp(intersect).build())) + assert(e2.getMessage.contains("Intersect does not support union_by_name")) + } + + test("transform LocalRelation") { + val rows = (0 until 10).map { i => + InternalRow(i, UTF8String.fromString(s"str-$i"), InternalRow(i)) + } + + val schema = StructType( + Seq( + StructField("int", IntegerType), + StructField("str", StringType), + StructField("struct", StructType(Seq(StructField("inner", IntegerType)))))) + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelation = createLocalRelationProto(schema.toAttributes, inputRows) + val df = Dataset.ofRows(spark, transform(localRelation)) + val array = df.collect() + assertResult(10)(array.length) + assert(schema == df.schema) + for (i <- 0 until 10) { + assert(i == array(i).getInt(0)) + assert(s"str-$i" == array(i).getString(1)) + assert(i == array(i).getStruct(2).getInt(0)) + } + } + + test("Empty ArrowBatch") { + val schema = StructType(Seq(StructField("int", IntegerType))) + val data = ArrowConverters.createEmptyArrowBatch(schema, null) + val localRelation = proto.Relation + .newBuilder() + .setLocalRelation( + proto.LocalRelation + .newBuilder() + .setData(ByteString.copyFrom(data)) + .build()) + .build() + val df = Dataset.ofRows(spark, transform(localRelation)) + assert(schema == df.schema) + assert(df.isEmpty) + } + + test("Illegal LocalRelation data") { + intercept[Exception] { + transform( + proto.Relation + .newBuilder() + .setLocalRelation( + proto.LocalRelation + .newBuilder() + .setData(ByteString.copyFrom("illegal".getBytes())) + .build()) + .build()) + } + } + + test("Test duplicated names in WithColumns") { + intercept[AnalysisException] { + transform( + proto.Relation + .newBuilder() + .setWithColumns( + proto.WithColumns + .newBuilder() + .setInput(readRel) + .addAliases(proto.Expression.Alias + .newBuilder() + .addName("test") + .setExpr(proto.Expression.newBuilder + .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32)))) + .addAliases(proto.Expression.Alias + .newBuilder() + .addName("test") + .setExpr(proto.Expression.newBuilder + .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32))))) + .build()) + } + } + + test("Test multi nameparts for column names in WithColumns") { + val e = intercept[InvalidPlanInput] { + transform( + proto.Relation + .newBuilder() + .setWithColumns( + proto.WithColumns + .newBuilder() + .setInput(readRel) + .addAliases( + proto.Expression.Alias + .newBuilder() + .addName("part1") + .addName("part2") + .setExpr(proto.Expression.newBuilder + .setLiteral(proto.Expression.Literal.newBuilder.setInteger(32))))) + .build()) + } + assert(e.getMessage.contains("part1, part2")) + } + + test("transform UnresolvedStar and ExpressionString") { + val sql = + "SELECT * FROM VALUES (1,'spark',1), (2,'hadoop',2), (3,'kafka',3) AS tab(id, name, value)" + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery(sql) + .build()) + 
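+    // Project both the star (*) and the expression string "name" on top of the SQL relation;
+    // each result row below therefore carries the name column twice.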
+ val project = + proto.Project + .newBuilder() + .setInput(input) + .addExpressions( + proto.Expression + .newBuilder() + .setUnresolvedStar(UnresolvedStar.newBuilder().build()) + .build()) + .addExpressions( + proto.Expression + .newBuilder() + .setExpressionString(ExpressionString.newBuilder().setExpression("name").build()) + .build()) + .build() + + val df = + Dataset.ofRows(spark, transform(proto.Relation.newBuilder.setProject(project).build())) + val array = df.collect() + assert(array.length == 3) + assert(array(0).toString == InternalRow(1, "spark", 1, "spark").toString) + assert(array(1).toString == InternalRow(2, "hadoop", 2, "hadoop").toString) + assert(array(2).toString == InternalRow(3, "kafka", 3, "kafka").toString) + } + + test("transform UnresolvedStar with target field") { + val rows = (0 until 10).map { i => + InternalRow(InternalRow(InternalRow(i, i + 1))) + } + + val schema = StructType( + Seq( + StructField( + "a", + StructType(Seq(StructField( + "b", + StructType(Seq(StructField("c", IntegerType), StructField("d", IntegerType))))))))) + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelation = createLocalRelationProto(schema.toAttributes, inputRows) + + val project = + proto.Project + .newBuilder() + .setInput(localRelation) + .addExpressions( + proto.Expression + .newBuilder() + .setUnresolvedStar(UnresolvedStar.newBuilder().setUnparsedTarget("a.b.*").build()) + .build()) + .build() + + val df = + Dataset.ofRows(spark, transform(proto.Relation.newBuilder.setProject(project).build())) + assertResult(df.schema)( + StructType(Seq(StructField("c", IntegerType), StructField("d", IntegerType)))) + + val array = df.collect() + assert(array.length == 10) + for (i <- 0 until 10) { + assert(i == array(i).getInt(0)) + assert(i + 1 == array(i).getInt(1)) + } + } + + test("transform Project with Alias") { + val input = proto.Expression + .newBuilder() + .setLiteral( + proto.Expression.Literal + .newBuilder() + .setInteger(1) + .build()) + + val project = + proto.Project + .newBuilder() + .addExpressions( + proto.Expression + .newBuilder() + .setAlias(Alias.newBuilder().setExpr(input).addName("id").build()) + .build()) + .build() + + val df = + Dataset.ofRows(spark, transform(proto.Relation.newBuilder.setProject(project).build())) + assert(df.schema.fields.toSeq.map(_.name) == Seq("id")) + } + + test("Hint") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + .newBuilder() + .setHint(proto.Hint + .newBuilder() + .setInput(input) + .setName("REPARTITION") + .addParameters(proto.Expression.newBuilder().setLiteral(toLiteralProto(10000)).build())) + .build()) + + val df = Dataset.ofRows(spark, logical) + assert(df.rdd.partitions.length == 10000) + } + + test("Hint with illegal name will be ignored") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + .newBuilder() + .setHint( + proto.Hint + .newBuilder() + .setInput(input) + .setName("illegal")) + .build()) + assert(10 === Dataset.ofRows(spark, logical).count()) + } + + test("Hint with string attribute parameters") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + 
.newBuilder() + .setHint(proto.Hint + .newBuilder() + .setInput(input) + .setName("REPARTITION") + .addParameters(proto.Expression.newBuilder().setLiteral(toLiteralProto("id")).build())) + .build()) + assert(10 === Dataset.ofRows(spark, logical).count()) + } + + test("Hint with wrong parameters") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + .newBuilder() + .setHint(proto.Hint + .newBuilder() + .setInput(input) + .setName("REPARTITION") + .addParameters(proto.Expression.newBuilder().setLiteral(toLiteralProto(true)).build())) + .build()) + intercept[AnalysisException](Dataset.ofRows(spark, logical)) + } + + test("transform SortOrder") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("SELECT id FROM VALUES (5),(1),(2),(6),(4),(3),(7),(9),(8),(null) AS tab(id)") + .build()) + + val relation = proto.Relation + .newBuilder() + .setSort( + proto.Sort + .newBuilder() + .setInput(input) + .setIsGlobal(false) + .addOrder( + proto.Expression.SortOrder + .newBuilder() + .setDirectionValue( + proto.Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING_VALUE) + .setNullOrdering(proto.Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST) + .setChild(proto.Expression + .newBuilder() + .setExpressionString( + proto.Expression.ExpressionString.newBuilder().setExpression("id"))))) + .build() + val df = Dataset.ofRows(spark, transform(relation)) + df.foreachPartition { p: Iterator[Row] => + var previousValue: Int = -1 + p.foreach { r => + val v = r.getAs[Int](0) + // null will be converted to 0 + if (v == 0) { + assert(previousValue == -1, "null should be first") + } + if (previousValue != -1) { + assert(v > previousValue, "Partition is not ordered.") + } + previousValue = v + } + } + } + + test("RepartitionByExpression") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + .newBuilder() + .setRepartitionByExpression( + proto.RepartitionByExpression + .newBuilder() + .setInput(input) + .setNumPartitions(3) + .addPartitionExprs(proto.Expression.newBuilder + .setExpressionString(proto.Expression.ExpressionString.newBuilder + .setExpression("id % 2")))) + .build()) + + val df = Dataset.ofRows(spark, logical) + assert(df.rdd.partitions.length == 3) + val valueToPartition = df + .selectExpr("id", "spark_partition_id()") + .rdd + .map(row => (row.getLong(0), row.getInt(1))) + .collectAsMap() + for ((value, partition) <- valueToPartition) { + if (value % 2 == 0) { + assert(partition == valueToPartition(0), "dataframe is not partitioned by `id % 2`") + } else { + assert(partition == valueToPartition(1), "dataframe is not partitioned by `id % 2`") + } + } + } + + test("Repartition by range") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + .newBuilder() + .setRepartitionByExpression( + proto.RepartitionByExpression + .newBuilder() + .setInput(input) + .setNumPartitions(3) + .addPartitionExprs( + proto.Expression.newBuilder + .setSortOrder( + proto.Expression.SortOrder.newBuilder + .setDirectionValue( + proto.Expression.SortOrder.SortDirection.SORT_DIRECTION_ASCENDING_VALUE) + .setNullOrdering(proto.Expression.SortOrder.NullOrdering.SORT_NULLS_FIRST) + 
.setChild(proto.Expression + .newBuilder() + .setExpressionString( + proto.Expression.ExpressionString.newBuilder().setExpression("id")))))) + .build()) + + val df = Dataset.ofRows(spark, logical) + assert(df.rdd.partitions.length == 3) + df.rdd.foreachPartition { p => + var previousValue = -1L + p.foreach { r => + val v = r.getLong(0) + if (previousValue != -1L) { + assert(previousValue < v, "partition is not ordered.") + } + previousValue = v + } + } + } + + test("RepartitionByExpression with wrong parameters") { + val input = proto.Relation + .newBuilder() + .setSql( + proto.SQL + .newBuilder() + .setQuery("select id from range(10)") + .build()) + + val logical = transform( + proto.Relation + .newBuilder() + .setRepartitionByExpression( + proto.RepartitionByExpression + .newBuilder() + .setInput(input) + .addPartitionExprs(proto.Expression.newBuilder + .setExpressionString(proto.Expression.ExpressionString.newBuilder + .setExpression("illegal")))) + .build()) + + intercept[AnalysisException](Dataset.ofRows(spark, logical)) + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala new file mode 100644 index 0000000000000..824ee7aceb4bd --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectProtoSuite.scala @@ -0,0 +1,1061 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+package org.apache.spark.sql.connect.planner
+
+import java.nio.file.{Files, Paths}
+
+import scala.collection.JavaConverters._
+
+import com.google.protobuf.ByteString
+
+import org.apache.spark.{SparkClassNotFoundException, SparkIllegalArgumentException}
+import org.apache.spark.connect.proto
+import org.apache.spark.connect.proto.Expression
+import org.apache.spark.connect.proto.Join.JoinType
+import org.apache.spark.sql.{AnalysisException, Column, DataFrame, Observation, Row, SaveMode}
+import org.apache.spark.sql.catalyst.analysis
+import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericInternalRow, UnsafeProjection}
+import org.apache.spark.sql.catalyst.plans.{FullOuter, Inner, LeftAnti, LeftOuter, LeftSemi, PlanTest, RightOuter}
+import org.apache.spark.sql.catalyst.plans.logical.{Distinct, LocalRelation, LogicalPlan}
+import org.apache.spark.sql.connect.common.InvalidPlanInput
+import org.apache.spark.sql.connect.common.LiteralValueProtoConverter.toLiteralProto
+import org.apache.spark.sql.connect.dsl.MockRemoteSession
+import org.apache.spark.sql.connect.dsl.commands._
+import org.apache.spark.sql.connect.dsl.expressions._
+import org.apache.spark.sql.connect.dsl.plans._
+import org.apache.spark.sql.connector.catalog.{Identifier, InMemoryTableCatalog, TableCatalog}
+import org.apache.spark.sql.connector.catalog.CatalogV2Implicits.CatalogHelper
+import org.apache.spark.sql.execution.arrow.ArrowConverters
+import org.apache.spark.sql.functions._
+import org.apache.spark.sql.types.{ArrayType, BooleanType, ByteType, DataType, DateType, DecimalType, DoubleType, FloatType, IntegerType, LongType, MapType, Metadata, ShortType, StringType, StructField, StructType}
+import org.apache.spark.unsafe.types.UTF8String
+import org.apache.spark.util.Utils
+
+/**
+ * This suite is based on the Connect DSL and tests that, given the same DataFrame operations,
+ * Connect constructs a proto plan that can be translated back and, once analyzed, is the same
+ * as the plan generated by the Spark DataFrame API.
+ */ +class SparkConnectProtoSuite extends PlanTest with SparkConnectPlanTest { + lazy val connect = new MockRemoteSession() + + lazy val connectTestRelation = + createLocalRelationProto( + Seq(AttributeReference("id", IntegerType)(), AttributeReference("name", StringType)()), + Seq.empty) + + lazy val connectTestRelation2 = + createLocalRelationProto( + Seq(AttributeReference("id", IntegerType)(), AttributeReference("name", StringType)()), + Seq.empty) + + lazy val connectTestRelationMap = + createLocalRelationProto( + Seq(AttributeReference("id", MapType(StringType, StringType))()), + Seq.empty) + + lazy val sparkTestRelation: DataFrame = + spark.createDataFrame( + new java.util.ArrayList[Row](), + StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))) + + lazy val sparkTestRelation2: DataFrame = + spark.createDataFrame( + new java.util.ArrayList[Row](), + StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))) + + lazy val sparkTestRelationMap: DataFrame = + spark.createDataFrame( + new java.util.ArrayList[Row](), + StructType(Seq(StructField("id", MapType(StringType, StringType))))) + + lazy val localRelation = + createLocalRelationProto(Seq(AttributeReference("id", IntegerType)()), Seq.empty) + + test("Basic select") { + val connectPlan = connectTestRelation.select("id".protoAttr) + val sparkPlan = sparkTestRelation.select("id") + comparePlans(connectPlan, sparkPlan) + } + + test("Test select expression in strings") { + val connectPlan = connectTestRelation.selectExpr("abs(id)", "name") + val sparkPlan = sparkTestRelation.selectExpr("abs(id)", "name") + comparePlans(connectPlan, sparkPlan) + } + + test("UnresolvedFunction resolution.") { + val connectPlan = + connectTestRelation.select(callFunction(Seq("default", "hex"), Seq("id".protoAttr))) + + assertThrows[UnsupportedOperationException] { + analyzePlan(transform(connectPlan)) + } + + val validPlan = connectTestRelation.select(callFunction(Seq("hex"), Seq("id".protoAttr))) + assert(analyzePlan(transform(validPlan)) != null) + } + + test("Basic filter") { + val connectPlan = connectTestRelation.where("id".protoAttr < 0) + val sparkPlan = sparkTestRelation.where(Column("id") < 0) + comparePlans(connectPlan, sparkPlan) + } + + test("Basic joins with different join types") { + val connectPlan = connectTestRelation.join(connectTestRelation2) + val sparkPlan = sparkTestRelation.join(sparkTestRelation2) + comparePlans(connectPlan, sparkPlan) + + val connectPlan2 = connectTestRelation.join(connectTestRelation2) + val sparkPlan2 = sparkTestRelation.join(sparkTestRelation2) + comparePlans(connectPlan2, sparkPlan2) + + for ((t, y) <- Seq( + (JoinType.JOIN_TYPE_LEFT_OUTER, LeftOuter), + (JoinType.JOIN_TYPE_RIGHT_OUTER, RightOuter), + (JoinType.JOIN_TYPE_FULL_OUTER, FullOuter), + (JoinType.JOIN_TYPE_LEFT_ANTI, LeftAnti), + (JoinType.JOIN_TYPE_LEFT_SEMI, LeftSemi), + (JoinType.JOIN_TYPE_INNER, Inner))) { + + val connectPlan3 = connectTestRelation.join(connectTestRelation2, t, Seq("id")) + val sparkPlan3 = sparkTestRelation.join(sparkTestRelation2, Seq("id"), y.toString) + comparePlans(connectPlan3, sparkPlan3) + } + + val connectPlan4 = + connectTestRelation.join(connectTestRelation2, JoinType.JOIN_TYPE_INNER, Seq("name")) + val sparkPlan4 = sparkTestRelation.join(sparkTestRelation2, Seq("name"), Inner.toString) + comparePlans(connectPlan4, sparkPlan4) + } + + test("Test sample") { + val connectPlan = connectTestRelation.sample(0, 0.2, false, 1) + val sparkPlan = sparkTestRelation.sample(false, 
0.2 - 0, 1) + comparePlans(connectPlan, sparkPlan) + } + + test("Test sort") { + val connectPlan = connectTestRelation.sort("id", "name") + val sparkPlan = sparkTestRelation.sort("id", "name") + comparePlans(connectPlan, sparkPlan) + + val connectPlan2 = connectTestRelation.sortWithinPartitions("id", "name") + val sparkPlan2 = sparkTestRelation.sortWithinPartitions("id", "name") + comparePlans(connectPlan2, sparkPlan2) + } + + test("SPARK-41169: Test drop") { + // single column + val connectPlan = connectTestRelation.drop("id") + val sparkPlan = sparkTestRelation.drop("id") + comparePlans(connectPlan, sparkPlan) + + // all columns + val connectPlan2 = connectTestRelation.drop("id", "name") + val sparkPlan2 = sparkTestRelation.drop("id", "name") + comparePlans(connectPlan2, sparkPlan2) + + // non-existing column + val connectPlan3 = connectTestRelation.drop("id2", "name") + val sparkPlan3 = sparkTestRelation.drop("id2", "name") + comparePlans(connectPlan3, sparkPlan3) + } + + test("SPARK-40809: column alias") { + // Simple Test. + val connectPlan = connectTestRelation.select("id".protoAttr.as("id2")) + val sparkPlan = sparkTestRelation.select(Column("id").alias("id2")) + comparePlans(connectPlan, sparkPlan) + + // Scalar columns with metadata + val mdJson = "{\"max\": 99}" + comparePlans( + connectTestRelation.select("id".protoAttr.as("id2", mdJson)), + sparkTestRelation.select(Column("id").as("id2", Metadata.fromJson(mdJson)))) + + comparePlans( + connectTestRelationMap.select(proto_explode("id".protoAttr).as(Seq("a", "b"))), + sparkTestRelationMap.select(explode(Column("id")).as(Seq("a", "b")))) + + // Metadata must only be specified for regular Aliases. + assertThrows[InvalidPlanInput] { + val attr = proto_explode("id".protoAttr) + val alias = proto.Expression.Alias + .newBuilder() + .setExpr(attr) + .addName("a") + .addName("b") + .setMetadata(mdJson) + .build() + transform( + connectTestRelationMap.select(proto.Expression.newBuilder().setAlias(alias).build())) + } + } + + test("Aggregate with more than 1 grouping expressions") { + val connectPlan = + connectTestRelation.groupBy("id".protoAttr, "name".protoAttr)() + val sparkPlan = + sparkTestRelation.groupBy(Column("id"), Column("name")).agg(Map.empty[String, String]) + comparePlans(connectPlan, sparkPlan) + } + + test("Aggregate expressions") { + val connectPlan = + connectTestRelation.groupBy("id".protoAttr)(proto_min("name".protoAttr)) + val sparkPlan = + sparkTestRelation.groupBy(Column("id")).agg(min(Column("name"))) + comparePlans(connectPlan, sparkPlan) + + val connectPlan2 = + connectTestRelation.groupBy("id".protoAttr)(proto_min("name".protoAttr).as("agg1")) + val sparkPlan2 = + sparkTestRelation.groupBy(Column("id")).agg(min(Column("name")).as("agg1")) + comparePlans(connectPlan2, sparkPlan2) + } + + test("Rollup expressions") { + val connectPlan1 = + connectTestRelation.rollup("id".protoAttr)(proto_min("name".protoAttr)) + val sparkPlan1 = + sparkTestRelation.rollup(Column("id")).agg(min(Column("name"))) + comparePlans(connectPlan1, sparkPlan1) + + val connectPlan2 = + connectTestRelation.rollup("id".protoAttr)(proto_min("name".protoAttr).as("agg1")) + val sparkPlan2 = + sparkTestRelation.rollup(Column("id")).agg(min(Column("name")).as("agg1")) + comparePlans(connectPlan2, sparkPlan2) + + val connectPlan3 = + connectTestRelation.rollup("id".protoAttr, "name".protoAttr)( + proto_min(proto.Expression.newBuilder().setLiteral(toLiteralProto(1)).build()) + .as("agg1")) + val sparkPlan3 = + sparkTestRelation + 
.rollup(Column("id"), Column("name")) + .agg(min(lit(1)).as("agg1")) + comparePlans(connectPlan3, sparkPlan3) + } + + test("Cube expressions") { + val connectPlan1 = + connectTestRelation.cube("id".protoAttr)(proto_min("name".protoAttr)) + val sparkPlan1 = + sparkTestRelation.cube(Column("id")).agg(min(Column("name"))) + comparePlans(connectPlan1, sparkPlan1) + + val connectPlan2 = + connectTestRelation.cube("id".protoAttr)(proto_min("name".protoAttr).as("agg1")) + val sparkPlan2 = + sparkTestRelation.cube(Column("id")).agg(min(Column("name")).as("agg1")) + comparePlans(connectPlan2, sparkPlan2) + + val connectPlan3 = + connectTestRelation.cube("id".protoAttr, "name".protoAttr)( + proto_min(proto.Expression.newBuilder().setLiteral(toLiteralProto(1)).build()) + .as("agg1")) + val sparkPlan3 = + sparkTestRelation + .cube(Column("id"), Column("name")) + .agg(min(lit(1)).as("agg1")) + comparePlans(connectPlan3, sparkPlan3) + } + + test("Pivot expressions") { + val connectPlan1 = + connectTestRelation.pivot("id".protoAttr)( + "name".protoAttr, + Seq("a", "b", "c").map(toLiteralProto))( + proto_min(proto.Expression.newBuilder().setLiteral(toLiteralProto(1)).build()) + .as("agg1")) + val sparkPlan1 = + sparkTestRelation + .groupBy(Column("id")) + .pivot(Column("name"), Seq("a", "b", "c")) + .agg(min(lit(1)).as("agg1")) + comparePlans(connectPlan1, sparkPlan1) + + val connectPlan2 = + connectTestRelation.pivot("name".protoAttr)( + "id".protoAttr, + Seq(1, 2, 3).map(toLiteralProto))( + proto_min(proto.Expression.newBuilder().setLiteral(toLiteralProto(1)).build()) + .as("agg1")) + val sparkPlan2 = + sparkTestRelation + .groupBy(Column("name")) + .pivot(Column("id"), Seq(1, 2, 3)) + .agg(min(lit(1)).as("agg1")) + comparePlans(connectPlan2, sparkPlan2) + } + + test("Test as(alias: String)") { + val connectPlan = connectTestRelation.as("target_table") + val sparkPlan = sparkTestRelation.as("target_table") + comparePlans(connectPlan, sparkPlan) + } + + test("Test StructType in LocalRelation") { + val connectPlan = createLocalRelationProtoByAttributeReferences( + Seq(AttributeReference("a", StructType(Seq(StructField("id", IntegerType))))())) + val sparkPlan = + LocalRelation(AttributeReference("a", StructType(Seq(StructField("id", IntegerType))))()) + comparePlans(connectPlan, sparkPlan) + } + + test("Test limit offset") { + val connectPlan = connectTestRelation.limit(10) + val sparkPlan = sparkTestRelation.limit(10) + comparePlans(connectPlan, sparkPlan) + + val connectPlan2 = connectTestRelation.offset(2) + val sparkPlan2 = sparkTestRelation.offset(2) + comparePlans(connectPlan2, sparkPlan2) + + val connectPlan3 = connectTestRelation.limit(10).offset(2) + val sparkPlan3 = sparkTestRelation.limit(10).offset(2) + comparePlans(connectPlan3, sparkPlan3) + + val connectPlan4 = connectTestRelation.offset(2).limit(10) + val sparkPlan4 = sparkTestRelation.offset(2).limit(10) + comparePlans(connectPlan4, sparkPlan4) + } + + test("Test basic deduplicate") { + val connectPlan = connectTestRelation.distinct() + val sparkPlan = sparkTestRelation.distinct() + comparePlans(connectPlan, sparkPlan) + + val connectPlan2 = connectTestRelation.deduplicate(Seq("id", "name")) + val sparkPlan2 = sparkTestRelation.dropDuplicates(Seq("id", "name")) + comparePlans(connectPlan2, sparkPlan2) + } + + test("Test union, except, intersect") { + val connectPlan1 = connectTestRelation.except(connectTestRelation, isAll = false) + val sparkPlan1 = sparkTestRelation.except(sparkTestRelation) + comparePlans(connectPlan1, sparkPlan1) + + 
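+    // isAll = true should map to Dataset.exceptAll, which keeps duplicate rows.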
val connectPlan2 = connectTestRelation.except(connectTestRelation, isAll = true) + val sparkPlan2 = sparkTestRelation.exceptAll(sparkTestRelation) + comparePlans(connectPlan2, sparkPlan2) + + val connectPlan3 = connectTestRelation.intersect(connectTestRelation, isAll = false) + val sparkPlan3 = sparkTestRelation.intersect(sparkTestRelation) + comparePlans(connectPlan3, sparkPlan3) + + val connectPlan4 = connectTestRelation.intersect(connectTestRelation, isAll = true) + val sparkPlan4 = sparkTestRelation.intersectAll(sparkTestRelation) + comparePlans(connectPlan4, sparkPlan4) + + val connectPlan5 = connectTestRelation.union(connectTestRelation, isAll = true) + val sparkPlan5 = sparkTestRelation.union(sparkTestRelation) + comparePlans(connectPlan5, sparkPlan5) + + val connectPlan6 = connectTestRelation.union(connectTestRelation, isAll = false) + val sparkPlan6 = Distinct(sparkTestRelation.union(sparkTestRelation).logicalPlan) + comparePlans(connectPlan6, sparkPlan6) + + val connectPlan7 = + connectTestRelation.union(connectTestRelation2, isAll = true, byName = true) + val sparkPlan7 = sparkTestRelation.unionByName(sparkTestRelation2) + comparePlans(connectPlan7, sparkPlan7) + + val connectPlan8 = + connectTestRelation.union(connectTestRelation2, isAll = false, byName = true) + val sparkPlan8 = Distinct(sparkTestRelation.unionByName(sparkTestRelation2).logicalPlan) + comparePlans(connectPlan8, sparkPlan8) + } + + test("Test Range") { + comparePlans(connect.range(None, 10, None, None), spark.range(10).toDF()) + comparePlans(connect.range(Some(2), 10, None, None), spark.range(2, 10).toDF()) + comparePlans(connect.range(Some(2), 10, Some(10), None), spark.range(2, 10, 10).toDF()) + comparePlans( + connect.range(Some(2), 10, Some(10), Some(100)), + spark.range(2, 10, 10, 100).toDF()) + } + + test("Test Session.sql") { + comparePlans(connect.sql("SELECT 1"), spark.sql("SELECT 1")) + } + + test("Test Repartition") { + val connectPlan1 = connectTestRelation.repartition(12) + val sparkPlan1 = sparkTestRelation.repartition(12) + comparePlans(connectPlan1, sparkPlan1) + + val connectPlan2 = connectTestRelation.coalesce(2) + val sparkPlan2 = sparkTestRelation.coalesce(2) + comparePlans(connectPlan2, sparkPlan2) + } + + test("Test RepartitionByExpression") { + val connectPlan1 = connectTestRelation.repartition(12, "id".protoAttr) + val sparkPlan1 = sparkTestRelation.repartition(12, sparkTestRelation.col("id")) + comparePlans(connectPlan1, sparkPlan1) + + val connectPlan2 = connectTestRelation.repartition("id".protoAttr) + val sparkPlan2 = sparkTestRelation.repartition(sparkTestRelation.col("id")) + comparePlans(connectPlan2, sparkPlan2) + } + + test("Test repartitionByRange") { + val connectPlan1 = connectTestRelation.repartitionByRange(12, "id".protoAttr) + val sparkPlan1 = sparkTestRelation.repartitionByRange(12, sparkTestRelation.col("id")) + comparePlans(connectPlan1, sparkPlan1) + + val connectPlan2 = connectTestRelation.repartitionByRange("id".protoAttr) + val sparkPlan2 = sparkTestRelation.repartitionByRange(sparkTestRelation.col("id")) + comparePlans(connectPlan2, sparkPlan2) + + val connectPlan3 = connectTestRelation.repartitionByRange(12, "id".asc) + val sparkPlan3 = sparkTestRelation.repartitionByRange(12, sparkTestRelation.col("id").asc) + comparePlans(connectPlan3, sparkPlan3) + } + + test("SPARK-41128: Test fill na") { + comparePlans(connectTestRelation.na.fillValue(1L), sparkTestRelation.na.fill(1L)) + comparePlans(connectTestRelation.na.fillValue(1.5), sparkTestRelation.na.fill(1.5)) + 
comparePlans(connectTestRelation.na.fillValue("str"), sparkTestRelation.na.fill("str")) + comparePlans( + connectTestRelation.na.fillColumns(1L, Seq("id")), + sparkTestRelation.na.fill(1L, Seq("id"))) + comparePlans( + connectTestRelation.na.fillValueMap(Map("id" -> 1L)), + sparkTestRelation.na.fill(Map("id" -> 1L))) + comparePlans( + connectTestRelation.na.fillValueMap(Map("id" -> 1L, "name" -> "xyz")), + sparkTestRelation.na.fill(Map("id" -> 1L, "name" -> "xyz"))) + } + + test("SPARK-41148: Test drop na") { + comparePlans(connectTestRelation.na.drop(), sparkTestRelation.na.drop()) + comparePlans( + connectTestRelation.na.drop(cols = Seq("id")), + sparkTestRelation.na.drop(cols = Seq("id"))) + comparePlans( + connectTestRelation.na.drop(how = Some("all")), + sparkTestRelation.na.drop(how = "all")) + comparePlans( + connectTestRelation.na.drop(how = Some("all"), cols = Seq("id", "name")), + sparkTestRelation.na.drop(how = "all", cols = Seq("id", "name"))) + comparePlans( + connectTestRelation.na.drop(minNonNulls = Some(1)), + sparkTestRelation.na.drop(minNonNulls = 1)) + comparePlans( + connectTestRelation.na.drop(minNonNulls = Some(1), cols = Seq("id", "name")), + sparkTestRelation.na.drop(minNonNulls = 1, cols = Seq("id", "name"))) + } + + test("SPARK-41315: Test replace") { + comparePlans( + connectTestRelation.na.replace(cols = Seq("id"), replacement = Map(1.0 -> 2.0)), + sparkTestRelation.na.replace(cols = Seq("id"), replacement = Map(1.0 -> 2.0))) + comparePlans( + connectTestRelation.na.replace(cols = Seq("name"), replacement = Map("a" -> "b")), + sparkTestRelation.na.replace(cols = Seq("name"), replacement = Map("a" -> "b"))) + comparePlans( + connectTestRelation.na.replace(cols = Seq("*"), replacement = Map("a" -> "b")), + sparkTestRelation.na.replace(col = "*", replacement = Map("a" -> "b"))) + } + + test("Test summary") { + comparePlans( + connectTestRelation.summary("count", "mean", "stddev"), + sparkTestRelation.summary("count", "mean", "stddev")) + } + + test("Test describe") { + comparePlans( + connectTestRelation.describe("id", "name"), + sparkTestRelation.describe("id", "name")) + } + + test("Test crosstab") { + comparePlans( + connectTestRelation.stat.crosstab("id", "name"), + sparkTestRelation.stat.crosstab("id", "name")) + } + + test("Test freqItems") { + comparePlans( + connectTestRelation.stat.freqItems(Seq("id", "name"), 1), + sparkTestRelation.stat.freqItems(Seq("id", "name"), 1)) + + comparePlans( + connectTestRelation.stat.freqItems(Seq("id", "name")), + sparkTestRelation.stat.freqItems(Seq("id", "name"))) + } + + test("Test to") { + val dataTypes: Seq[DataType] = Seq( + StringType, + DateType, + BooleanType, + ByteType, + ShortType, + IntegerType, + LongType, + FloatType, + DoubleType, + DecimalType.SYSTEM_DEFAULT, + DecimalType.USER_DEFAULT, + ArrayType(IntegerType, true), + MapType(StringType, IntegerType, false), + new StructType().add("f1", IntegerType)) + + val schema = StructType(dataTypes.map(t => StructField(t.getClass.getName, t))) + comparePlans(connectTestRelation.to(schema), sparkTestRelation.to(schema)) + } + + test("Test toDF") { + comparePlans(connectTestRelation.toDF("col1", "col2"), sparkTestRelation.toDF("col1", "col2")) + } + + test("Test withColumnsRenamed") { + comparePlans( + connectTestRelation.withColumnsRenamed(Map("id" -> "id1")), + sparkTestRelation.withColumnsRenamed(Map("id" -> "id1"))) + comparePlans( + connectTestRelation.withColumnsRenamed(Map("id" -> "id1", "name" -> "name1")), + sparkTestRelation.withColumnsRenamed(Map("id" -> 
"id1", "name" -> "name1"))) + comparePlans( + connectTestRelation.withColumnsRenamed(Map("id" -> "id1", "col1" -> "col2")), + sparkTestRelation.withColumnsRenamed(Map("id" -> "id1", "col1" -> "col2"))) + comparePlans( + connectTestRelation.withColumnsRenamed(Map("id" -> "id1", "id" -> "id2")), + sparkTestRelation.withColumnsRenamed(Map("id" -> "id1", "id" -> "id2"))) + + checkError( + exception = intercept[AnalysisException] { + transform( + connectTestRelation.withColumnsRenamed( + Map("id" -> "duplicatedCol", "name" -> "duplicatedCol"))) + }, + errorClass = "COLUMN_ALREADY_EXISTS", + parameters = Map("columnName" -> "`duplicatedcol`")) + } + + test("Writes fails without path or table") { + assertThrows[SparkIllegalArgumentException] { + transform(localRelation.write()) + } + } + + test("Writes without path or table") { + transform(localRelation.write(format = Some("noop"), mode = Some("Append"))) + } + + test("Write fails with unknown table - AnalysisException") { + val cmd = readRel.write(tableName = Some("dest")) + assertThrows[AnalysisException] { + transform(cmd) + } + } + + test("Write with partitions") { + val cmd = localRelation.write( + tableName = Some("testtable"), + tableSaveMethod = Some("save_as_table"), + format = Some("parquet"), + partitionByCols = Seq("noid")) + assertThrows[AnalysisException] { + transform(cmd) + } + } + + test("Write with invalid bucketBy configuration") { + val cmd = localRelation.write(bucketByCols = Seq("id"), numBuckets = Some(0)) + assertThrows[InvalidCommandInput] { + transform(cmd) + } + } + + test("Write to Path") { + withTempDir { f => + val cmd = localRelation.write( + format = Some("parquet"), + path = Some(f.getPath), + mode = Some("Overwrite")) + transform(cmd) + assert(Files.exists(Paths.get(f.getPath)), s"Output file must exist: ${f.getPath}") + } + + // should work if format is not set + withTempPath { f => + transform(localRelation.write(path = Some(f.getCanonicalPath))) + assert(Files.exists(Paths.get(f.getPath)), s"Output file must exist: ${f.getPath}") + } + } + + test("Write to Path with invalid input") { + // Wrong data source. + assertThrows[SparkClassNotFoundException]( + transform( + localRelation.write(path = Some("/tmp/tmppath"), format = Some("ThisAintNoFormat")))) + } + + test("Write with sortBy") { + // Sort by existing column. + withTable("testtable") { + transform( + localRelation.write( + tableName = Some("testtable"), + tableSaveMethod = Some("save_as_table"), + format = Some("parquet"), + sortByColumns = Seq("id"), + bucketByCols = Seq("id"), + numBuckets = Some(10))) + } + + // Sort by non-existing column + assertThrows[AnalysisException]( + transform( + localRelation + .write( + tableName = Some("testtable"), + tableSaveMethod = Some("save_as_table"), + format = Some("parquet"), + sortByColumns = Seq("noid"), + bucketByCols = Seq("id"), + numBuckets = Some(10)))) + } + + test("Write to Table") { + withTable("testtable") { + val cmd = localRelation.write( + format = Some("parquet"), + tableName = Some("testtable"), + tableSaveMethod = Some("save_as_table")) + transform(cmd) + // Check that we can find and drop the table. 
+ spark.sql(s"select count(*) from testtable").collect() + } + } + + test("SaveMode conversion tests") { + assertThrows[IllegalArgumentException]( + SaveModeConverter.toSaveMode(proto.WriteOperation.SaveMode.SAVE_MODE_UNSPECIFIED)) + + val combinations = Seq( + (SaveMode.Append, proto.WriteOperation.SaveMode.SAVE_MODE_APPEND), + (SaveMode.Ignore, proto.WriteOperation.SaveMode.SAVE_MODE_IGNORE), + (SaveMode.Overwrite, proto.WriteOperation.SaveMode.SAVE_MODE_OVERWRITE), + (SaveMode.ErrorIfExists, proto.WriteOperation.SaveMode.SAVE_MODE_ERROR_IF_EXISTS)) + combinations.foreach { a => + assert(SaveModeConverter.toSaveModeProto(a._1) == a._2) + assert(SaveModeConverter.toSaveMode(a._2) == a._1) + } + } + + test("TableSaveMethod conversion tests") { + assertThrows[IllegalArgumentException]( + TableSaveMethodConverter.toTableSaveMethodProto("unknown")) + + val combinations = Seq( + ( + "save_as_table", + proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_SAVE_AS_TABLE), + ( + "insert_into", + proto.WriteOperation.SaveTable.TableSaveMethod.TABLE_SAVE_METHOD_INSERT_INTO)) + combinations.foreach { a => + assert(TableSaveMethodConverter.toTableSaveMethodProto(a._1) == a._2) + } + } + + test("WriteTo with create") { + withTable("testcat.table_name") { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + + val rows = Seq( + new GenericInternalRow(Array(1L, UTF8String.fromString("a"))), + new GenericInternalRow(Array(2L, UTF8String.fromString("b"))), + new GenericInternalRow(Array(3L, UTF8String.fromString("c")))) + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelationV2 = createLocalRelationProto(schema.toAttributes, inputRows) + + val cmd = localRelationV2.writeV2( + tableName = Some("testcat.table_name"), + mode = Some("MODE_CREATE")) + transform(cmd) + + val outputRows = spark.table("testcat.table_name").collect() + assert(outputRows.length == 3) + } + } + + test("WriteTo with create and using") { + val defaultOwnership = Map(TableCatalog.PROP_OWNER -> Utils.getCurrentUserName()) + withTable("testcat.table_name") { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + + val rows = Seq( + new GenericInternalRow(Array(1L, UTF8String.fromString("a"))), + new GenericInternalRow(Array(2L, UTF8String.fromString("b"))), + new GenericInternalRow(Array(3L, UTF8String.fromString("c")))) + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelationV2 = createLocalRelationProto(schema.toAttributes, inputRows) + + val cmd = localRelationV2.writeV2( + tableName = Some("testcat.table_name"), + provider = Some("foo"), + mode = Some("MODE_CREATE")) + transform(cmd) + + val outputRows = spark.table("testcat.table_name").collect() + assert(outputRows.length == 3) + val table = spark.sessionState.catalogManager + .catalog("testcat") + .asTableCatalog + .loadTable(Identifier.of(Array(), "table_name")) + assert(table.name === "testcat.table_name") + assert(table.schema === new StructType().add("id", LongType).add("data", StringType)) + assert(table.partitioning.isEmpty) + assert(table.properties === (Map("provider" -> "foo") ++ defaultOwnership).asJava) + } + } + + test("WriteTo with append") { + 
withTable("testcat.table_name") { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + + val rows = Seq( + new GenericInternalRow(Array(1L, UTF8String.fromString("a"))), + new GenericInternalRow(Array(2L, UTF8String.fromString("b"))), + new GenericInternalRow(Array(3L, UTF8String.fromString("c")))) + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelationV2 = createLocalRelationProto(schema.toAttributes, inputRows) + + spark.sql("CREATE TABLE testcat.table_name (id bigint, data string) USING foo") + + assert(spark.table("testcat.table_name").collect().isEmpty) + + val cmd = localRelationV2.writeV2( + tableName = Some("testcat.table_name"), + mode = Some("MODE_APPEND")) + transform(cmd) + + val outputRows = spark.table("testcat.table_name").collect() + assert(outputRows.length == 3) + } + } + + test("WriteTo with overwrite") { + withTable("testcat.table_name") { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + + val rows1 = (1L to 3L).map { i => + new GenericInternalRow(Array(i, UTF8String.fromString("" + (i - 1 + 'a').toChar))) + } + val rows2 = (4L to 7L).map { i => + new GenericInternalRow(Array(i, UTF8String.fromString("" + (i - 1 + 'a').toChar))) + } + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + val inputRows1 = rows1.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + val inputRows2 = rows2.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelation1V2 = createLocalRelationProto(schema.toAttributes, inputRows1) + val localRelation2V2 = createLocalRelationProto(schema.toAttributes, inputRows2) + + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + + assert(spark.table("testcat.table_name").collect().isEmpty) + + val cmd1 = localRelation1V2.writeV2( + tableName = Some("testcat.table_name"), + mode = Some("MODE_APPEND")) + transform(cmd1) + + val outputRows1 = spark.table("testcat.table_name").collect() + assert(outputRows1.length == 3) + + val overwriteCondition = Expression + .newBuilder() + .setLiteral(Expression.Literal.newBuilder().setBoolean(true)) + .build() + + val cmd2 = localRelation2V2.writeV2( + tableName = Some("testcat.table_name"), + mode = Some("MODE_OVERWRITE"), + overwriteCondition = Some(overwriteCondition)) + transform(cmd2) + + val outputRows2 = spark.table("testcat.table_name").collect() + assert(outputRows2.length == 4) + } + } + + test("WriteTo with overwritePartitions") { + withTable("testcat.table_name") { + spark.conf.set("spark.sql.catalog.testcat", classOf[InMemoryTableCatalog].getName) + + val rows = (4L to 7L).map { i => + new GenericInternalRow(Array(i, UTF8String.fromString("" + (i - 1 + 'a').toChar))) + } + + val schema = StructType(Array(StructField("id", LongType), StructField("data", StringType))) + val inputRows = rows.map { row => + val proj = UnsafeProjection.create(schema) + proj(row).copy() + } + + val localRelationV2 = createLocalRelationProto(schema.toAttributes, inputRows) + + spark.sql( + "CREATE TABLE testcat.table_name (id bigint, data string) USING foo PARTITIONED BY (id)") + + assert(spark.table("testcat.table_name").collect().isEmpty) + + val cmd = localRelationV2.writeV2( + tableName = Some("testcat.table_name"), + mode = 
Some("MODE_OVERWRITE_PARTITIONS")) + transform(cmd) + + val outputRows = spark.table("testcat.table_name").collect() + assert(outputRows.length == 4) + } + } + + test("Test CreateView") { + withView("view1", "view2", "view3", "view4") { + transform(localRelation.createView("view1", global = true, replace = true)) + assert(spark.catalog.tableExists("global_temp.view1")) + + transform(localRelation.createView("view2", global = false, replace = true)) + assert(spark.catalog.tableExists("view2")) + + transform(localRelation.createView("view3", global = true, replace = false)) + assertThrows[AnalysisException] { + transform(localRelation.createView("view3", global = true, replace = false)) + } + + transform(localRelation.createView("view4", global = false, replace = false)) + assertThrows[AnalysisException] { + transform(localRelation.createView("view4", global = false, replace = false)) + } + } + } + + test("Project does not require an input") { + comparePlans(select(1), spark.sql("SELECT 1")) + } + + test("Test withColumns") { + comparePlans( + connectTestRelation.withColumns(Map("id" -> 1024, "col_not_exist" -> 2048)), + sparkTestRelation.withColumns(Map("id" -> lit(1024), "col_not_exist" -> lit(2048)))) + } + + test("Test cast") { + comparePlans( + connectTestRelation.select("id".protoAttr.cast( + proto.DataType.newBuilder().setString(proto.DataType.String.getDefaultInstance).build())), + sparkTestRelation.select(col("id").cast(StringType))) + + comparePlans( + connectTestRelation.select("id".protoAttr.cast("string")), + sparkTestRelation.select(col("id").cast("string"))) + } + + test("Test colRegex") { + comparePlans( + connectTestRelation.select("id".colRegex), + sparkTestRelation.select(sparkTestRelation.colRegex("id"))) + + comparePlans( + connectTestRelation.select("`(_1)?+.+`".colRegex), + sparkTestRelation.select(sparkTestRelation.colRegex("`(_1)?+.+`"))) + } + + test("Test Hint") { + comparePlans(connectTestRelation.hint("COALESCE", 3), sparkTestRelation.hint("COALESCE", 3)) + } + + test("Test Unpivot") { + val connectPlan0 = + connectTestRelation.unpivot(Seq("id".protoAttr), Seq("name".protoAttr), "variable", "value") + val sparkPlan0 = + sparkTestRelation.unpivot(Array(Column("id")), Array(Column("name")), "variable", "value") + comparePlans(connectPlan0, sparkPlan0) + + val connectPlan1 = + connectTestRelation.unpivot(Seq("id".protoAttr), "variable", "value") + val sparkPlan1 = + sparkTestRelation.unpivot(Array(Column("id")), "variable", "value") + comparePlans(connectPlan1, sparkPlan1) + } + + test("Test Melt") { + val connectPlan0 = + connectTestRelation.melt(Seq("id".protoAttr), Seq("name".protoAttr), "variable", "value") + val sparkPlan0 = + sparkTestRelation.melt(Array(Column("id")), Array(Column("name")), "variable", "value") + comparePlans(connectPlan0, sparkPlan0) + + val connectPlan1 = + connectTestRelation.melt(Seq("id".protoAttr), "variable", "value") + val sparkPlan1 = + sparkTestRelation.melt(Array(Column("id")), "variable", "value") + comparePlans(connectPlan1, sparkPlan1) + } + + test("Test observe") { + val connectPlan0 = + connectTestRelation.observe( + "my_metric", + proto_min("id".protoAttr).as("min_val"), + proto_max("id".protoAttr).as("max_val"), + proto_sum("id".protoAttr)) + val sparkPlan0 = + sparkTestRelation.observe( + "my_metric", + min(Column("id")).as("min_val"), + max(Column("id")).as("max_val"), + sum(Column("id"))) + comparePlans(connectPlan0, sparkPlan0) + + val connectPlan1 = + connectTestRelation.observe("my_metric", 
proto_min("id".protoAttr).as("min_val")) + val sparkPlan1 = + sparkTestRelation.observe("my_metric", min(Column("id")).as("min_val")) + comparePlans(connectPlan1, sparkPlan1) + + checkError( + exception = intercept[AnalysisException] { + analyzePlan( + transform(connectTestRelation.observe("my_metric", "id".protoAttr.cast("string")))) + }, + errorClass = "_LEGACY_ERROR_TEMP_2322", + parameters = Map("sqlExpr" -> "CAST(id AS STRING) AS id")) + + val connectPlan2 = + connectTestRelation.observe( + Observation("my_metric"), + proto_min("id".protoAttr).as("min_val"), + proto_max("id".protoAttr).as("max_val"), + proto_sum("id".protoAttr)) + val sparkPlan2 = + sparkTestRelation.observe( + Observation("my_metric"), + min(Column("id")).as("min_val"), + max(Column("id")).as("max_val"), + sum(Column("id"))) + comparePlans(connectPlan2, sparkPlan2) + + val connectPlan3 = + connectTestRelation.observe( + Observation("my_metric"), + proto_min("id".protoAttr).as("min_val")) + val sparkPlan3 = + sparkTestRelation.observe(Observation("my_metric"), min(Column("id")).as("min_val")) + comparePlans(connectPlan3, sparkPlan3) + + checkError( + exception = intercept[AnalysisException] { + analyzePlan( + transform( + connectTestRelation.observe(Observation("my_metric"), "id".protoAttr.cast("string")))) + }, + errorClass = "_LEGACY_ERROR_TEMP_2322", + parameters = Map("sqlExpr" -> "CAST(id AS STRING) AS id")) + } + + test("Test RandomSplit") { + val splitRelations0 = connectTestRelation.randomSplit(Array[Double](1, 2, 3), 1) + val splits0 = sparkTestRelation.randomSplit(Array[Double](1, 2, 3), 1) + assert(splitRelations0.length == splits0.length) + splitRelations0.zip(splits0).foreach { case (connectPlan, sparkPlan) => + comparePlans(connectPlan, sparkPlan) + } + + val splitRelations1 = connectTestRelation.randomSplit(Array[Double](1, 2, 3)) + val splits1 = sparkTestRelation.randomSplit(Array[Double](1, 2, 3)) + assert(splitRelations1.length == splits1.length) + splitRelations1.zip(splits1).foreach { case (connectPlan, sparkPlan) => + comparePlans(connectPlan, sparkPlan) + } + } + + private def createLocalRelationProtoByAttributeReferences( + attrs: Seq[AttributeReference]): proto.Relation = { + val localRelationBuilder = proto.LocalRelation.newBuilder() + + val attributes = attrs.map(exp => AttributeReference(exp.name, exp.dataType)()) + val buffer = ArrowConverters + .toBatchWithSchemaIterator( + Iterator.empty, + StructType.fromAttributes(attributes), + Long.MaxValue, + Long.MaxValue, + null) + .next() + proto.Relation + .newBuilder() + .setLocalRelation(localRelationBuilder.setData(ByteString.copyFrom(buffer)).build()) + .build() + } + + // This is a function for testing only. This is used when the plan is ready and it only waits + // analyzer to analyze attribute references within the plan. + private def analyzePlan(plan: LogicalPlan): LogicalPlan = { + val connectAnalyzed = analysis.SimpleAnalyzer.execute(plan) + analysis.SimpleAnalyzer.checkAnalysis(connectAnalyzed) + connectAnalyzed + } + + // Compares proto plan with DataFrame. + private def comparePlans(connectPlan: proto.Relation, sparkPlan: DataFrame): Unit = { + comparePlans(connectPlan, sparkPlan.queryExecution.analyzed) + } + + // Compares proto plan with LogicalPlan. 
+ private def comparePlans(connectPlan: proto.Relation, sparkPlan: LogicalPlan): Unit = { + val connectAnalyzed = analyzePlan(transform(connectPlan)) + comparePlans(connectAnalyzed, sparkPlan, false) + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala new file mode 100644 index 0000000000000..c36ba76f98451 --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/planner/SparkConnectServiceSuite.scala @@ -0,0 +1,373 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.connect.planner + +import scala.collection.JavaConverters._ +import scala.collection.mutable + +import io.grpc.StatusRuntimeException +import io.grpc.stub.StreamObserver +import org.apache.arrow.memory.RootAllocator +import org.apache.arrow.vector.{BigIntVector, Float8Vector} +import org.apache.arrow.vector.ipc.ArrowStreamReader + +import org.apache.spark.connect.proto +import org.apache.spark.sql.connect.dsl.MockRemoteSession +import org.apache.spark.sql.connect.dsl.expressions._ +import org.apache.spark.sql.connect.dsl.plans._ +import org.apache.spark.sql.connect.service.{SparkConnectAnalyzeHandler, SparkConnectService} +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Testing Connect Service implementation. 
+ */ +class SparkConnectServiceSuite extends SharedSparkSession { + + test("Test schema in analyze response") { + withTable("test") { + spark.sql(""" + | CREATE TABLE test (col1 INT, col2 STRING) + | USING parquet + |""".stripMargin) + + val plan = proto.Plan + .newBuilder() + .setRoot( + proto.Relation + .newBuilder() + .setRead( + proto.Read + .newBuilder() + .setNamedTable( + proto.Read.NamedTable.newBuilder.setUnparsedIdentifier("test").build()) + .build()) + .build()) + .build() + + val handler = new SparkConnectAnalyzeHandler(null) + + val request1 = proto.AnalyzePlanRequest + .newBuilder() + .setSchema(proto.AnalyzePlanRequest.Schema.newBuilder().setPlan(plan).build()) + .build() + val response1 = handler.process(request1, spark) + assert(response1.hasSchema) + assert(response1.getSchema.getSchema.hasStruct) + val schema = response1.getSchema.getSchema.getStruct + assert(schema.getFieldsCount == 2) + assert( + schema.getFields(0).getName == "col1" + && schema.getFields(0).getDataType.getKindCase == proto.DataType.KindCase.INTEGER) + assert( + schema.getFields(1).getName == "col2" + && schema.getFields(1).getDataType.getKindCase == proto.DataType.KindCase.STRING) + + val request2 = proto.AnalyzePlanRequest + .newBuilder() + .setExplain( + proto.AnalyzePlanRequest.Explain + .newBuilder() + .setPlan(plan) + .setExplainMode(proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_SIMPLE) + .build()) + .build() + val response2 = handler.process(request2, spark) + assert(response2.hasExplain) + assert(response2.getExplain.getExplainString.size > 0) + + val request3 = proto.AnalyzePlanRequest + .newBuilder() + .setIsLocal(proto.AnalyzePlanRequest.IsLocal.newBuilder().setPlan(plan).build()) + .build() + val response3 = handler.process(request3, spark) + assert(response3.hasIsLocal) + assert(!response3.getIsLocal.getIsLocal) + + val request4 = proto.AnalyzePlanRequest + .newBuilder() + .setIsStreaming(proto.AnalyzePlanRequest.IsStreaming.newBuilder().setPlan(plan).build()) + .build() + val response4 = handler.process(request4, spark) + assert(response4.hasIsStreaming) + assert(!response4.getIsStreaming.getIsStreaming) + + val request5 = proto.AnalyzePlanRequest + .newBuilder() + .setTreeString(proto.AnalyzePlanRequest.TreeString.newBuilder().setPlan(plan).build()) + .build() + val response5 = handler.process(request5, spark) + assert(response5.hasTreeString) + val treeString = response5.getTreeString.getTreeString + assert(treeString.contains("root")) + assert(treeString.contains("|-- col1: integer (nullable = true)")) + assert(treeString.contains("|-- col2: string (nullable = true)")) + + val request6 = proto.AnalyzePlanRequest + .newBuilder() + .setInputFiles(proto.AnalyzePlanRequest.InputFiles.newBuilder().setPlan(plan).build()) + .build() + val response6 = handler.process(request6, spark) + assert(response6.hasInputFiles) + assert(response6.getInputFiles.getFilesCount === 0) + } + } + + test("SPARK-41224: collect data using arrow") { + val instance = new SparkConnectService(false) + val connect = new MockRemoteSession() + val context = proto.UserContext + .newBuilder() + .setUserId("c1") + .build() + val plan = proto.Plan + .newBuilder() + .setRoot(connect.sql("select id, exp(id) as eid from range(0, 100, 1, 4)")) + .build() + val request = proto.ExecutePlanRequest + .newBuilder() + .setPlan(plan) + .setUserContext(context) + .build() + + // Execute plan. 
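+    // Responses are buffered by the observer below; 'done' flips once the stream completes.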
+ @volatile var done = false + val responses = mutable.Buffer.empty[proto.ExecutePlanResponse] + instance.executePlan( + request, + new StreamObserver[proto.ExecutePlanResponse] { + override def onNext(v: proto.ExecutePlanResponse): Unit = responses += v + + override def onError(throwable: Throwable): Unit = throw throwable + + override def onCompleted(): Unit = done = true + }) + + // The current implementation is expected to be blocking. This is here to make sure it is. + assert(done) + + // 4 Partitions + Metrics + assert(responses.size == 6) + + // Make sure the first response is schema only + val head = responses.head + assert(head.hasSchema && !head.hasArrowBatch && !head.hasMetrics) + + // Make sure the last response is metrics only + val last = responses.last + assert(last.hasMetrics && !last.hasSchema && !last.hasArrowBatch) + + val allocator = new RootAllocator() + + // Check the 'data' batches + var expectedId = 0L + var previousEId = 0.0d + responses.tail.dropRight(1).foreach { response => + assert(response.hasArrowBatch) + val batch = response.getArrowBatch + assert(batch.getData != null) + assert(batch.getRowCount == 25) + + val reader = new ArrowStreamReader(batch.getData.newInput(), allocator) + while (reader.loadNextBatch()) { + val root = reader.getVectorSchemaRoot + val idVector = root.getVector(0).asInstanceOf[BigIntVector] + val eidVector = root.getVector(1).asInstanceOf[Float8Vector] + val numRows = root.getRowCount + var i = 0 + while (i < numRows) { + assert(idVector.get(i) == expectedId) + expectedId += 1 + val eid = eidVector.get(i) + assert(eid > previousEId) + previousEId = eid + i += 1 + } + } + reader.close() + } + allocator.close() + } + + test("SPARK-41165: failures in the arrow collect path should not cause hangs") { + val instance = new SparkConnectService(false) + + // Add an always crashing UDF + val session = SparkConnectService.getOrCreateIsolatedSession("c1", "session").session + val instaKill: Long => Long = { _ => + throw new Exception("Kaboom") + } + session.udf.register("insta_kill", instaKill) + + val connect = new MockRemoteSession() + val context = proto.UserContext + .newBuilder() + .setUserId("c1") + .build() + val plan = proto.Plan + .newBuilder() + .setRoot(connect.sql("select insta_kill(id) from range(10)")) + .build() + val request = proto.ExecutePlanRequest + .newBuilder() + .setPlan(plan) + .setUserContext(context) + .setSessionId("session") + .build() + + // The observer is executed inside this thread. So + // we can perform the checks inside the observer. 
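+    // The failing UDF must surface through onError instead of leaving the stream hanging.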
+ instance.executePlan( + request, + new StreamObserver[proto.ExecutePlanResponse] { + override def onNext(v: proto.ExecutePlanResponse): Unit = { + fail("this should not receive responses") + } + + override def onError(throwable: Throwable): Unit = { + assert(throwable.isInstanceOf[StatusRuntimeException]) + } + + override def onCompleted(): Unit = { + fail("this should not complete") + } + }) + } + + test("Test explain mode in analyze response") { + withTable("test") { + spark.sql(""" + | CREATE TABLE test (col1 INT, col2 STRING) + | USING parquet + |""".stripMargin) + val relation = proto.Relation + .newBuilder() + .setProject( + proto.Project + .newBuilder() + .addExpressions( + proto.Expression + .newBuilder() + .setUnresolvedFunction( + proto.Expression.UnresolvedFunction + .newBuilder() + .setFunctionName("abs") + .addArguments(proto.Expression + .newBuilder() + .setLiteral(proto.Expression.Literal.newBuilder().setInteger(-1))))) + .setInput( + proto.Relation + .newBuilder() + .setRead(proto.Read + .newBuilder() + .setNamedTable( + proto.Read.NamedTable.newBuilder.setUnparsedIdentifier("test").build())))) + .build() + + val plan = proto.Plan.newBuilder().setRoot(relation).build() + + val handler = new SparkConnectAnalyzeHandler(null) + + val request = proto.AnalyzePlanRequest + .newBuilder() + .setExplain( + proto.AnalyzePlanRequest.Explain + .newBuilder() + .setPlan(plan) + .setExplainMode(proto.AnalyzePlanRequest.Explain.ExplainMode.EXPLAIN_MODE_EXTENDED) + .build()) + .build() + + val response = handler.process(request, spark) + + assert(response.getExplain.getExplainString.contains("Parsed Logical Plan")) + assert(response.getExplain.getExplainString.contains("Analyzed Logical Plan")) + assert(response.getExplain.getExplainString.contains("Optimized Logical Plan")) + assert(response.getExplain.getExplainString.contains("Physical Plan")) + } + } + + test("Test observe response") { + withTable("test") { + spark.sql(""" + | CREATE TABLE test (col1 INT, col2 STRING) + | USING parquet + |""".stripMargin) + + val instance = new SparkConnectService(false) + + val connect = new MockRemoteSession() + val context = proto.UserContext + .newBuilder() + .setUserId("c1") + .build() + val collectMetrics = proto.Relation + .newBuilder() + .setCollectMetrics( + proto.CollectMetrics + .newBuilder() + .setInput(connect.sql("select id, exp(id) as eid from range(0, 100, 1, 4)")) + .setName("my_metric") + .addAllMetrics(Seq( + proto_min("id".protoAttr).as("min_val"), + proto_max("id".protoAttr).as("max_val")).asJava)) + .build() + val plan = proto.Plan + .newBuilder() + .setRoot(collectMetrics) + .build() + val request = proto.ExecutePlanRequest + .newBuilder() + .setPlan(plan) + .setUserContext(context) + .build() + + // Execute plan. + @volatile var done = false + val responses = mutable.Buffer.empty[proto.ExecutePlanResponse] + instance.executePlan( + request, + new StreamObserver[proto.ExecutePlanResponse] { + override def onNext(v: proto.ExecutePlanResponse): Unit = responses += v + + override def onError(throwable: Throwable): Unit = throw throwable + + override def onCompleted(): Unit = done = true + }) + + // The current implementation is expected to be blocking. This is here to make sure it is. 
+ assert(done) + + assert(responses.size == 7) + + // Make sure the first response is schema only + val head = responses.head + assert(head.hasSchema && !head.hasArrowBatch && !head.hasMetrics) + + // Make sure the last response is observed metrics only + val last = responses.last + assert(last.getObservedMetricsCount == 1 && !last.hasSchema && !last.hasArrowBatch) + + val observedMetricsList = last.getObservedMetricsList.asScala + val observedMetric = observedMetricsList.head + assert(observedMetric.getName == "my_metric") + assert(observedMetric.getValuesCount == 2) + val valuesList = observedMetric.getValuesList.asScala + assert(valuesList.head.hasLong && valuesList.head.getLong == 0) + assert(valuesList.last.hasLong && valuesList.last.getLong == 99) + } + } +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala new file mode 100644 index 0000000000000..39fc90fd0022d --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/plugin/SparkConnectPluginRegistrySuite.scala @@ -0,0 +1,261 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.connect.plugin + +import com.google.protobuf + +import org.apache.spark.{SparkContext, SparkEnv, SparkException} +import org.apache.spark.connect.proto +import org.apache.spark.connect.proto.Relation +import org.apache.spark.sql.Dataset +import org.apache.spark.sql.catalyst.expressions.{Alias, Expression} +import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan +import org.apache.spark.sql.connect.common.InvalidPlanInput +import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.sql.connect.planner.{SparkConnectPlanner, SparkConnectPlanTest} +import org.apache.spark.sql.test.SharedSparkSession + +class DummyPlugin extends RelationPlugin { + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[LogicalPlan] = None +} + +class DummyExpressionPlugin extends ExpressionPlugin { + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[Expression] = None +} + +class DummyPluginNoTrivialCtor(id: Int) extends RelationPlugin { + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[LogicalPlan] = None +} + +class DummyPluginInstantiationError extends RelationPlugin { + + throw new ArrayIndexOutOfBoundsException("Bad Plugin Error") + + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[LogicalPlan] = None +} + +class ExampleRelationPlugin extends RelationPlugin { + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[LogicalPlan] = { + + if (!relation.is(classOf[proto.ExamplePluginRelation])) { + return None + } + val plugin = relation.unpack(classOf[proto.ExamplePluginRelation]) + Some(planner.transformRelation(plugin.getInput)) + } +} + +class ExampleExpressionPlugin extends ExpressionPlugin { + override def transform( + relation: protobuf.Any, + planner: SparkConnectPlanner): Option[Expression] = { + if (!relation.is(classOf[proto.ExamplePluginExpression])) { + return None + } + val exp = relation.unpack(classOf[proto.ExamplePluginExpression]) + Some( + Alias(planner.transformExpression(exp.getChild), exp.getCustomField)(explicitMetadata = + None)) + } +} + +class ExampleCommandPlugin extends CommandPlugin { + override def process(command: protobuf.Any, planner: SparkConnectPlanner): Option[Unit] = { + if (!command.is(classOf[proto.ExamplePluginCommand])) { + return None + } + val cmd = command.unpack(classOf[proto.ExamplePluginCommand]) + assert(planner.session != null) + SparkContext.getActive.get.setLocalProperty("testingProperty", cmd.getCustomField) + Some() + } +} + +class SparkConnectPluginRegistrySuite extends SharedSparkSession with SparkConnectPlanTest { + + override def beforeEach(): Unit = { + if (SparkEnv.get.conf.contains(Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES)) { + SparkEnv.get.conf.remove(Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES) + } + if (SparkEnv.get.conf.contains(Connect.CONNECT_EXTENSIONS_RELATION_CLASSES)) { + SparkEnv.get.conf.remove(Connect.CONNECT_EXTENSIONS_RELATION_CLASSES) + } + if (SparkEnv.get.conf.contains(Connect.CONNECT_EXTENSIONS_COMMAND_CLASSES)) { + SparkEnv.get.conf.remove(Connect.CONNECT_EXTENSIONS_COMMAND_CLASSES) + } + SparkConnectPluginRegistry.reset() + } + + def withSparkConf(pairs: (String, String)*)(f: => Unit): Unit = { + val conf = SparkEnv.get.conf + pairs.foreach { kv => conf.set(kv._1, kv._2) } + try f + finally { + pairs.foreach { kv => conf.remove(kv._1) } + } + } + + def buildRelation(): 
proto.Relation = { + val input = Relation + .newBuilder() + .setExtension( + protobuf.Any.pack( + proto.ExamplePluginRelation + .newBuilder() + .setInput( + proto.Relation + .newBuilder() + .setRange(proto.Range + .newBuilder() + .setStart(0) + .setEnd(10) + .setStep(1))) + .build())) + Relation + .newBuilder() + .setProject( + proto.Project + .newBuilder() + .addExpressions( + proto.Expression + .newBuilder() + .setExtension( + protobuf.Any.pack( + proto.ExamplePluginExpression + .newBuilder() + .setChild(proto.Expression + .newBuilder() + .setUnresolvedAttribute(proto.Expression.UnresolvedAttribute + .newBuilder() + .setUnparsedIdentifier("id"))) + .setCustomField("martin") + .build()))) + .setInput(input)) + .build() + } + + test("end to end with no extensions configured") { + assertThrows[InvalidPlanInput] { + transform(buildRelation()) + } + + } + + test("End to end Relation plugin test") { + withSparkConf( + Connect.CONNECT_EXTENSIONS_RELATION_CLASSES.key -> + "org.apache.spark.sql.connect.plugin.ExampleRelationPlugin", + Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES.key -> + "org.apache.spark.sql.connect.plugin.ExampleExpressionPlugin") { + val plan = transform(buildRelation()) + val ds = Dataset.ofRows(spark, plan) + val result = ds.collect() + assert(result.length == 10) + assert(result(0).schema.fieldNames(0) == "martin") + } + } + + test("End to end Command test") { + withSparkConf( + Connect.CONNECT_EXTENSIONS_COMMAND_CLASSES.key -> + "org.apache.spark.sql.connect.plugin.ExampleCommandPlugin") { + spark.sparkContext.setLocalProperty("testingProperty", "notset") + val plan = proto.Command + .newBuilder() + .setExtension( + protobuf.Any.pack( + proto.ExamplePluginCommand + .newBuilder() + .setCustomField("Martin") + .build())) + .build() + + new SparkConnectPlanner(spark).process(plan, "clientId", new MockObserver()) + assert(spark.sparkContext.getLocalProperty("testingProperty").equals("Martin")) + } + } + + test("Exception handling for plugin classes") { + withSparkConf( + Connect.CONNECT_EXTENSIONS_RELATION_CLASSES.key -> + "org.apache.spark.sql.connect.plugin.DummyPluginNoTrivialCtor") { + checkError( + exception = intercept[SparkException] { + SparkConnectPluginRegistry.loadRelationPlugins() + }, + errorClass = "CONNECT.PLUGIN_CTOR_MISSING", + parameters = Map("cls" -> "org.apache.spark.sql.connect.plugin.DummyPluginNoTrivialCtor")) + } + + withSparkConf( + Connect.CONNECT_EXTENSIONS_RELATION_CLASSES.key -> + "org.apache.spark.sql.connect.plugin.DummyPluginInstantiationError") { + checkError( + exception = intercept[SparkException] { + SparkConnectPluginRegistry.loadRelationPlugins() + }, + errorClass = "CONNECT.PLUGIN_RUNTIME_ERROR", + parameters = Map("msg" -> "Bad Plugin Error")) + } + } + + test("Emtpy registries are really empty and work") { + assert(SparkConnectPluginRegistry.loadRelationPlugins().isEmpty) + assert(SparkConnectPluginRegistry.loadExpressionPlugins().isEmpty) + assert(SparkConnectPluginRegistry.loadCommandPlugins().isEmpty) + } + + test("Building builders using factory methods") { + val x = SparkConnectPluginRegistry.relation[DummyPlugin](classOf[DummyPlugin]) + assert(x != null) + assert(x().isInstanceOf[RelationPlugin]) + val y = + SparkConnectPluginRegistry.expression[DummyExpressionPlugin](classOf[DummyExpressionPlugin]) + assert(y != null) + assert(y().isInstanceOf[ExpressionPlugin]) + } + + test("Configured class not found is properly thrown") { + withSparkConf( + Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES.key -> "this.class.does.not.exist") { + 
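+      // Instantiating plugins from a class name that cannot be resolved should fail eagerly.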
assertThrows[ClassNotFoundException] { + SparkConnectPluginRegistry.createConfiguredPlugins( + SparkEnv.get.conf.get(Connect.CONNECT_EXTENSIONS_EXPRESSION_CLASSES)) + } + } + + withSparkConf( + Connect.CONNECT_EXTENSIONS_RELATION_CLASSES.key -> "this.class.does.not.exist") { + assertThrows[ClassNotFoundException] { + SparkConnectPluginRegistry.createConfiguredPlugins( + SparkEnv.get.conf.get(Connect.CONNECT_EXTENSIONS_RELATION_CLASSES)) + } + } + } + +} diff --git a/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/InterceptorRegistrySuite.scala b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/InterceptorRegistrySuite.scala new file mode 100644 index 0000000000000..7f85966f0a7b6 --- /dev/null +++ b/connector/connect/server/src/test/scala/org/apache/spark/sql/connect/service/InterceptorRegistrySuite.scala @@ -0,0 +1,187 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.connect.service + +import io.grpc.{Metadata, ServerCall, ServerCallHandler, ServerInterceptor} +import io.grpc.ForwardingServerCallListener.SimpleForwardingServerCallListener +import io.grpc.netty.NettyServerBuilder + +import org.apache.spark.{SparkEnv, SparkException} +import org.apache.spark.sql.connect.config.Connect +import org.apache.spark.sql.test.SharedSparkSession + +/** + * Used for testing only, does not do anything. + */ +class DummyInterceptor extends ServerInterceptor { + override def interceptCall[ReqT, RespT]( + call: ServerCall[ReqT, RespT], + headers: Metadata, + next: ServerCallHandler[ReqT, RespT]): ServerCall.Listener[ReqT] = { + val listener = next.startCall(call, headers) + new SimpleForwardingServerCallListener[ReqT](listener) { + override def onMessage(message: ReqT): Unit = { + delegate().onMessage(message) + } + } + } +} + +/** + * Used for testing only. + */ +class TestingInterceptorNoTrivialCtor(id: Int) extends ServerInterceptor { + override def interceptCall[ReqT, RespT]( + call: ServerCall[ReqT, RespT], + headers: Metadata, + next: ServerCallHandler[ReqT, RespT]): ServerCall.Listener[ReqT] = { + val listener = next.startCall(call, headers) + new SimpleForwardingServerCallListener[ReqT](listener) { + override def onMessage(message: ReqT): Unit = { + delegate().onMessage(message) + } + } + } +} + +/** + * Used for testing only. 
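+ * Simulates an interceptor whose constructor fails, so that the registry's handling of
+ * instantiation errors can be exercised.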
+ */ +class TestingInterceptorInstantiationError extends ServerInterceptor { + throw new ArrayIndexOutOfBoundsException("Bad Error") + + override def interceptCall[ReqT, RespT]( + call: ServerCall[ReqT, RespT], + headers: Metadata, + next: ServerCallHandler[ReqT, RespT]): ServerCall.Listener[ReqT] = { + val listener = next.startCall(call, headers) + new SimpleForwardingServerCallListener[ReqT](listener) { + override def onMessage(message: ReqT): Unit = { + delegate().onMessage(message) + } + } + } +} + +class InterceptorRegistrySuite extends SharedSparkSession { + + override def beforeEach(): Unit = { + if (SparkEnv.get.conf.contains(Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES)) { + SparkEnv.get.conf.remove(Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES) + } + } + + def withSparkConf(pairs: (String, String)*)(f: => Unit): Unit = { + val conf = SparkEnv.get.conf + pairs.foreach { kv => conf.set(kv._1, kv._2) } + try f + finally { + pairs.foreach { kv => conf.remove(kv._1) } + } + } + + test("Check that the empty registry works") { + val sb = NettyServerBuilder.forPort(9999) + SparkConnectInterceptorRegistry.chainInterceptors(sb) + } + + test("Test server builder and configured interceptor") { + withSparkConf( + Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> + "org.apache.spark.sql.connect.service.DummyInterceptor") { + val sb = NettyServerBuilder.forPort(9999) + SparkConnectInterceptorRegistry.chainInterceptors(sb) + } + } + + test("Test server build throws when using bad configured interceptor") { + withSparkConf( + Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> + "org.apache.spark.sql.connect.service.TestingInterceptorNoTrivialCtor") { + val sb = NettyServerBuilder.forPort(9999) + checkError( + exception = intercept[SparkException] { + SparkConnectInterceptorRegistry.chainInterceptors(sb) + }, + errorClass = "CONNECT.INTERCEPTOR_CTOR_MISSING", + parameters = + Map("cls" -> "org.apache.spark.sql.connect.service.TestingInterceptorNoTrivialCtor")) + } + } + + test("Exception handling for interceptor classes") { + withSparkConf( + Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> + "org.apache.spark.sql.connect.service.TestingInterceptorNoTrivialCtor") { + checkError( + exception = intercept[SparkException] { + SparkConnectInterceptorRegistry.createConfiguredInterceptors + }, + errorClass = "CONNECT.INTERCEPTOR_CTOR_MISSING", + parameters = + Map("cls" -> "org.apache.spark.sql.connect.service.TestingInterceptorNoTrivialCtor")) + } + + withSparkConf( + Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> + "org.apache.spark.sql.connect.service.TestingInterceptorInstantiationError") { + checkError( + exception = intercept[SparkException] { + SparkConnectInterceptorRegistry.createConfiguredInterceptors + }, + errorClass = "CONNECT.INTERCEPTOR_RUNTIME_ERROR", + parameters = Map("msg" -> "Bad Error")) + } + } + + test("No configured interceptors returns empty list") { + // Not set. 
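+ // beforeEach removes CONNECT_GRPC_INTERCEPTOR_CLASSES, so with nothing configured the
+ // registry should not produce any interceptors.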
+ assert(SparkConnectInterceptorRegistry.createConfiguredInterceptors.isEmpty) + // Set to empty string + withSparkConf(Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> "") { + assert(SparkConnectInterceptorRegistry.createConfiguredInterceptors.isEmpty) + } + } + + test("Configured classes can have multiple entries") { + withSparkConf( + Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> + (" org.apache.spark.sql.connect.service.DummyInterceptor," + + " org.apache.spark.sql.connect.service.DummyInterceptor ")) { + assert(SparkConnectInterceptorRegistry.createConfiguredInterceptors.size == 2) + } + } + + test("Configured class not found is properly thrown") { + withSparkConf(Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> "this.class.does.not.exist") { + assertThrows[ClassNotFoundException] { + SparkConnectInterceptorRegistry.createConfiguredInterceptors + } + } + } + + test("LoggingInterceptor initializes when configured in spark conf") { + withSparkConf( + Connect.CONNECT_GRPC_INTERCEPTOR_CLASSES.key -> + "org.apache.spark.sql.connect.service.LoggingInterceptor") { + val interceptors = SparkConnectInterceptorRegistry.createConfiguredInterceptors() + assert(interceptors.size == 1) + assert(interceptors.head.isInstanceOf[LoggingInterceptor]) + } + } +} diff --git a/connector/docker-integration-tests/pom.xml b/connector/docker-integration-tests/pom.xml new file mode 100644 index 0000000000000..99d21a4fb39fe --- /dev/null +++ b/connector/docker-integration-tests/pom.xml @@ -0,0 +1,171 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + spark-docker-integration-tests_2.12 + jar + Spark Project Docker Integration Tests + https://spark.apache.org/ + + docker-integration-tests + + + + + db + https://app.camunda.com/nexus/content/repositories/public/ + + true + warn + + + + + + + com.spotify + docker-client + test + shaded + + + org.apache.httpcomponents + httpclient + test + + + org.apache.httpcomponents + httpcore + test + + + + com.google.guava + guava + 18.0 + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + + org.apache.hadoop + hadoop-minikdc + test + + + + org.glassfish.jersey.bundles.repackaged + jersey-guava + 2.25.1 + test + + + org.mariadb.jdbc + mariadb-java-client + test + + + org.postgresql + postgresql + test + + + com.oracle.database.jdbc + ojdbc8 + test + + + + + com.ibm.db2 + jcc + test + + + com.microsoft.sqlserver + mssql-jdbc + test + + + com.mysql + mysql-connector-j + test + + + diff --git a/external/docker-integration-tests/src/test/resources/db2_krb_setup.sh b/connector/docker-integration-tests/src/test/resources/db2_krb_setup.sh similarity index 100% rename from external/docker-integration-tests/src/test/resources/db2_krb_setup.sh rename to connector/docker-integration-tests/src/test/resources/db2_krb_setup.sh diff --git a/external/docker-integration-tests/src/test/resources/log4j2.properties b/connector/docker-integration-tests/src/test/resources/log4j2.properties similarity index 100% rename from 
external/docker-integration-tests/src/test/resources/log4j2.properties rename to connector/docker-integration-tests/src/test/resources/log4j2.properties diff --git a/external/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh b/connector/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh similarity index 100% rename from external/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh rename to connector/docker-integration-tests/src/test/resources/mariadb_docker_entrypoint.sh diff --git a/external/docker-integration-tests/src/test/resources/mariadb_krb_setup.sh b/connector/docker-integration-tests/src/test/resources/mariadb_krb_setup.sh similarity index 100% rename from external/docker-integration-tests/src/test/resources/mariadb_krb_setup.sh rename to connector/docker-integration-tests/src/test/resources/mariadb_krb_setup.sh diff --git a/external/docker-integration-tests/src/test/resources/postgres_krb_setup.sh b/connector/docker-integration-tests/src/test/resources/postgres_krb_setup.sh similarity index 100% rename from external/docker-integration-tests/src/test/resources/postgres_krb_setup.sh rename to connector/docker-integration-tests/src/test/resources/postgres_krb_setup.sh diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala new file mode 100644 index 0000000000000..e4251512e432e --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2IntegrationSuite.scala @@ -0,0 +1,241 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc + +import java.math.BigDecimal +import java.sql.{Connection, Date, Timestamp} +import java.util.Properties + +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.sql.{Row, SaveMode} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ +import org.apache.spark.sql.types.{BooleanType, ByteType, ShortType, StructType} +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., ibmcom/db2:11.5.6.0a): + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 DB2_DOCKER_IMAGE_NAME=ibmcom/db2:11.5.6.0a + * ./build/sbt -Pdocker-integration-tests + * "testOnly org.apache.spark.sql.jdbc.DB2IntegrationSuite" + * }}} + */ +@DockerTest +class DB2IntegrationSuite extends DockerJDBCIntegrationSuite { + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("DB2_DOCKER_IMAGE_NAME", "ibmcom/db2:11.5.6.0a") + override val env = Map( + "DB2INST1_PASSWORD" -> "rootpass", + "LICENSE" -> "accept", + "DBNAME" -> "foo", + "ARCHIVE_LOGS" -> "false", + "AUTOCONFIG" -> "false" + ) + override val usesIpc = false + override val jdbcPort: Int = 50000 + override val privileged = true + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:db2://$ip:$port/foo:user=db2inst1;password=rootpass;retrieveMessagesFromServerOnGetMessage=true;" //scalastyle:ignore + } + + override val connectionTimeout = timeout(3.minutes) + + override def dataPreparation(conn: Connection): Unit = { + conn.prepareStatement("CREATE TABLE tbl (x INTEGER, y VARCHAR(8))").executeUpdate() + conn.prepareStatement("INSERT INTO tbl VALUES (42,'fred')").executeUpdate() + conn.prepareStatement("INSERT INTO tbl VALUES (17,'dave')").executeUpdate() + + conn.prepareStatement("CREATE TABLE numbers ( small SMALLINT, med INTEGER, big BIGINT, " + + "deci DECIMAL(31,20), flt FLOAT, dbl DOUBLE, real REAL, " + + "decflt DECFLOAT, decflt16 DECFLOAT(16), decflt34 DECFLOAT(34))").executeUpdate() + conn.prepareStatement("INSERT INTO numbers VALUES (17, 77777, 922337203685477580, " + + "123456745.56789012345000000000, 42.75, 5.4E-70, " + + "3.4028234663852886e+38, 4.2999, DECFLOAT('9.999999999999999E19', 16), " + + "DECFLOAT('1234567891234567.123456789123456789', 34))").executeUpdate() + + conn.prepareStatement("CREATE TABLE dates (d DATE, t TIME, ts TIMESTAMP )").executeUpdate() + conn.prepareStatement("INSERT INTO dates VALUES ('1991-11-09', '13:31:24', " + + "'2009-02-13 23:31:30')").executeUpdate() + + // TODO: Test locale conversion for strings. 
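+ // The table below mixes fixed-width (CHAR), variable-width (VARCHAR), LOB (CLOB, BLOB) and
+ // XML columns; the "String types" test further down checks how each surfaces in Spark.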
+ conn.prepareStatement("CREATE TABLE strings (a CHAR(10), b VARCHAR(10), c CLOB, d BLOB, e XML)") + .executeUpdate() + conn.prepareStatement("INSERT INTO strings VALUES ('the', 'quick', 'brown', BLOB('fox')," + + "'Kathy')").executeUpdate() + } + + test("Basic test") { + val df = sqlContext.read.jdbc(jdbcUrl, "tbl", new Properties) + val rows = df.collect() + assert(rows.length == 2) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 2) + assert(types(0).equals("class java.lang.Integer")) + assert(types(1).equals("class java.lang.String")) + } + + test("Numeric types") { + val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 10) + assert(types(0).equals("class java.lang.Integer")) + assert(types(1).equals("class java.lang.Integer")) + assert(types(2).equals("class java.lang.Long")) + assert(types(3).equals("class java.math.BigDecimal")) + assert(types(4).equals("class java.lang.Double")) + assert(types(5).equals("class java.lang.Double")) + assert(types(6).equals("class java.lang.Float")) + assert(types(7).equals("class java.math.BigDecimal")) + assert(types(8).equals("class java.math.BigDecimal")) + assert(types(9).equals("class java.math.BigDecimal")) + assert(rows(0).getInt(0) == 17) + assert(rows(0).getInt(1) == 77777) + assert(rows(0).getLong(2) == 922337203685477580L) + val bd = new BigDecimal("123456745.56789012345000000000") + assert(rows(0).getAs[BigDecimal](3).equals(bd)) + assert(rows(0).getDouble(4) == 42.75) + assert(rows(0).getDouble(5) == 5.4E-70) + assert(rows(0).getFloat(6) == 3.4028234663852886e+38) + assert(rows(0).getDecimal(7) == new BigDecimal("4.299900000000000000")) + assert(rows(0).getDecimal(8) == new BigDecimal("99999999999999990000.000000000000000000")) + assert(rows(0).getDecimal(9) == new BigDecimal("1234567891234567.123456789123456789")) + } + + test("Date types") { + withDefaultTimeZone(UTC) { + val df = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 3) + assert(types(0).equals("class java.sql.Date")) + assert(types(1).equals("class java.sql.Timestamp")) + assert(types(2).equals("class java.sql.Timestamp")) + assert(rows(0).getAs[Date](0).equals(Date.valueOf("1991-11-09"))) + assert(rows(0).getAs[Timestamp](1).equals(Timestamp.valueOf("1970-01-01 13:31:24"))) + assert(rows(0).getAs[Timestamp](2).equals(Timestamp.valueOf("2009-02-13 23:31:30"))) + } + } + + test("String types") { + val df = sqlContext.read.jdbc(jdbcUrl, "strings", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 5) + assert(types(0).equals("class java.lang.String")) + assert(types(1).equals("class java.lang.String")) + assert(types(2).equals("class java.lang.String")) + assert(types(3).equals("class [B")) + assert(rows(0).getString(0).equals("the ")) + assert(rows(0).getString(1).equals("quick")) + assert(rows(0).getString(2).equals("brown")) + assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](3), Array[Byte](102, 111, 120))) + assert(rows(0).getString(4).equals("""Kathy""")) + } + + test("Basic write test") { + // cast decflt column with precision value of 38 to DB2 max decimal precision value of 31. 
+ val df1 = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties) + .selectExpr("small", "med", "big", "deci", "flt", "dbl", "real", + "cast(decflt as decimal(31, 5)) as decflt") + val df2 = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties) + val df3 = sqlContext.read.jdbc(jdbcUrl, "strings", new Properties) + df1.write.jdbc(jdbcUrl, "numberscopy", new Properties) + df2.write.jdbc(jdbcUrl, "datescopy", new Properties) + df3.write.jdbc(jdbcUrl, "stringscopy", new Properties) + // spark types that does not have exact matching db2 table types. + val df4 = sqlContext.createDataFrame( + sparkContext.parallelize(Seq(Row("1".toShort, "20".toByte, true))), + new StructType().add("c1", ShortType).add("b", ByteType).add("c3", BooleanType)) + df4.write.jdbc(jdbcUrl, "otherscopy", new Properties) + val rows = sqlContext.read.jdbc(jdbcUrl, "otherscopy", new Properties).collect() + assert(rows(0).getInt(0) == 1) + assert(rows(0).getInt(1) == 20) + assert(rows(0).getString(2) == "1") + } + + test("query JDBC option") { + val expectedResult = Set( + (42, "fred"), + (17, "dave") + ).map { case (x, y) => + Row(Integer.valueOf(x), String.valueOf(y)) + } + + val query = "SELECT x, y FROM tbl WHERE x > 10" + // query option to pass on the query string. + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("query", query) + .load() + assert(df.collect.toSet === expectedResult) + + // query option in the create table path. + sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$jdbcUrl', query '$query') + """.stripMargin.replaceAll("\n", " ")) + assert(sql("select x, y from queryOption").collect.toSet == expectedResult) + } + + test("SPARK-30062") { + val expectedResult = Set( + (42, "fred"), + (17, "dave") + ).map { case (x, y) => + Row(Integer.valueOf(x), String.valueOf(y)) + } + val df = sqlContext.read.jdbc(jdbcUrl, "tbl", new Properties) + for (_ <- 0 to 2) { + df.write.mode(SaveMode.Append).jdbc(jdbcUrl, "tblcopy", new Properties) + } + assert(sqlContext.read.jdbc(jdbcUrl, "tblcopy", new Properties).count === 6) + df.write.mode(SaveMode.Overwrite).option("truncate", true) + .jdbc(jdbcUrl, "tblcopy", new Properties) + val actual = sqlContext.read.jdbc(jdbcUrl, "tblcopy", new Properties).collect + assert(actual.length === 2) + assert(actual.toSet === expectedResult) + } + + test("SPARK-42534: DB2 Limit pushdown test") { + val actual = sqlContext.read + .format("jdbc") + .option("url", jdbcUrl) + .option("dbtable", "tbl") + .load() + .limit(2) + .select("x", "y") + .orderBy("x") + .collect() + + val expected = sqlContext.read + .format("jdbc") + .option("url", jdbcUrl) + .option("query", "SELECT x, y FROM tbl ORDER BY x FETCH FIRST 2 ROWS ONLY") + .load() + .collect() + + assert(actual === expected) + } +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DB2KrbIntegrationSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala similarity index 100% 
rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerIntegrationFunSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerJDBCIntegrationSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerKrbJDBCIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerKrbJDBCIntegrationSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerKrbJDBCIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/DockerKrbJDBCIntegrationSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MariaDBKrbIntegrationSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala similarity index 91% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala index e293f9a8f7ba9..a4e2dba534380 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MsSqlServerIntegrationSuite.scala @@ -21,6 +21,7 @@ import java.math.BigDecimal import java.sql.{Connection, Date, Timestamp} import java.util.Properties +import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ import org.apache.spark.sql.functions.col import org.apache.spark.sql.internal.SQLConf @@ -374,4 +375,58 @@ class MsSqlServerIntegrationSuite extends DockerJDBCIntegrationSuite { val filtered = df.where(col("c") === 0).collect() assert(filtered.length == 0) } + + test("SPARK-37259: prepareQuery and query JDBC options") { + val expectedResult = Set( + (42, "fred"), + (17, "dave") + ).map { case (x, y) => + Row(Integer.valueOf(x), String.valueOf(y)) + } + + val prepareQuery = "WITH t AS (SELECT x, y FROM tbl)" + val query = "SELECT * FROM t WHERE x > 10" + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("prepareQuery", prepareQuery) + .option("query", query) + .load() + assert(df.collect.toSet === expectedResult) + } + + test("SPARK-37259: prepareQuery and dbtable JDBC options") { + val expectedResult = Set( + 
(42, "fred"), + (17, "dave") + ).map { case (x, y) => + Row(Integer.valueOf(x), String.valueOf(y)) + } + + val prepareQuery = "WITH t AS (SELECT x, y FROM tbl WHERE x > 10)" + val dbtable = "t" + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("prepareQuery", prepareQuery) + .option("dbtable", dbtable) + .load() + assert(df.collect.toSet === expectedResult) + } + + test("SPARK-37259: temp table prepareQuery and query JDBC options") { + val expectedResult = Set( + (42, "fred"), + (17, "dave") + ).map { case (x, y) => + Row(Integer.valueOf(x), String.valueOf(y)) + } + + val prepareQuery = "(SELECT * INTO #TempTable FROM (SELECT * FROM tbl) t)" + val query = "SELECT * FROM #TempTable" + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("prepareQuery", prepareQuery) + .option("query", query) + .load() + assert(df.collect.toSet === expectedResult) + } } diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala new file mode 100644 index 0000000000000..bc202b1b8323e --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/MySQLIntegrationSuite.scala @@ -0,0 +1,195 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc + +import java.math.BigDecimal +import java.sql.{Connection, Date, Timestamp} +import java.util.Properties + +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., mysql:8.0.31): + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 MYSQL_DOCKER_IMAGE_NAME=mysql:8.0.31 + * ./build/sbt -Pdocker-integration-tests + * "testOnly org.apache.spark.sql.jdbc.MySQLIntegrationSuite" + * }}} + */ +@DockerTest +class MySQLIntegrationSuite extends DockerJDBCIntegrationSuite { + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:8.0.31") + override val env = Map( + "MYSQL_ROOT_PASSWORD" -> "rootpass" + ) + override val usesIpc = false + override val jdbcPort: Int = 3306 + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:mysql://$ip:$port/mysql?user=root&password=rootpass" + } + + override def dataPreparation(conn: Connection): Unit = { + // Since MySQL 5.7.14+, we need to disable strict mode + conn.prepareStatement("SET GLOBAL sql_mode = ''").executeUpdate() + conn.prepareStatement("CREATE DATABASE foo").executeUpdate() + conn.prepareStatement("CREATE TABLE tbl (x INTEGER, y TEXT(8))").executeUpdate() + conn.prepareStatement("INSERT INTO tbl VALUES (42,'fred')").executeUpdate() + conn.prepareStatement("INSERT INTO tbl VALUES (17,'dave')").executeUpdate() + + conn.prepareStatement("CREATE TABLE numbers (onebit BIT(1), tenbits BIT(10), " + + "small SMALLINT, med MEDIUMINT, nor INT, big BIGINT, deci DECIMAL(40,20), flt FLOAT, " + + "dbl DOUBLE)").executeUpdate() + conn.prepareStatement("INSERT INTO numbers VALUES (b'0', b'1000100101', " + + "17, 77777, 123456789, 123456789012345, 123456789012345.123456789012345, " + + "42.75, 1.0000000000000002)").executeUpdate() + + conn.prepareStatement("CREATE TABLE dates (d DATE, t TIME, dt DATETIME, ts TIMESTAMP, " + + "yr YEAR)").executeUpdate() + conn.prepareStatement("INSERT INTO dates VALUES ('1991-11-09', '13:31:24', " + + "'1996-01-01 01:23:45', '2009-02-13 23:31:30', '2001')").executeUpdate() + + // TODO: Test locale conversion for strings. 
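+ // The TEXT family below is expected to surface as StringType, while BINARY, VARBINARY and
+ // BLOB surface as byte arrays; see the "String types" test further down.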
+ conn.prepareStatement("CREATE TABLE strings (a CHAR(10), b VARCHAR(10), c TINYTEXT, " + + "d TEXT, e MEDIUMTEXT, f LONGTEXT, g BINARY(4), h VARBINARY(10), i BLOB)" + ).executeUpdate() + conn.prepareStatement("INSERT INTO strings VALUES ('the', 'quick', 'brown', 'fox', " + + "'jumps', 'over', 'the', 'lazy', 'dog')").executeUpdate() + } + + test("Basic test") { + val df = sqlContext.read.jdbc(jdbcUrl, "tbl", new Properties) + val rows = df.collect() + assert(rows.length == 2) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 2) + assert(types(0).equals("class java.lang.Integer")) + assert(types(1).equals("class java.lang.String")) + } + + test("Numeric types") { + val df = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 9) + assert(types(0).equals("class java.lang.Boolean")) + assert(types(1).equals("class java.lang.Long")) + assert(types(2).equals("class java.lang.Integer")) + assert(types(3).equals("class java.lang.Integer")) + assert(types(4).equals("class java.lang.Integer")) + assert(types(5).equals("class java.lang.Long")) + assert(types(6).equals("class java.math.BigDecimal")) + assert(types(7).equals("class java.lang.Double")) + assert(types(8).equals("class java.lang.Double")) + assert(rows(0).getBoolean(0) == false) + assert(rows(0).getLong(1) == 0x225) + assert(rows(0).getInt(2) == 17) + assert(rows(0).getInt(3) == 77777) + assert(rows(0).getInt(4) == 123456789) + assert(rows(0).getLong(5) == 123456789012345L) + val bd = new BigDecimal("123456789012345.12345678901234500000") + assert(rows(0).getAs[BigDecimal](6).equals(bd)) + assert(rows(0).getDouble(7) == 42.75) + assert(rows(0).getDouble(8) == 1.0000000000000002) + } + + test("Date types") { + withDefaultTimeZone(UTC) { + val df = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 5) + assert(types(0).equals("class java.sql.Date")) + assert(types(1).equals("class java.sql.Timestamp")) + assert(types(2).equals("class java.sql.Timestamp")) + assert(types(3).equals("class java.sql.Timestamp")) + assert(types(4).equals("class java.sql.Date")) + assert(rows(0).getAs[Date](0).equals(Date.valueOf("1991-11-09"))) + assert( + rows(0).getAs[Timestamp](1) === Timestamp.valueOf("1970-01-01 13:31:24")) + assert(rows(0).getAs[Timestamp](2).equals(Timestamp.valueOf("1996-01-01 01:23:45"))) + assert(rows(0).getAs[Timestamp](3).equals(Timestamp.valueOf("2009-02-13 23:31:30"))) + assert(rows(0).getAs[Date](4).equals(Date.valueOf("2001-01-01"))) + } + } + + test("String types") { + val df = sqlContext.read.jdbc(jdbcUrl, "strings", new Properties) + val rows = df.collect() + assert(rows.length == 1) + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types.length == 9) + assert(types(0).equals("class java.lang.String")) + assert(types(1).equals("class java.lang.String")) + assert(types(2).equals("class java.lang.String")) + assert(types(3).equals("class java.lang.String")) + assert(types(4).equals("class java.lang.String")) + assert(types(5).equals("class java.lang.String")) + assert(types(6).equals("class [B")) + assert(types(7).equals("class [B")) + assert(types(8).equals("class [B")) + assert(rows(0).getString(0).equals("the")) + assert(rows(0).getString(1).equals("quick")) + 
assert(rows(0).getString(2).equals("brown")) + assert(rows(0).getString(3).equals("fox")) + assert(rows(0).getString(4).equals("jumps")) + assert(rows(0).getString(5).equals("over")) + assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](6), Array[Byte](116, 104, 101, 0))) + assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](7), Array[Byte](108, 97, 122, 121))) + assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](8), Array[Byte](100, 111, 103))) + } + + test("Basic write test") { + val df1 = sqlContext.read.jdbc(jdbcUrl, "numbers", new Properties) + val df2 = sqlContext.read.jdbc(jdbcUrl, "dates", new Properties) + val df3 = sqlContext.read.jdbc(jdbcUrl, "strings", new Properties) + df1.write.jdbc(jdbcUrl, "numberscopy", new Properties) + df2.write.jdbc(jdbcUrl, "datescopy", new Properties) + df3.write.jdbc(jdbcUrl, "stringscopy", new Properties) + } + + test("query JDBC option") { + val expectedResult = Set( + (42, "fred"), + (17, "dave") + ).map { case (x, y) => + Row(Integer.valueOf(x), String.valueOf(y)) + } + + val query = "SELECT x, y FROM tbl WHERE x > 10" + // query option to pass on the query string. + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("query", query) + .load() + assert(df.collect.toSet === expectedResult) + + // query option in the create table path. + sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$jdbcUrl', query '$query') + """.stripMargin.replaceAll("\n", " ")) + assert(sql("select x, y from queryOption").collect.toSet == expectedResult) + } +} diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala new file mode 100644 index 0000000000000..a9c57e5d38d43 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/OracleIntegrationSuite.scala @@ -0,0 +1,521 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc + +import java.math.BigDecimal +import java.sql.{Connection, Date, Timestamp} +import java.util.{Properties, TimeZone} + +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.sql.{Row, SaveMode} +import org.apache.spark.sql.catalyst.util.DateTimeTestUtils._ +import org.apache.spark.sql.execution.{RowDataSourceScanExec, WholeStageCodegenExec} +import org.apache.spark.sql.execution.datasources.LogicalRelation +import org.apache.spark.sql.execution.datasources.jdbc.{JDBCPartition, JDBCRelation} +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.tags.DockerTest + +/** + * The following are the steps to test this: + * + * 1. Choose to use a prebuilt image or build Oracle database in a container + * - The documentation on how to build Oracle RDBMS in a container is at + * https://github.com/oracle/docker-images/blob/master/OracleDatabase/SingleInstance/README.md + * - Official Oracle container images can be found at https://container-registry.oracle.com + * - A trustable and streamlined Oracle XE database image can be found on Docker Hub at + * https://hub.docker.com/r/gvenzl/oracle-xe see also https://github.com/gvenzl/oci-oracle-xe + * 2. Run: export ORACLE_DOCKER_IMAGE_NAME=image_you_want_to_use_for_testing + * - Example: export ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-xe:latest + * 3. Run: export ENABLE_DOCKER_INTEGRATION_TESTS=1 + * 4. Start docker: sudo service docker start + * - Optionally, docker pull $ORACLE_DOCKER_IMAGE_NAME + * 5. Run Spark integration tests for Oracle with: ./build/sbt -Pdocker-integration-tests + * "testOnly org.apache.spark.sql.jdbc.OracleIntegrationSuite" + * + * A sequence of commands to build the Oracle XE database container image: + * $ git clone https://github.com/oracle/docker-images.git + * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles + * $ ./buildContainerImage.sh -v 21.3.0 -x + * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:21.3.0-xe + * + * This procedure has been validated with Oracle 18.4.0 and 21.3.0 Express Edition. + */ +@DockerTest +class OracleIntegrationSuite extends DockerJDBCIntegrationSuite with SharedSparkSession { + import testImplicits._ + + override val db = new DatabaseOnDocker { + lazy override val imageName = + sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-xe:21.3.0") + val oracle_password = "Th1s1sThe0racle#Pass" + override val env = Map( + "ORACLE_PWD" -> oracle_password, // oracle images uses this + "ORACLE_PASSWORD" -> oracle_password // gvenzl/oracle-xe uses this + ) + override val usesIpc = false + override val jdbcPort: Int = 1521 + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:oracle:thin:system/$oracle_password@//$ip:$port/xe" + } + + override val connectionTimeout = timeout(7.minutes) + + override def dataPreparation(conn: Connection): Unit = { + // In 18.4.0 Express Edition auto commit is enabled by default. 
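+ // Disabling it here is presumably to keep the explicit commit() calls below meaningful
+ // regardless of the image version.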
+ conn.setAutoCommit(false) + conn.prepareStatement("CREATE TABLE datetime (id NUMBER(10), d DATE, t TIMESTAMP)") + .executeUpdate() + conn.prepareStatement( + """INSERT INTO datetime VALUES + |(1, {d '1991-11-09'}, {ts '1996-01-01 01:23:45'}) + """.stripMargin.replaceAll("\n", " ")).executeUpdate() + conn.commit() + + conn.prepareStatement( + "CREATE TABLE ts_with_timezone (id NUMBER(10), t TIMESTAMP WITH TIME ZONE)").executeUpdate() + conn.prepareStatement( + "INSERT INTO ts_with_timezone VALUES " + + "(1, to_timestamp_tz('1999-12-01 11:00:00 UTC','YYYY-MM-DD HH:MI:SS TZR'))").executeUpdate() + conn.prepareStatement( + "INSERT INTO ts_with_timezone VALUES " + + "(2, to_timestamp_tz('1999-12-01 12:00:00 PST','YYYY-MM-DD HH:MI:SS TZR'))").executeUpdate() + conn.commit() + + conn.prepareStatement( + "CREATE TABLE tableWithCustomSchema (id NUMBER, n1 NUMBER(1), n2 NUMBER(1))").executeUpdate() + conn.prepareStatement( + "INSERT INTO tableWithCustomSchema values(12312321321321312312312312123, 1, 0)") + .executeUpdate() + conn.commit() + + sql( + s""" + |CREATE TEMPORARY VIEW datetime + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$jdbcUrl', dbTable 'datetime', oracle.jdbc.mapDateToTimestamp 'false') + """.stripMargin.replaceAll("\n", " ")) + + conn.prepareStatement("CREATE TABLE datetime1 (id NUMBER(10), d DATE, t TIMESTAMP)") + .executeUpdate() + conn.commit() + + sql( + s""" + |CREATE TEMPORARY VIEW datetime1 + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$jdbcUrl', dbTable 'datetime1', oracle.jdbc.mapDateToTimestamp 'false') + """.stripMargin.replaceAll("\n", " ")) + + + conn.prepareStatement("CREATE TABLE numerics (b DECIMAL(1), f DECIMAL(3, 2), i DECIMAL(10))") + .executeUpdate() + conn.prepareStatement( + "INSERT INTO numerics VALUES (4, 1.23, 9999999999)").executeUpdate() + conn.commit() + + conn.prepareStatement("CREATE TABLE oracle_types (d BINARY_DOUBLE, f BINARY_FLOAT)") + .executeUpdate() + conn.commit() + + conn.prepareStatement("CREATE TABLE datetimePartitionTest (id NUMBER(10), d DATE, t TIMESTAMP)") + .executeUpdate() + conn.prepareStatement( + """INSERT INTO datetimePartitionTest VALUES + |(1, {d '2018-07-06'}, {ts '2018-07-06 05:50:00'}) + """.stripMargin.replaceAll("\n", " ")).executeUpdate() + conn.prepareStatement( + """INSERT INTO datetimePartitionTest VALUES + |(2, {d '2018-07-06'}, {ts '2018-07-06 08:10:08'}) + """.stripMargin.replaceAll("\n", " ")).executeUpdate() + conn.prepareStatement( + """INSERT INTO datetimePartitionTest VALUES + |(3, {d '2018-07-08'}, {ts '2018-07-08 13:32:01'}) + """.stripMargin.replaceAll("\n", " ")).executeUpdate() + conn.prepareStatement( + """INSERT INTO datetimePartitionTest VALUES + |(4, {d '2018-07-12'}, {ts '2018-07-12 09:51:15'}) + """.stripMargin.replaceAll("\n", " ")).executeUpdate() + conn.commit() + } + + test("SPARK-16625 : Importing Oracle numeric types") { + val df = sqlContext.read.jdbc(jdbcUrl, "numerics", new Properties) + val rows = df.collect() + assert(rows.size == 1) + val row = rows(0) + // The main point of the below assertions is not to make sure that these Oracle types are + // mapped to decimal types, but to make sure that the returned values are correct. 
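+ // Each column is read back as java.math.BigDecimal, which is why compareTo is used below.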
+ // A value > 1 from DECIMAL(1) is correct: + assert(row.getDecimal(0).compareTo(BigDecimal.valueOf(4)) == 0) + // A value with fractions from DECIMAL(3, 2) is correct: + assert(row.getDecimal(1).compareTo(BigDecimal.valueOf(1.23)) == 0) + // A value > Int.MaxValue from DECIMAL(10) is correct: + assert(row.getDecimal(2).compareTo(BigDecimal.valueOf(9999999999L)) == 0) + } + + + test("SPARK-12941: String datatypes to be mapped to Varchar in Oracle") { + // create a sample dataframe with string type + val df1 = sparkContext.parallelize(Seq(("foo"))).toDF("x") + // write the dataframe to the oracle table tbl + df1.write.jdbc(jdbcUrl, "tbl2", new Properties) + // read the table from the oracle + val dfRead = sqlContext.read.jdbc(jdbcUrl, "tbl2", new Properties) + // get the rows + val rows = dfRead.collect() + // verify the data type is inserted + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types(0).equals("class java.lang.String")) + // verify the value is the inserted correct or not + assert(rows(0).getString(0).equals("foo")) + } + + test("SPARK-16625: General data types to be mapped to Oracle") { + val props = new Properties() + props.put("oracle.jdbc.mapDateToTimestamp", "false") + + val schema = StructType(Seq( + StructField("boolean_type", BooleanType, true), + StructField("integer_type", IntegerType, true), + StructField("long_type", LongType, true), + StructField("float_Type", FloatType, true), + StructField("double_type", DoubleType, true), + StructField("byte_type", ByteType, true), + StructField("short_type", ShortType, true), + StructField("string_type", StringType, true), + StructField("binary_type", BinaryType, true), + StructField("date_type", DateType, true), + StructField("timestamp_type", TimestampType, true) + )) + + val tableName = "test_oracle_general_types" + val booleanVal = true + val integerVal = 1 + val longVal = 2L + val floatVal = 3.0f + val doubleVal = 4.0 + val byteVal = 2.toByte + val shortVal = 5.toShort + val stringVal = "string" + val binaryVal = Array[Byte](6, 7, 8) + val dateVal = Date.valueOf("2016-07-26") + val timestampVal = Timestamp.valueOf("2016-07-26 11:49:45") + + val data = spark.sparkContext.parallelize(Seq( + Row( + booleanVal, integerVal, longVal, floatVal, doubleVal, byteVal, shortVal, stringVal, + binaryVal, dateVal, timestampVal + ))) + + val dfWrite = spark.createDataFrame(data, schema) + dfWrite.write.jdbc(jdbcUrl, tableName, props) + + val dfRead = spark.read.jdbc(jdbcUrl, tableName, props) + val rows = dfRead.collect() + // verify the data type is inserted + val types = dfRead.schema.map(field => field.dataType) + assert(types(0).equals(DecimalType(1, 0))) + assert(types(1).equals(DecimalType(10, 0))) + assert(types(2).equals(DecimalType(19, 0))) + assert(types(3).equals(DecimalType(19, 4))) + assert(types(4).equals(DecimalType(19, 4))) + assert(types(5).equals(DecimalType(3, 0))) + assert(types(6).equals(DecimalType(5, 0))) + assert(types(7).equals(StringType)) + assert(types(8).equals(BinaryType)) + assert(types(9).equals(DateType)) + assert(types(10).equals(TimestampType)) + + // verify the value is the inserted correct or not + val values = rows(0) + assert(values.getDecimal(0).compareTo(BigDecimal.valueOf(1)) == 0) + assert(values.getDecimal(1).compareTo(BigDecimal.valueOf(integerVal)) == 0) + assert(values.getDecimal(2).compareTo(BigDecimal.valueOf(longVal)) == 0) + assert(values.getDecimal(3).compareTo(BigDecimal.valueOf(floatVal)) == 0) + assert(values.getDecimal(4).compareTo(BigDecimal.valueOf(doubleVal)) == 
0) + assert(values.getDecimal(5).compareTo(BigDecimal.valueOf(byteVal)) == 0) + assert(values.getDecimal(6).compareTo(BigDecimal.valueOf(shortVal)) == 0) + assert(values.getString(7).equals(stringVal)) + assert(values.getAs[Array[Byte]](8).mkString.equals("678")) + assert(values.getDate(9).equals(dateVal)) + assert(values.getTimestamp(10).equals(timestampVal)) + } + + test("SPARK-19318: connection property keys should be case-sensitive") { + def checkRow(row: Row): Unit = { + assert(row.getDecimal(0).equals(BigDecimal.valueOf(1))) + assert(row.getDate(1).equals(Date.valueOf("1991-11-09"))) + assert(row.getTimestamp(2).equals(Timestamp.valueOf("1996-01-01 01:23:45"))) + } + checkRow(sql("SELECT * FROM datetime where id = 1").head()) + sql("INSERT INTO TABLE datetime1 SELECT * FROM datetime where id = 1") + checkRow(sql("SELECT * FROM datetime1 where id = 1").head()) + } + + test("SPARK-20557: column type TIMESTAMP with TIME ZONE should be recognized") { + val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) + val rows = dfRead.collect() + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types(1).equals("class java.sql.Timestamp")) + } + + test("Column type TIMESTAMP with SESSION_LOCAL_TIMEZONE is different from default") { + val defaultJVMTimeZone = TimeZone.getDefault + // Pick the timezone different from the current default time zone of JVM + val sofiaTimeZone = TimeZone.getTimeZone("Europe/Sofia") + val shanghaiTimeZone = TimeZone.getTimeZone("Asia/Shanghai") + val localSessionTimeZone = + if (defaultJVMTimeZone == shanghaiTimeZone) sofiaTimeZone else shanghaiTimeZone + + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> localSessionTimeZone.getID) { + val e = intercept[java.sql.SQLException] { + val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) + dfRead.collect() + }.getMessage + assert(e.contains("Unrecognized SQL type -101")) + } + } + + test("Column TIMESTAMP with TIME ZONE(JVM timezone)") { + def checkRow(row: Row, ts: String): Unit = { + assert(row.getTimestamp(1).equals(Timestamp.valueOf(ts))) + } + + withSQLConf(SQLConf.SESSION_LOCAL_TIMEZONE.key -> TimeZone.getDefault.getID) { + val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) + withDefaultTimeZone(PST) { + assert(dfRead.collect().toSet === + Set( + Row(BigDecimal.valueOf(1), java.sql.Timestamp.valueOf("1999-12-01 03:00:00")), + Row(BigDecimal.valueOf(2), java.sql.Timestamp.valueOf("1999-12-01 12:00:00")))) + } + + withDefaultTimeZone(UTC) { + assert(dfRead.collect().toSet === + Set( + Row(BigDecimal.valueOf(1), java.sql.Timestamp.valueOf("1999-12-01 11:00:00")), + Row(BigDecimal.valueOf(2), java.sql.Timestamp.valueOf("1999-12-01 20:00:00")))) + } + } + } + + test("SPARK-18004: Make sure date or timestamp related predicate is pushed down correctly") { + val props = new Properties() + props.put("oracle.jdbc.mapDateToTimestamp", "false") + + val schema = StructType(Seq( + StructField("date_type", DateType, true), + StructField("timestamp_type", TimestampType, true) + )) + + val tableName = "test_date_timestamp_pushdown" + val dateVal = Date.valueOf("2017-06-22") + val timestampVal = Timestamp.valueOf("2017-06-22 21:30:07") + + val data = spark.sparkContext.parallelize(Seq( + Row(dateVal, timestampVal) + )) + + val dfWrite = spark.createDataFrame(data, schema) + dfWrite.write.jdbc(jdbcUrl, tableName, props) + + val dfRead = spark.read.jdbc(jdbcUrl, tableName, props) + + val millis = System.currentTimeMillis() + val dt = new 
java.sql.Date(millis) + val ts = new java.sql.Timestamp(millis) + + // Query Oracle table with date and timestamp predicates + // which should be pushed down to Oracle. + val df = dfRead.filter(dfRead.col("date_type").lt(dt)) + .filter(dfRead.col("timestamp_type").lt(ts)) + + val parentPlan = df.queryExecution.executedPlan + assert(parentPlan.isInstanceOf[WholeStageCodegenExec]) + val node = parentPlan.asInstanceOf[WholeStageCodegenExec] + val metadata = node.child.asInstanceOf[RowDataSourceScanExec].metadata + // The "PushedFilters" part should exist in Dataframe's + // physical plan and the existence of right literals in + // "PushedFilters" is used to prove that the predicates + // pushing down have been effective. + assert(metadata.get("PushedFilters").isDefined) + assert(metadata("PushedFilters").contains(dt.toString)) + assert(metadata("PushedFilters").contains(ts.toString)) + + val row = df.collect()(0) + assert(row.getDate(0).equals(dateVal)) + assert(row.getTimestamp(1).equals(timestampVal)) + } + + test("SPARK-20427/SPARK-20921: read table use custom schema by jdbc api") { + // default will throw IllegalArgumentException + val e = intercept[org.apache.spark.SparkException] { + spark.read.jdbc(jdbcUrl, "tableWithCustomSchema", new Properties()).collect() + } + assert(e.getCause().isInstanceOf[ArithmeticException]) + assert(e.getMessage.contains("Decimal precision 39 exceeds max precision 38")) + + // custom schema can read data + val props = new Properties() + props.put("customSchema", + s"ID DECIMAL(${DecimalType.MAX_PRECISION}, 0), N1 INT, N2 BOOLEAN") + val dfRead = spark.read.jdbc(jdbcUrl, "tableWithCustomSchema", props) + + val rows = dfRead.collect() + // verify the data type + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types(0).equals("class java.math.BigDecimal")) + assert(types(1).equals("class java.lang.Integer")) + assert(types(2).equals("class java.lang.Boolean")) + + // verify the value + val values = rows(0) + assert(values.getDecimal(0).equals(new java.math.BigDecimal("12312321321321312312312312123"))) + assert(values.getInt(1).equals(1)) + assert(values.getBoolean(2) == false) + } + + test("SPARK-22303: handle BINARY_DOUBLE and BINARY_FLOAT as DoubleType and FloatType") { + val tableName = "oracle_types" + val schema = StructType(Seq( + StructField("d", DoubleType, true), + StructField("f", FloatType, true))) + val props = new Properties() + + // write it back to the table (append mode) + val data = spark.sparkContext.parallelize(Seq(Row(1.1, 2.2f))) + val dfWrite = spark.createDataFrame(data, schema) + dfWrite.write.mode(SaveMode.Append).jdbc(jdbcUrl, tableName, props) + + // read records from oracle_types + val dfRead = sqlContext.read.jdbc(jdbcUrl, tableName, new Properties) + val rows = dfRead.collect() + assert(rows.size == 1) + + // check data types + val types = dfRead.schema.map(field => field.dataType) + assert(types(0).equals(DoubleType)) + assert(types(1).equals(FloatType)) + + // check values + val values = rows(0) + assert(values.getDouble(0) === 1.1) + assert(values.getFloat(1) === 2.2f) + } + + test("SPARK-22814 support date/timestamp types in partitionColumn") { + val expectedResult = Set( + (1, "2018-07-06", "2018-07-06 05:50:00"), + (2, "2018-07-06", "2018-07-06 08:10:08"), + (3, "2018-07-08", "2018-07-08 13:32:01"), + (4, "2018-07-12", "2018-07-12 09:51:15") + ).map { case (id, date, timestamp) => + Row(BigDecimal.valueOf(id), Date.valueOf(date), Timestamp.valueOf(timestamp)) + } + + // DateType partition column + val df1 = 
spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("dbtable", "datetimePartitionTest") + .option("partitionColumn", "d") + .option("lowerBound", "2018-07-06") + .option("upperBound", "2018-07-20") + .option("numPartitions", 3) + // oracle.jdbc.mapDateToTimestamp defaults to true. If this flag is not disabled, column d + // (Oracle DATE) will be resolved as Catalyst Timestamp, which will fail bound evaluation of + // the partition column. E.g. 2018-07-06 cannot be evaluated as Timestamp, and the error + // message says: Timestamp format must be yyyy-mm-dd hh:mm:ss[.fffffffff]. + .option("oracle.jdbc.mapDateToTimestamp", "false") + .option("sessionInitStatement", "ALTER SESSION SET NLS_DATE_FORMAT = 'YYYY-MM-DD'") + .load() + + df1.logicalPlan match { + case LogicalRelation(JDBCRelation(_, parts, _), _, _, _) => + val whereClauses = parts.map(_.asInstanceOf[JDBCPartition].whereClause).toSet + assert(whereClauses === Set( + """"D" < '2018-07-11' or "D" is null""", + """"D" >= '2018-07-11' AND "D" < '2018-07-15'""", + """"D" >= '2018-07-15'""")) + } + assert(df1.collect.toSet === expectedResult) + + // TimestampType partition column + val df2 = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("dbtable", "datetimePartitionTest") + .option("partitionColumn", "t") + .option("lowerBound", "2018-07-04 03:30:00.0") + .option("upperBound", "2018-07-27 14:11:05.0") + .option("numPartitions", 2) + .option("oracle.jdbc.mapDateToTimestamp", "false") + .option("sessionInitStatement", + "ALTER SESSION SET NLS_TIMESTAMP_FORMAT = 'YYYY-MM-DD HH24:MI:SS.FF'") + .load() + + df2.logicalPlan match { + case LogicalRelation(JDBCRelation(_, parts, _), _, _, _) => + val whereClauses = parts.map(_.asInstanceOf[JDBCPartition].whereClause).toSet + assert(whereClauses === Set( + """"T" < '2018-07-15 20:50:32.5' or "T" is null""", + """"T" >= '2018-07-15 20:50:32.5'""")) + } + assert(df2.collect.toSet === expectedResult) + } + + test("query JDBC option") { + val expectedResult = Set( + (1, "1991-11-09", "1996-01-01 01:23:45") + ).map { case (id, date, timestamp) => + Row(BigDecimal.valueOf(id), Date.valueOf(date), Timestamp.valueOf(timestamp)) + } + + val query = "SELECT id, d, t FROM datetime WHERE id = 1" + // query option to pass on the query string. + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("query", query) + .option("oracle.jdbc.mapDateToTimestamp", "false") + .load() + assert(df.collect.toSet === expectedResult) + + // query option in the create table path. 
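+ // The temporary view below reads through the same JDBC source with the same query, so it
+ // should return exactly the rows asserted above.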
+ sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$jdbcUrl', + | query '$query', + | oracle.jdbc.mapDateToTimestamp false) + """.stripMargin.replaceAll("\n", " ")) + assert(sql("select id, d, t from queryOption").collect.toSet == expectedResult) + } + + test("SPARK-32992: map Oracle's ROWID type to StringType") { + val rows = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("query", "SELECT ROWID from datetime") + .load() + .collect() + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types(0).equals("class java.lang.String")) + assert(!rows(0).getString(0).isEmpty) + } +} diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala new file mode 100644 index 0000000000000..d3229ba50eca3 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresIntegrationSuite.scala @@ -0,0 +1,382 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc + +import java.math.{BigDecimal => JBigDecimal} +import java.sql.{Connection, Date, Timestamp} +import java.text.SimpleDateFormat +import java.util.Properties + +import org.apache.spark.sql.Column +import org.apache.spark.sql.Row +import org.apache.spark.sql.catalyst.expressions.Literal +import org.apache.spark.sql.types.{ArrayType, DecimalType, FloatType, ShortType} +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., postgres:15.1): + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:15.1 + * ./build/sbt -Pdocker-integration-tests + * "testOnly org.apache.spark.sql.jdbc.PostgresIntegrationSuite" + * }}} + */ +@DockerTest +class PostgresIntegrationSuite extends DockerJDBCIntegrationSuite { + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:15.1-alpine") + override val env = Map( + "POSTGRES_PASSWORD" -> "rootpass" + ) + override val usesIpc = false + override val jdbcPort = 5432 + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" + } + + override def dataPreparation(conn: Connection): Unit = { + conn.prepareStatement("CREATE DATABASE foo").executeUpdate() + conn.setCatalog("foo") + conn.prepareStatement("CREATE TYPE enum_type AS ENUM ('d1', 'd2')").executeUpdate() + conn.prepareStatement("CREATE TABLE bar (c0 text, c1 integer, c2 double precision, c3 bigint, " + + "c4 bit(1), c5 bit(10), c6 bytea, c7 boolean, c8 inet, c9 cidr, " + + "c10 integer[], c11 text[], c12 real[], c13 numeric(2,2)[], c14 enum_type, " + + "c15 float4, c16 smallint, c17 numeric[], c18 bit varying(6), c19 point, c20 line, " + + "c21 lseg, c22 box, c23 path, c24 polygon, c25 circle, c26 pg_lsn, " + + "c27 character(2), c28 character varying(3), c29 date, c30 interval, " + + "c31 macaddr, c32 macaddr8, c33 numeric(6,4), c34 pg_snapshot, " + + "c35 real, c36 time, c37 timestamp, c38 tsquery, c39 tsvector, c40 txid_snapshot, " + + "c41 xml)").executeUpdate() + conn.prepareStatement("INSERT INTO bar VALUES ('hello', 42, 1.25, 123456789012345, B'0', " + + "B'1000100101', E'\\\\xDEADBEEF', true, '172.16.0.42', '192.168.0.0/16', " + + """'{1, 2}', '{"a", null, "b"}', '{0.11, 0.22}', '{0.11, 0.22}', 'd1', 1.01, 1, """ + + "'{111.2222, 333.4444}', B'101010', '(800, 600)', '(23.8, 56.2), (16.23, 89.2)', " + + "'[(80.12, 131.24), (201.5, 503.33)]', '(19.84, 11.23), (20.21, 2.1)', " + + "'(10.2, 30.4), (50.6, 70.8), (90.1, 11.3)', " + + "'((100.3, 40.2), (20.198, 83.1), (500.821, 311.38))', '<500, 200, 100>', '16/B374D848', " + + "'ab', 'efg', '2021-02-02', '1 minute', '00:11:22:33:44:55', " + + "'00:11:22:33:44:55:66:77', 12.3456, '10:20:10,14,15', 1E+37, " + + "'17:22:31', '2016-08-12 10:22:31.949271', 'cat:AB & dog:CD', " + + "'dog and cat and fox', '10:20:10,14,15', 'id10')" + ).executeUpdate() + conn.prepareStatement("INSERT INTO bar VALUES (null, null, null, null, null, " + + "null, null, null, null, null, null, null, null, null, null, null, null, " + + "null, null, null, null, null, null, null, null, null, null, null, null, " + + "null, null, null, null, null, null, null, null, null, null, null, null, null)" + ).executeUpdate() + + conn.prepareStatement("CREATE TABLE ts_with_timezone " + + "(id integer, tstz TIMESTAMP WITH TIME ZONE, ttz TIME WITH TIME ZONE)") + .executeUpdate() + conn.prepareStatement("INSERT INTO ts_with_timezone VALUES " + + 
"(1, TIMESTAMP WITH TIME ZONE '2016-08-12 10:22:31.949271-07', " + + "TIME WITH TIME ZONE '17:22:31.949271+00')") + .executeUpdate() + + conn.prepareStatement("CREATE TABLE st_with_array (c0 uuid, c1 inet, c2 cidr," + + "c3 json, c4 jsonb, c5 uuid[], c6 inet[], c7 cidr[], c8 json[], c9 jsonb[], c10 xml[], " + + "c11 tsvector[], c12 tsquery[], c13 macaddr[], c14 txid_snapshot[], c15 point[], " + + "c16 line[], c17 lseg[], c18 box[], c19 path[], c20 polygon[], c21 circle[], c22 pg_lsn[], " + + "c23 bit varying(6)[], c24 interval[], c25 macaddr8[], c26 pg_snapshot[])") + .executeUpdate() + conn.prepareStatement("INSERT INTO st_with_array VALUES ( " + + "'0a532531-cdf1-45e3-963d-5de90b6a30f1', '172.168.22.1', '192.168.100.128/25', " + + """'{"a": "foo", "b": "bar"}', '{"a": 1, "b": 2}', """ + + "ARRAY['7be8aaf8-650e-4dbb-8186-0a749840ecf2'," + + "'205f9bfc-018c-4452-a605-609c0cfad228']::uuid[], ARRAY['172.16.0.41', " + + "'172.16.0.42']::inet[], ARRAY['192.168.0.0/24', '10.1.0.0/16']::cidr[], " + + """ARRAY['{"a": "foo", "b": "bar"}', '{"a": 1, "b": 2}']::json[], """ + + """ARRAY['{"a": 1, "b": 2, "c": 3}']::jsonb[], """ + + """ARRAY['id10']::xml[], ARRAY['The dog laying on the grass', """ + + """'the:1 cat:2 is:3 on:4 the:5 table:6']::tsvector[], """ + + """ARRAY['programming & language & ! interpreter', 'cat:AB & dog:CD']::tsquery[], """ + + """ARRAY['12:34:56:78:90:ab', 'cd-ef-12-34-56-78']::macaddr[], """ + + """ARRAY['10:20:10,14,15']::txid_snapshot[], """ + + """ARRAY['(800, 600)', '83.24, 5.10']::point[], """ + + """ARRAY['(23.8, 56.2), (16.23, 89.2)', '{23.85, 10.87, 5.92}']::line[], """ + + """ARRAY['[(80.12, 131.24), (201.5, 503.33)]']::lseg[], """ + + """ARRAY['(19.84, 11.23), (20.21, 2.1)']::box[], """ + + """ARRAY['(10.2, 30.4), (50.6, 70.8), (90.1, 11.3)']::path[], """ + + """ARRAY['((100.3, 40.2), (20.198, 83.1), (500.821, 311.38))']::polygon[], """ + + """ARRAY['<500, 200, 100>']::circle[], """ + + """ARRAY['16/B374D848']::pg_lsn[], """ + + """ARRAY[B'101010']::bit varying(6)[], """ + + """ARRAY['1 day', '2 minutes']::interval[], """ + + """ARRAY['08:00:2b:01:02:03:04:05']::macaddr8[], """ + + """ARRAY['10:20:10,14,15']::pg_snapshot[])""" + ).executeUpdate() + + conn.prepareStatement("CREATE TABLE char_types (" + + "c0 char(4), c1 character(4), c2 character varying(4), c3 varchar(4), c4 bpchar)" + ).executeUpdate() + conn.prepareStatement("INSERT INTO char_types VALUES " + + "('abcd', 'efgh', 'ijkl', 'mnop', 'q')").executeUpdate() + + conn.prepareStatement("CREATE TABLE char_array_types (" + + "c0 char(4)[], c1 character(4)[], c2 character varying(4)[], c3 varchar(4)[], c4 bpchar[])" + ).executeUpdate() + conn.prepareStatement("INSERT INTO char_array_types VALUES " + + """('{"a", "bcd"}', '{"ef", "gh"}', '{"i", "j", "kl"}', '{"mnop"}', '{"q", "r"}')""" + ).executeUpdate() + + conn.prepareStatement("CREATE TABLE money_types (" + + "c0 money)").executeUpdate() + conn.prepareStatement("INSERT INTO money_types VALUES " + + "('$1,000.00')").executeUpdate() + } + + test("Type mapping for various types") { + val df = sqlContext.read.jdbc(jdbcUrl, "bar", new Properties) + val rows = df.collect().sortBy(_.toString()) + assert(rows.length == 2) + // Test the types, and values using the first row. 
+ val types = rows(0).toSeq.map(x => x.getClass) + assert(types.length == 42) + assert(classOf[String].isAssignableFrom(types(0))) + assert(classOf[java.lang.Integer].isAssignableFrom(types(1))) + assert(classOf[java.lang.Double].isAssignableFrom(types(2))) + assert(classOf[java.lang.Long].isAssignableFrom(types(3))) + assert(classOf[java.lang.Boolean].isAssignableFrom(types(4))) + assert(classOf[Array[Byte]].isAssignableFrom(types(5))) + assert(classOf[Array[Byte]].isAssignableFrom(types(6))) + assert(classOf[java.lang.Boolean].isAssignableFrom(types(7))) + assert(classOf[String].isAssignableFrom(types(8))) + assert(classOf[String].isAssignableFrom(types(9))) + assert(classOf[scala.collection.Seq[Int]].isAssignableFrom(types(10))) + assert(classOf[scala.collection.Seq[String]].isAssignableFrom(types(11))) + assert(classOf[scala.collection.Seq[Double]].isAssignableFrom(types(12))) + assert(classOf[scala.collection.Seq[BigDecimal]].isAssignableFrom(types(13))) + assert(classOf[String].isAssignableFrom(types(14))) + assert(classOf[java.lang.Float].isAssignableFrom(types(15))) + assert(classOf[java.lang.Short].isAssignableFrom(types(16))) + assert(classOf[scala.collection.Seq[BigDecimal]].isAssignableFrom(types(17))) + assert(classOf[String].isAssignableFrom(types(18))) + assert(classOf[String].isAssignableFrom(types(19))) + assert(classOf[String].isAssignableFrom(types(20))) + assert(classOf[String].isAssignableFrom(types(21))) + assert(classOf[String].isAssignableFrom(types(22))) + assert(classOf[String].isAssignableFrom(types(23))) + assert(classOf[String].isAssignableFrom(types(24))) + assert(classOf[String].isAssignableFrom(types(25))) + assert(classOf[String].isAssignableFrom(types(26))) + assert(classOf[String].isAssignableFrom(types(27))) + assert(classOf[String].isAssignableFrom(types(28))) + assert(classOf[Date].isAssignableFrom(types(29))) + assert(classOf[String].isAssignableFrom(types(30))) + assert(classOf[String].isAssignableFrom(types(31))) + assert(classOf[String].isAssignableFrom(types(32))) + assert(classOf[JBigDecimal].isAssignableFrom(types(33))) + assert(classOf[String].isAssignableFrom(types(34))) + assert(classOf[java.lang.Float].isAssignableFrom(types(35))) + assert(classOf[java.sql.Timestamp].isAssignableFrom(types(36))) + assert(classOf[java.sql.Timestamp].isAssignableFrom(types(37))) + assert(classOf[String].isAssignableFrom(types(38))) + assert(classOf[String].isAssignableFrom(types(39))) + assert(classOf[String].isAssignableFrom(types(40))) + assert(classOf[String].isAssignableFrom(types(41))) + assert(rows(0).getString(0).equals("hello")) + assert(rows(0).getInt(1) == 42) + assert(rows(0).getDouble(2) == 1.25) + assert(rows(0).getLong(3) == 123456789012345L) + assert(!rows(0).getBoolean(4)) + // BIT(10)'s come back as ASCII strings of ten ASCII 0's and 1's... 
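+    // i.e. bytes 49 ('1') and 48 ('0') below spell out the inserted bit string B'1000100101'.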
+ assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](5), + Array[Byte](49, 48, 48, 48, 49, 48, 48, 49, 48, 49))) + assert(java.util.Arrays.equals(rows(0).getAs[Array[Byte]](6), + Array[Byte](0xDE.toByte, 0xAD.toByte, 0xBE.toByte, 0xEF.toByte))) + assert(rows(0).getBoolean(7)) + assert(rows(0).getString(8) == "172.16.0.42") + assert(rows(0).getString(9) == "192.168.0.0/16") + assert(rows(0).getSeq(10) == Seq(1, 2)) + assert(rows(0).getSeq(11) == Seq("a", null, "b")) + assert(rows(0).getSeq(12).toSeq == Seq(0.11f, 0.22f)) + assert(rows(0).getSeq(13) == Seq("0.11", "0.22").map(BigDecimal(_).bigDecimal)) + assert(rows(0).getString(14) == "d1") + assert(rows(0).getFloat(15) == 1.01f) + assert(rows(0).getShort(16) == 1) + assert(rows(0).getSeq(17) == + Seq("111.222200000000000000", "333.444400000000000000").map(BigDecimal(_).bigDecimal)) + assert(rows(0).getString(18) == "101010") + assert(rows(0).getString(19) == "(800,600)") + assert(rows(0).getString(20) == "{-4.359313077939234,-1,159.9516512549538}") + assert(rows(0).getString(21) == "[(80.12,131.24),(201.5,503.33)]") + assert(rows(0).getString(22) == "(20.21,11.23),(19.84,2.1)") + assert(rows(0).getString(23) == "((10.2,30.4),(50.6,70.8),(90.1,11.3))") + assert(rows(0).getString(24) == "((100.3,40.2),(20.198,83.1),(500.821,311.38))") + assert(rows(0).getString(25) == "<(500,200),100>") + assert(rows(0).getString(26) == "16/B374D848") + assert(rows(0).getString(27) == "ab") + assert(rows(0).getString(28) == "efg") + assert(rows(0).getDate(29) == new SimpleDateFormat("yyyy-MM-dd").parse("2021-02-02")) + assert(rows(0).getString(30) == "00:01:00") + assert(rows(0).getString(31) == "00:11:22:33:44:55") + assert(rows(0).getString(32) == "00:11:22:33:44:55:66:77") + assert(rows(0).getDecimal(33) == new JBigDecimal("12.3456")) + assert(rows(0).getString(34) == "10:20:10,14,15") + assert(rows(0).getFloat(35) == 1E+37F) + assert(rows(0).getTimestamp(36) == Timestamp.valueOf("1970-01-01 17:22:31.0")) + assert(rows(0).getTimestamp(37) == Timestamp.valueOf("2016-08-12 10:22:31.949271")) + assert(rows(0).getString(38) == "'cat':AB & 'dog':CD") + assert(rows(0).getString(39) == "'and' 'cat' 'dog' 'fox'") + assert(rows(0).getString(40) == "10:20:10,14,15") + assert(rows(0).getString(41) == "id10") + + // Test reading null values using the second row. + assert(0.until(16).forall(rows(1).isNullAt(_))) + } + + test("Basic write test") { + val df = sqlContext.read.jdbc(jdbcUrl, "bar", new Properties) + // Test only that it doesn't crash. + df.write.jdbc(jdbcUrl, "public.barcopy", new Properties) + // Test that written numeric type has same DataType as input + assert(sqlContext.read.jdbc(jdbcUrl, "public.barcopy", new Properties).schema(13).dataType == + ArrayType(DecimalType(2, 2), true)) + // Test write null values. 
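+    // Each column is replaced with a typed NULL literal (same name and data type as the analyzed
+    // output), and the resulting all-null rows are written back through JDBC.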
+ df.select(df.queryExecution.analyzed.output.map { a => + Column(Literal.create(null, a.dataType)).as(a.name) + }: _*).write.jdbc(jdbcUrl, "public.barcopy2", new Properties) + } + + test("Creating a table with shorts and floats") { + sqlContext.createDataFrame(Seq((1.0f, 1.toShort))) + .write.jdbc(jdbcUrl, "shortfloat", new Properties) + val schema = sqlContext.read.jdbc(jdbcUrl, "shortfloat", new Properties).schema + assert(schema(0).dataType == FloatType) + assert(schema(1).dataType == ShortType) + } + + test("SPARK-20557: column type TIMESTAMP with TIME ZONE and TIME with TIME ZONE " + + "should be recognized") { + // When using JDBC to read the columns of TIMESTAMP with TIME ZONE and TIME with TIME ZONE + // the actual types are java.sql.Types.TIMESTAMP and java.sql.Types.TIME + val dfRead = sqlContext.read.jdbc(jdbcUrl, "ts_with_timezone", new Properties) + val rows = dfRead.collect() + val types = rows(0).toSeq.map(x => x.getClass.toString) + assert(types(1).equals("class java.sql.Timestamp")) + assert(types(2).equals("class java.sql.Timestamp")) + } + + test("SPARK-22291: Conversion error when transforming array types of " + + "uuid, inet and cidr to StingType in PostgreSQL") { + val df = sqlContext.read.jdbc(jdbcUrl, "st_with_array", new Properties) + val rows = df.collect() + assert(rows(0).getString(0) == "0a532531-cdf1-45e3-963d-5de90b6a30f1") + assert(rows(0).getString(1) == "172.168.22.1") + assert(rows(0).getString(2) == "192.168.100.128/25") + assert(rows(0).getString(3) == "{\"a\": \"foo\", \"b\": \"bar\"}") + assert(rows(0).getString(4) == "{\"a\": 1, \"b\": 2}") + assert(rows(0).getSeq(5) == Seq("7be8aaf8-650e-4dbb-8186-0a749840ecf2", + "205f9bfc-018c-4452-a605-609c0cfad228")) + assert(rows(0).getSeq(6) == Seq("172.16.0.41", "172.16.0.42")) + assert(rows(0).getSeq(7) == Seq("192.168.0.0/24", "10.1.0.0/16")) + assert(rows(0).getSeq(8) == Seq("""{"a": "foo", "b": "bar"}""", """{"a": 1, "b": 2}""")) + assert(rows(0).getSeq(9) == Seq("""{"a": 1, "b": 2, "c": 3}""")) + assert(rows(0).getSeq(10) == Seq("""id10""")) + assert(rows(0).getSeq(11) == Seq("'The' 'dog' 'grass' 'laying' 'on' 'the'", + "'cat':2 'is':3 'on':4 'table':6 'the':1,5")) + assert(rows(0).getSeq(12) == Seq("'programming' & 'language' & !'interpreter'", + "'cat':AB & 'dog':CD")) + assert(rows(0).getSeq(13) == Seq("12:34:56:78:90:ab", "cd:ef:12:34:56:78")) + assert(rows(0).getSeq(14) == Seq("10:20:10,14,15")) + assert(rows(0).getSeq(15) == Seq("(800.0,600.0)", "(83.24,5.1)")) + assert(rows(0).getSeq(16) == Seq("{-4.359313077939234,-1.0,159.9516512549538}", + "{23.85,10.87,5.92}")) + assert(rows(0).getSeq(17) == Seq("[(80.12,131.24),(201.5,503.33)]")) + assert(rows(0).getSeq(18) == Seq("(20.21,11.23),(19.84,2.1)")) + assert(rows(0).getSeq(19) == Seq("((10.2,30.4),(50.6,70.8),(90.1,11.3))")) + assert(rows(0).getSeq(20) == Seq("((100.3,40.2),(20.198,83.1),(500.821,311.38))")) + assert(rows(0).getSeq(21) == Seq("<(500.0,200.0),100.0>")) + assert(rows(0).getSeq(22) == Seq("16/B374D848")) + assert(rows(0).getSeq(23) == Seq("101010")) + assert(rows(0).getSeq(24) == Seq("0 years 0 mons 1 days 0 hours 0 mins 0.0 secs", + "0 years 0 mons 0 days 0 hours 2 mins 0.0 secs")) + assert(rows(0).getSeq(25) == Seq("08:00:2b:01:02:03:04:05")) + assert(rows(0).getSeq(26) == Seq("10:20:10,14,15")) + } + + test("query JDBC option") { + val expectedResult = Set( + (42, 123456789012345L) + ).map { case (c1, c3) => + Row(Integer.valueOf(c1), java.lang.Long.valueOf(c3)) + } + + val query = "SELECT c1, c3 FROM bar WHERE c1 IS NOT NULL" + 
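// The specified query is parenthesized and used as a subquery in the FROM clause, so it must be a self-contained SELECT statement.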
// query option to pass on the query string. + val df = spark.read.format("jdbc") + .option("url", jdbcUrl) + .option("query", query) + .load() + assert(df.collect.toSet === expectedResult) + + // query option in the create table path. + sql( + s""" + |CREATE OR REPLACE TEMPORARY VIEW queryOption + |USING org.apache.spark.sql.jdbc + |OPTIONS (url '$jdbcUrl', query '$query') + """.stripMargin.replaceAll("\n", " ")) + assert(sql("select c1, c3 from queryOption").collect.toSet == expectedResult) + } + + test("write byte as smallint") { + sqlContext.createDataFrame(Seq((1.toByte, 2.toShort))) + .write.jdbc(jdbcUrl, "byte_to_smallint_test", new Properties) + val df = sqlContext.read.jdbc(jdbcUrl, "byte_to_smallint_test", new Properties) + val schema = df.schema + assert(schema.head.dataType == ShortType) + assert(schema(1).dataType == ShortType) + val rows = df.collect() + assert(rows.length === 1) + assert(rows(0).getShort(0) === 1) + assert(rows(0).getShort(1) === 2) + } + + test("character type tests") { + val df = sqlContext.read.jdbc(jdbcUrl, "char_types", new Properties) + val row = df.collect() + assert(row.length == 1) + assert(row(0).length === 5) + assert(row(0).getString(0) === "abcd") + assert(row(0).getString(1) === "efgh") + assert(row(0).getString(2) === "ijkl") + assert(row(0).getString(3) === "mnop") + assert(row(0).getString(4) === "q") + } + + test("SPARK-32576: character array type tests") { + val df = sqlContext.read.jdbc(jdbcUrl, "char_array_types", new Properties) + val row = df.collect() + assert(row.length == 1) + assert(row(0).length === 5) + assert(row(0).getSeq[String](0) === Seq("a ", "bcd ")) + assert(row(0).getSeq[String](1) === Seq("ef ", "gh ")) + assert(row(0).getSeq[String](2) === Seq("i", "j", "kl")) + assert(row(0).getSeq[String](3) === Seq("mnop")) + assert(row(0).getSeq[String](4) === Seq("q", "r")) + } + + test("SPARK-34333: money type tests") { + val df = sqlContext.read.jdbc(jdbcUrl, "money_types", new Properties) + val row = df.collect() + assert(row.length === 1) + assert(row(0).length === 1) + assert(row(0).getString(0) === "$1,000.00") + } +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala similarity index 97% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala index c46a845a74395..4debe24754de3 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/PostgresKrbIntegrationSuite.scala @@ -25,9 +25,9 @@ import org.apache.spark.sql.execution.datasources.jdbc.connection.SecureConnecti import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:14.0): + * To run this test suite for a specific version (e.g., postgres:15.1): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:14.0 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:15.1 * ./build/sbt -Pdocker-integration-tests "testOnly *PostgresKrbIntegrationSuite" * }}} */ @@ -37,7 +37,7 @@ class PostgresKrbIntegrationSuite extends DockerKrbJDBCIntegrationSuite { override protected val 
keytabFileName = "postgres.keytab" override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:14.0") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:15.1") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala new file mode 100644 index 0000000000000..1a25cd2802dd7 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2IntegrationSuite.scala @@ -0,0 +1,113 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.jdbc.v2 + +import java.sql.Connection +import java.util.Locale + +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.SparkConf +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog +import org.apache.spark.sql.jdbc.DatabaseOnDocker +import org.apache.spark.sql.types._ +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., ibmcom/db2:11.5.6.0a): + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 DB2_DOCKER_IMAGE_NAME=ibmcom/db2:11.5.6.0a + * ./build/sbt -Pdocker-integration-tests "testOnly *v2.DB2IntegrationSuite" + * }}} + */ +@DockerTest +class DB2IntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { + override val catalogName: String = "db2" + override val namespaceOpt: Option[String] = Some("DB2INST1") + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("DB2_DOCKER_IMAGE_NAME", "ibmcom/db2:11.5.6.0a") + override val env = Map( + "DB2INST1_PASSWORD" -> "rootpass", + "LICENSE" -> "accept", + "DBNAME" -> "foo", + "ARCHIVE_LOGS" -> "false", + "AUTOCONFIG" -> "false" + ) + override val usesIpc = false + override val jdbcPort: Int = 50000 + override val privileged = true + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:db2://$ip:$port/foo:user=db2inst1;password=rootpass;retrieveMessagesFromServerOnGetMessage=true;" //scalastyle:ignore + } + + override val connectionTimeout = timeout(3.minutes) + + override def sparkConf: SparkConf = super.sparkConf + .set("spark.sql.catalog.db2", classOf[JDBCTableCatalog].getName) + .set("spark.sql.catalog.db2.url", db.getJdbcUrl(dockerIp, externalPort)) + .set("spark.sql.catalog.db2.pushDownAggregate", "true") + + override def tablePreparation(connection: Connection): Unit = { + connection.prepareStatement( + "CREATE TABLE employee (dept INTEGER, name VARCHAR(10), salary DECIMAL(20, 2), bonus DOUBLE)") + 
.executeUpdate() + } + + override def testUpdateColumnType(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INTEGER)") + var t = spark.table(tbl) + var expectedSchema = new StructType().add("ID", IntegerType, true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE DOUBLE") + t = spark.table(tbl) + expectedSchema = new StructType().add("ID", DoubleType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Update column type from DOUBLE to STRING + val msg1 = intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE VARCHAR(10)") + }.getMessage + assert(msg1.contains( + s"Cannot update $catalogName.alt_table field ID: double cannot be cast to varchar")) + } + + override def testCreateTableWithProperty(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INT)" + + s" TBLPROPERTIES('CCSID'='UNICODE')") + val t = spark.table(tbl) + val expectedSchema = new StructType().add("ID", IntegerType, true, defaultMetadata) + assert(t.schema === expectedSchema) + } + + override def caseConvert(tableName: String): String = tableName.toUpperCase(Locale.ROOT) + + testVarPop() + testVarPop(true) + testVarSamp() + testVarSamp(true) + testStddevPop() + testStddevPop(true) + testStddevSamp() + testStddevSamp(true) + testCovarPop() + testCovarSamp() + testRegrIntercept() + testRegrSlope() + testRegrR2() + testRegrSXY() +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2NamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2NamespaceSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2NamespaceSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DB2NamespaceSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/DockerJDBCIntegrationV2Suite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerIntegrationSuite.scala diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MsSqlServerNamespaceSuite.scala diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala 
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala new file mode 100644 index 0000000000000..072fdbb3f3424 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLIntegrationSuite.scala @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.jdbc.v2 + +import java.sql.{Connection, SQLFeatureNotSupportedException} + +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.SparkConf +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog +import org.apache.spark.sql.jdbc.DatabaseOnDocker +import org.apache.spark.sql.types._ +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., mysql:8.0.31): + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 MYSQL_DOCKER_IMAGE_NAME=mysql:8.0.31 + * ./build/sbt -Pdocker-integration-tests "testOnly *v2*MySQLIntegrationSuite" + * }}} + */ +@DockerTest +class MySQLIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { + override val catalogName: String = "mysql" + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:8.0.31") + override val env = Map( + "MYSQL_ROOT_PASSWORD" -> "rootpass" + ) + override val usesIpc = false + override val jdbcPort: Int = 3306 + + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:mysql://$ip:$port/" + + s"mysql?user=root&password=rootpass&allowPublicKeyRetrieval=true&useSSL=false" + } + + override def sparkConf: SparkConf = super.sparkConf + .set("spark.sql.catalog.mysql", classOf[JDBCTableCatalog].getName) + .set("spark.sql.catalog.mysql.url", db.getJdbcUrl(dockerIp, externalPort)) + .set("spark.sql.catalog.mysql.pushDownAggregate", "true") + + override val connectionTimeout = timeout(7.minutes) + + private var mySQLVersion = -1 + + override def tablePreparation(connection: Connection): Unit = { + mySQLVersion = connection.getMetaData.getDatabaseMajorVersion + connection.prepareStatement( + "CREATE TABLE employee (dept INT, name VARCHAR(32), salary DECIMAL(20, 2)," + + " bonus DOUBLE)").executeUpdate() + } + + override def testUpdateColumnType(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INTEGER)") + var t = spark.table(tbl) + var expectedSchema = new StructType().add("ID", IntegerType, true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE STRING") + t = spark.table(tbl) + expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Update column type from STRING to INTEGER + val msg1 = 
intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE INTEGER") + }.getMessage + assert(msg1.contains( + s"Cannot update $catalogName.alt_table field ID: string cannot be cast to int")) + } + + override def testRenameColumn(tbl: String): Unit = { + assert(mySQLVersion > 0) + if (mySQLVersion < 8) { + // Rename is unsupported for mysql versions < 8.0. + val exception = intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl RENAME COLUMN ID TO RENAMED") + } + assert(exception.getCause != null, s"Wrong exception thrown: $exception") + val msg = exception.getCause.asInstanceOf[SQLFeatureNotSupportedException].getMessage + assert(msg.contains("Rename column is only supported for MySQL version 8.0 and above.")) + } else { + super.testRenameColumn(tbl) + } + } + + override def testUpdateColumnNullability(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID STRING NOT NULL)") + // Update nullability is unsupported for mysql db. + val msg = intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl ALTER COLUMN ID DROP NOT NULL") + }.getCause.asInstanceOf[SQLFeatureNotSupportedException].getMessage + + assert(msg.contains("UpdateColumnNullability is not supported")) + } + + override def testCreateTableWithProperty(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INT)" + + s" TBLPROPERTIES('ENGINE'='InnoDB', 'DEFAULT CHARACTER SET'='utf8')") + val t = spark.table(tbl) + val expectedSchema = new StructType().add("ID", IntegerType, true, defaultMetadata) + assert(t.schema === expectedSchema) + } + + override def supportsIndex: Boolean = true + + override def supportListIndexes: Boolean = true + + override def indexOptions: String = "KEY_BLOCK_SIZE=10" + + testVarPop() + testVarSamp() + testStddevPop() + testStddevSamp() +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala similarity index 97% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala index d8dee61d70ea6..b73e2b8fd23ca 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/MySQLNamespaceSuite.scala @@ -28,16 +28,16 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., mysql:5.7.36): + * To run this test suite for a specific version (e.g., mysql:8.0.31): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 MYSQL_DOCKER_IMAGE_NAME=mysql:5.7.36 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 MYSQL_DOCKER_IMAGE_NAME=mysql:8.0.31 * ./build/sbt -Pdocker-integration-tests "testOnly *v2*MySQLNamespaceSuite" * }}} */ @DockerTest class MySQLNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:5.7.36") + override val imageName = sys.env.getOrElse("MYSQL_DOCKER_IMAGE_NAME", "mysql:8.0.31") override val env = Map( "MYSQL_ROOT_PASSWORD" -> "rootpass" ) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala 
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala new file mode 100644 index 0000000000000..5de7608918852 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleIntegrationSuite.scala @@ -0,0 +1,118 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.jdbc.v2 + +import java.sql.Connection +import java.util.Locale + +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.SparkConf +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog +import org.apache.spark.sql.jdbc.DatabaseOnDocker +import org.apache.spark.sql.types._ +import org.apache.spark.tags.DockerTest + +/** + * The following are the steps to test this: + * + * 1. Choose to use a prebuilt image or build an Oracle database in a container + * - The documentation on how to build the Oracle RDBMS in a container is at + * https://github.com/oracle/docker-images/blob/master/OracleDatabase/SingleInstance/README.md + * - Official Oracle container images can be found at https://container-registry.oracle.com + * - A trusted and streamlined Oracle XE database image can be found on Docker Hub at + * https://hub.docker.com/r/gvenzl/oracle-xe; see also https://github.com/gvenzl/oci-oracle-xe + * 2. Run: export ORACLE_DOCKER_IMAGE_NAME=image_you_want_to_use_for_testing + * - Example: export ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-xe:latest + * 3. Run: export ENABLE_DOCKER_INTEGRATION_TESTS=1 + * 4. Start docker: sudo service docker start + * - Optionally, docker pull $ORACLE_DOCKER_IMAGE_NAME + * 5. Run Spark integration tests for Oracle with: ./build/sbt -Pdocker-integration-tests + * "testOnly org.apache.spark.sql.jdbc.v2.OracleIntegrationSuite" + * + * A sequence of commands to build the Oracle XE database container image: + * $ git clone https://github.com/oracle/docker-images.git + * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles + * $ ./buildContainerImage.sh -v 21.3.0 -x + * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:21.3.0-xe + * + * This procedure has been validated with Oracle 18.4.0 and 21.3.0 Express Edition.
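+ * For example, combining steps 2 to 5 above with this suite's default image: + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 ORACLE_DOCKER_IMAGE_NAME=gvenzl/oracle-xe:21.3.0 + * ./build/sbt -Pdocker-integration-tests "testOnly org.apache.spark.sql.jdbc.v2.OracleIntegrationSuite" + * }}}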
+ */ +@DockerTest +class OracleIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { + override val catalogName: String = "oracle" + override val namespaceOpt: Option[String] = Some("SYSTEM") + override val db = new DatabaseOnDocker { + lazy override val imageName = + sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-xe:21.3.0") + val oracle_password = "Th1s1sThe0racle#Pass" + override val env = Map( + "ORACLE_PWD" -> oracle_password, // oracle images uses this + "ORACLE_PASSWORD" -> oracle_password // gvenzl/oracle-xe uses this + ) + override val usesIpc = false + override val jdbcPort: Int = 1521 + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:oracle:thin:system/$oracle_password@//$ip:$port/xe" + } + + override def sparkConf: SparkConf = super.sparkConf + .set("spark.sql.catalog.oracle", classOf[JDBCTableCatalog].getName) + .set("spark.sql.catalog.oracle.url", db.getJdbcUrl(dockerIp, externalPort)) + .set("spark.sql.catalog.oracle.pushDownAggregate", "true") + + override val connectionTimeout = timeout(7.minutes) + + override def tablePreparation(connection: Connection): Unit = { + connection.prepareStatement( + "CREATE TABLE employee (dept NUMBER(32), name VARCHAR2(32), salary NUMBER(20, 2)," + + " bonus BINARY_DOUBLE)").executeUpdate() + } + + override def testUpdateColumnType(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INTEGER)") + var t = spark.table(tbl) + var expectedSchema = new StructType().add("ID", DecimalType(10, 0), true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE STRING") + t = spark.table(tbl) + expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Update column type from STRING to INTEGER + val msg1 = intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE INTEGER") + }.getMessage + assert(msg1.contains( + s"Cannot update $catalogName.alt_table field ID: string cannot be cast to int")) + } + + override def caseConvert(tableName: String): String = tableName.toUpperCase(Locale.ROOT) + + testVarPop() + testVarSamp() + testStddevPop() + testStddevSamp() + testCovarPop() + testCovarSamp() + testCorr() + testRegrIntercept() + testRegrSlope() + testRegrR2() + testRegrSXY() +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala similarity index 94% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala index 31f26d2990666..b3e9d19a10f38 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/OracleNamespaceSuite.scala @@ -45,16 +45,16 @@ import org.apache.spark.tags.DockerTest * A sequence of commands to build the Oracle XE database container image: * $ git clone https://github.com/oracle/docker-images.git * $ cd docker-images/OracleDatabase/SingleInstance/dockerfiles - * $ ./buildContainerImage.sh -v 18.4.0 -x - * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:18.4.0-xe + * $ ./buildContainerImage.sh -v 21.3.0 -x + * $ export ORACLE_DOCKER_IMAGE_NAME=oracle/database:21.3.0-xe * - * 
This procedure has been validated with Oracle 18.4.0 Express Edition. + * This procedure has been validated with Oracle 18.4.0 and 21.3.0 Express Edition. */ @DockerTest class OracleNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { override val db = new DatabaseOnDocker { lazy override val imageName = - sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-xe:18.4.0") + sys.env.getOrElse("ORACLE_DOCKER_IMAGE_NAME", "gvenzl/oracle-xe:21.3.0") val oracle_password = "Th1s1sThe0racle#Pass" override val env = Map( "ORACLE_PWD" -> oracle_password, // oracle images uses this diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala new file mode 100644 index 0000000000000..db3a80ffeaac5 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresIntegrationSuite.scala @@ -0,0 +1,115 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc.v2 + +import java.sql.Connection + +import org.apache.spark.SparkConf +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog +import org.apache.spark.sql.jdbc.DatabaseOnDocker +import org.apache.spark.sql.types._ +import org.apache.spark.tags.DockerTest + +/** + * To run this test suite for a specific version (e.g., postgres:15.1): + * {{{ + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:15.1 + * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresIntegrationSuite" + * }}} + */ +@DockerTest +class PostgresIntegrationSuite extends DockerJDBCIntegrationV2Suite with V2JDBCTest { + override val catalogName: String = "postgresql" + override val db = new DatabaseOnDocker { + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:15.1-alpine") + override val env = Map( + "POSTGRES_PASSWORD" -> "rootpass" + ) + override val usesIpc = false + override val jdbcPort = 5432 + override def getJdbcUrl(ip: String, port: Int): String = + s"jdbc:postgresql://$ip:$port/postgres?user=postgres&password=rootpass" + } + override def sparkConf: SparkConf = super.sparkConf + .set("spark.sql.catalog.postgresql", classOf[JDBCTableCatalog].getName) + .set("spark.sql.catalog.postgresql.url", db.getJdbcUrl(dockerIp, externalPort)) + .set("spark.sql.catalog.postgresql.pushDownTableSample", "true") + .set("spark.sql.catalog.postgresql.pushDownLimit", "true") + .set("spark.sql.catalog.postgresql.pushDownAggregate", "true") + + override def tablePreparation(connection: Connection): Unit = { + connection.prepareStatement( + "CREATE TABLE employee (dept INTEGER, name VARCHAR(32), salary NUMERIC(20, 2)," + + " bonus double precision)").executeUpdate() + } + + override def testUpdateColumnType(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INTEGER)") + var t = spark.table(tbl) + var expectedSchema = new StructType().add("ID", IntegerType, true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE STRING") + t = spark.table(tbl) + expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Update column type from STRING to INTEGER + val msg = intercept[AnalysisException] { + sql(s"ALTER TABLE $tbl ALTER COLUMN id TYPE INTEGER") + }.getMessage + assert(msg.contains( + s"Cannot update $catalogName.alt_table field ID: string cannot be cast to int")) + } + + override def testCreateTableWithProperty(tbl: String): Unit = { + sql(s"CREATE TABLE $tbl (ID INT)" + + s" TBLPROPERTIES('TABLESPACE'='pg_default')") + val t = spark.table(tbl) + val expectedSchema = new StructType().add("ID", IntegerType, true, defaultMetadata) + assert(t.schema === expectedSchema) + } + + override def supportsTableSample: Boolean = true + + override def supportsIndex: Boolean = true + + override def indexOptions: String = "FILLFACTOR=70" + + testVarPop() + testVarPop(true) + testVarSamp() + testVarSamp(true) + testStddevPop() + testStddevPop(true) + testStddevSamp() + testStddevSamp(true) + testCovarPop() + testCovarPop(true) + testCovarSamp() + testCovarSamp(true) + testCorr() + testCorr(true) + testRegrIntercept() + testRegrIntercept(true) + testRegrSlope() + testRegrSlope(true) + testRegrR2() + testRegrR2(true) + testRegrSXY() + testRegrSXY(true) +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala 
b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala similarity index 97% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala index 33190103d6a9a..8c525717758c3 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/PostgresNamespaceSuite.scala @@ -26,16 +26,16 @@ import org.apache.spark.sql.util.CaseInsensitiveStringMap import org.apache.spark.tags.DockerTest /** - * To run this test suite for a specific version (e.g., postgres:14.0): + * To run this test suite for a specific version (e.g., postgres:15.1): * {{{ - * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:14.0 + * ENABLE_DOCKER_INTEGRATION_TESTS=1 POSTGRES_DOCKER_IMAGE_NAME=postgres:15.1 * ./build/sbt -Pdocker-integration-tests "testOnly *v2.PostgresNamespaceSuite" * }}} */ @DockerTest class PostgresNamespaceSuite extends DockerJDBCIntegrationSuite with V2JDBCNamespaceTest { override val db = new DatabaseOnDocker { - override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:14.0-alpine") + override val imageName = sys.env.getOrElse("POSTGRES_DOCKER_IMAGE_NAME", "postgres:15.1-alpine") override val env = Map( "POSTGRES_PASSWORD" -> "rootpass" ) diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala similarity index 92% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala index bae0d7c361635..d3f17187a3754 100644 --- a/external/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCNamespaceTest.scala @@ -27,6 +27,7 @@ import org.apache.logging.log4j.Level import org.apache.spark.sql.AnalysisException import org.apache.spark.sql.catalyst.analysis.NonEmptyNamespaceException import org.apache.spark.sql.connector.catalog.{Identifier, NamespaceChange} +import org.apache.spark.sql.connector.expressions.Transform import org.apache.spark.sql.execution.datasources.v2.jdbc.JDBCTableCatalog import org.apache.spark.sql.jdbc.DockerIntegrationFunSuite import org.apache.spark.sql.test.SharedSparkSession @@ -62,7 +63,8 @@ private[v2] trait V2JDBCNamespaceTest extends SharedSparkSession with DockerInte Map.empty[String, String] } catalog.createNamespace(Array("foo"), commentMap.asJava) - assert(catalog.listNamespaces() === listNamespaces(Array("foo"))) + assert(catalog.listNamespaces().map(_.toSet).toSet === + listNamespaces(Array("foo")).map(_.toSet).toSet) assert(catalog.listNamespaces(Array("foo")) === Array()) assert(catalog.namespaceExists(Array("foo")) === true) @@ -87,10 +89,12 @@ private[v2] trait V2JDBCNamespaceTest extends SharedSparkSession with DockerInte } assert(catalog.namespaceExists(Array("foo")) === false) assert(catalog.listNamespaces() === builtinNamespaces) - val msg = intercept[AnalysisException] { + val e = 
intercept[AnalysisException] { catalog.listNamespaces(Array("foo")) - }.getMessage - assert(msg.contains("Namespace 'foo' not found")) + } + checkError(e, + errorClass = "SCHEMA_NOT_FOUND", + parameters = Map("schemaName" -> "`foo`")) } } @@ -115,7 +119,7 @@ private[v2] trait V2JDBCNamespaceTest extends SharedSparkSession with DockerInte // Drop non empty namespace without cascade catalog.createNamespace(Array("foo"), commentMap.asJava) assert(catalog.namespaceExists(Array("foo")) === true) - catalog.createTable(ident1, schema, Array.empty, emptyProps) + catalog.createTable(ident1, schema, Array.empty[Transform], emptyProps) if (supportsDropSchemaRestrict) { intercept[NonEmptyNamespaceException] { catalog.dropNamespace(Array("foo"), cascade = false) diff --git a/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala new file mode 100644 index 0000000000000..f16d9b507d5f2 --- /dev/null +++ b/connector/docker-integration-tests/src/test/scala/org/apache/spark/sql/jdbc/v2/V2JDBCTest.scala @@ -0,0 +1,615 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.jdbc.v2 + +import org.apache.logging.log4j.Level + +import org.apache.spark.sql.{AnalysisException, DataFrame} +import org.apache.spark.sql.catalyst.analysis.{IndexAlreadyExistsException, NoSuchIndexException, UnresolvedAttribute} +import org.apache.spark.sql.catalyst.plans.logical.{Aggregate, Filter, Sample} +import org.apache.spark.sql.catalyst.util.quoteIdentifier +import org.apache.spark.sql.connector.catalog.{Catalogs, Identifier, TableCatalog} +import org.apache.spark.sql.connector.catalog.index.SupportsIndex +import org.apache.spark.sql.connector.expressions.aggregate.GeneralAggregateFunc +import org.apache.spark.sql.execution.datasources.v2.{DataSourceV2ScanRelation, V1ScanWrapper} +import org.apache.spark.sql.jdbc.DockerIntegrationFunSuite +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.tags.DockerTest + +@DockerTest +private[v2] trait V2JDBCTest extends SharedSparkSession with DockerIntegrationFunSuite { + import testImplicits._ + + val catalogName: String + + val namespaceOpt: Option[String] = None + + private def catalogAndNamespace = + namespaceOpt.map(namespace => s"$catalogName.$namespace").getOrElse(catalogName) + + // dialect specific update column type test + def testUpdateColumnType(tbl: String): Unit + + def notSupportsTableComment: Boolean = false + + val defaultMetadata = new MetadataBuilder().putLong("scale", 0).build() + + def testUpdateColumnNullability(tbl: String): Unit = { + sql(s"CREATE TABLE $catalogName.alt_table (ID STRING NOT NULL)") + var t = spark.table(s"$catalogName.alt_table") + // nullable is true in the expectedSchema because Spark always sets nullable to true + // regardless of the JDBC metadata https://github.com/apache/spark/pull/18445 + var expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $catalogName.alt_table ALTER COLUMN ID DROP NOT NULL") + t = spark.table(s"$catalogName.alt_table") + expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Update nullability of not existing column + val msg = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.alt_table ALTER COLUMN bad_column DROP NOT NULL") + }.getMessage + assert(msg.contains("Missing field bad_column")) + } + + def testRenameColumn(tbl: String): Unit = { + sql(s"ALTER TABLE $tbl RENAME COLUMN ID TO RENAMED") + val t = spark.table(s"$tbl") + val expectedSchema = new StructType().add("RENAMED", StringType, true, defaultMetadata) + .add("ID1", StringType, true, defaultMetadata).add("ID2", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + } + + def testCreateTableWithProperty(tbl: String): Unit = {} + + test("SPARK-33034: ALTER TABLE ... 
add new columns") { + withTable(s"$catalogName.alt_table") { + sql(s"CREATE TABLE $catalogName.alt_table (ID STRING)") + var t = spark.table(s"$catalogName.alt_table") + var expectedSchema = new StructType().add("ID", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C1 STRING, C2 STRING)") + t = spark.table(s"$catalogName.alt_table") + expectedSchema = expectedSchema.add("C1", StringType, true, defaultMetadata) + .add("C2", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C3 STRING)") + t = spark.table(s"$catalogName.alt_table") + expectedSchema = expectedSchema.add("C3", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Add already existing column + val msg = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.alt_table ADD COLUMNS (C3 DOUBLE)") + }.getMessage + assert(msg.contains("Cannot add column, because C3 already exists")) + } + // Add a column to not existing table + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.not_existing_table ADD COLUMNS (C4 STRING)") + } + checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", + ExpectedContext(s"$catalogName.not_existing_table", 12, + 11 + s"$catalogName.not_existing_table".length)) + } + + test("SPARK-33034: ALTER TABLE ... drop column") { + withTable(s"$catalogName.alt_table") { + sql(s"CREATE TABLE $catalogName.alt_table (C1 INTEGER, C2 STRING, c3 INTEGER)") + sql(s"ALTER TABLE $catalogName.alt_table DROP COLUMN C1") + sql(s"ALTER TABLE $catalogName.alt_table DROP COLUMN c3") + val t = spark.table(s"$catalogName.alt_table") + val expectedSchema = new StructType().add("C2", StringType, true, defaultMetadata) + assert(t.schema === expectedSchema) + // Drop not existing column + val msg = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.alt_table DROP COLUMN bad_column") + }.getMessage + assert(msg.contains(s"Missing field bad_column in table $catalogName.alt_table")) + } + // Drop a column from a not existing table + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.not_existing_table DROP COLUMN C1") + } + checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", + ExpectedContext(s"$catalogName.not_existing_table", 12, + 11 + s"$catalogName.not_existing_table".length)) + } + + test("SPARK-33034: ALTER TABLE ... update column type") { + withTable(s"$catalogName.alt_table") { + testUpdateColumnType(s"$catalogName.alt_table") + // Update not existing column + val msg2 = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.alt_table ALTER COLUMN bad_column TYPE DOUBLE") + }.getMessage + assert(msg2.contains("Missing field bad_column")) + } + // Update column type in not existing table + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.not_existing_table ALTER COLUMN id TYPE DOUBLE") + } + checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", + ExpectedContext(s"$catalogName.not_existing_table", 12, + 11 + s"$catalogName.not_existing_table".length)) + } + + test("SPARK-33034: ALTER TABLE ... 
rename column") { + withTable(s"$catalogName.alt_table") { + sql(s"CREATE TABLE $catalogName.alt_table (ID STRING NOT NULL," + + s" ID1 STRING NOT NULL, ID2 STRING NOT NULL)") + testRenameColumn(s"$catalogName.alt_table") + // Rename to already existing column + val msg = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.alt_table RENAME COLUMN ID1 TO ID2") + }.getMessage + assert(msg.contains("Cannot rename column, because ID2 already exists")) + } + // Rename a column in a not existing table + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.not_existing_table RENAME COLUMN ID TO C") + } + checkErrorTableNotFound(e, + UnresolvedAttribute.parseAttributeName(s"$catalogName.not_existing_table") + .map(part => quoteIdentifier(part)).mkString("."), + ExpectedContext(s"$catalogName.not_existing_table", 12, + 11 + s"$catalogName.not_existing_table".length)) + } + + test("SPARK-33034: ALTER TABLE ... update column nullability") { + withTable(s"$catalogName.alt_table") { + testUpdateColumnNullability(s"$catalogName.alt_table") + } + // Update column nullability in not existing table + val e = intercept[AnalysisException] { + sql(s"ALTER TABLE $catalogName.not_existing_table ALTER COLUMN ID DROP NOT NULL") + } + checkErrorTableNotFound(e, s"`$catalogName`.`not_existing_table`", + ExpectedContext(s"$catalogName.not_existing_table", 12, + 11 + s"$catalogName.not_existing_table".length)) + } + + test("CREATE TABLE with table comment") { + withTable(s"$catalogName.new_table") { + val logAppender = new LogAppender("table comment") + withLogAppender(logAppender) { + sql(s"CREATE TABLE $catalogName.new_table(i INT) COMMENT 'this is a comment'") + } + val createCommentWarning = logAppender.loggingEvents + .filter(_.getLevel == Level.WARN) + .map(_.getMessage.getFormattedMessage) + .exists(_.contains("Cannot create JDBC table comment")) + assert(createCommentWarning === notSupportsTableComment) + } + } + + test("CREATE TABLE with table property") { + withTable(s"$catalogName.new_table") { + val m = intercept[AnalysisException] { + sql(s"CREATE TABLE $catalogName.new_table (i INT) TBLPROPERTIES('a'='1')") + }.message + assert(m.contains("Failed table creation")) + testCreateTableWithProperty(s"$catalogName.new_table") + } + } + + def supportsIndex: Boolean = false + + def supportListIndexes: Boolean = false + + def indexOptions: String = "" + + test("SPARK-36895: Test INDEX Using SQL") { + if (supportsIndex) { + withTable(s"$catalogName.new_table") { + sql(s"CREATE TABLE $catalogName.new_table(col1 INT, col2 INT, col3 INT," + + " col4 INT, col5 INT)") + val loaded = Catalogs.load(catalogName, conf) + val jdbcTable = loaded.asInstanceOf[TableCatalog] + .loadTable(Identifier.of(Array.empty[String], "new_table")) + .asInstanceOf[SupportsIndex] + assert(jdbcTable.indexExists("i1") == false) + assert(jdbcTable.indexExists("i2") == false) + + val indexType = "DUMMY" + var m = intercept[UnsupportedOperationException] { + sql(s"CREATE index i1 ON $catalogName.new_table USING $indexType (col1)") + }.getMessage + assert(m.contains(s"Index Type $indexType is not supported." 
+ + s" The supported Index Types are:")) + + sql(s"CREATE index i1 ON $catalogName.new_table USING BTREE (col1)") + assert(jdbcTable.indexExists("i1")) + if (supportListIndexes) { + val indexes = jdbcTable.listIndexes() + assert(indexes.size == 1) + assert(indexes.head.indexName() == "i1") + } + + sql(s"CREATE index i2 ON $catalogName.new_table (col2, col3, col5)" + + s" OPTIONS ($indexOptions)") + assert(jdbcTable.indexExists("i2")) + if (supportListIndexes) { + val indexes = jdbcTable.listIndexes() + assert(indexes.size == 2) + assert(indexes.map(_.indexName()).sorted === Array("i1", "i2")) + } + + // This should pass without exception + sql(s"CREATE index IF NOT EXISTS i1 ON $catalogName.new_table (col1)") + + checkError( + exception = intercept[IndexAlreadyExistsException] { + sql(s"CREATE index i1 ON $catalogName.new_table (col1)") + }, + errorClass = "INDEX_ALREADY_EXISTS", + parameters = Map("indexName" -> "i1", "tableName" -> "new_table") + ) + + sql(s"DROP index i1 ON $catalogName.new_table") + assert(jdbcTable.indexExists("i1") == false) + if (supportListIndexes) { + val indexes = jdbcTable.listIndexes() + assert(indexes.size == 1) + assert(indexes.head.indexName() == "i2") + } + + sql(s"DROP index i2 ON $catalogName.new_table") + assert(jdbcTable.indexExists("i2") == false) + if (supportListIndexes) { + assert(jdbcTable.listIndexes().isEmpty) + } + + // This should pass without exception + sql(s"DROP index IF EXISTS i1 ON $catalogName.new_table") + + checkError( + exception = intercept[NoSuchIndexException] { + sql(s"DROP index i1 ON $catalogName.new_table") + }, + errorClass = "INDEX_NOT_FOUND", + parameters = Map("indexName" -> "i1", "tableName" -> "new_table") + ) + } + } + } + + def supportsTableSample: Boolean = false + + private def checkSamplePushed(df: DataFrame, pushed: Boolean = true): Unit = { + val sample = df.queryExecution.optimizedPlan.collect { + case s: Sample => s + } + if (pushed) { + assert(sample.isEmpty) + } else { + assert(sample.nonEmpty) + } + } + + private def checkFilterPushed(df: DataFrame, pushed: Boolean = true): Unit = { + val filter = df.queryExecution.optimizedPlan.collect { + case f: Filter => f + } + if (pushed) { + assert(filter.isEmpty) + } else { + assert(filter.nonEmpty) + } + } + + private def limitPushed(df: DataFrame, limit: Int): Boolean = { + df.queryExecution.optimizedPlan.collect { + case relation: DataSourceV2ScanRelation => relation.scan match { + case v1: V1ScanWrapper => + return v1.pushedDownOperators.limit == Some(limit) + } + } + false + } + + private def checkColumnPruned(df: DataFrame, col: String): Unit = { + val scan = df.queryExecution.optimizedPlan.collectFirst { + case s: DataSourceV2ScanRelation => s + }.get + assert(scan.schema.names.sameElements(Seq(col))) + } + + test("SPARK-37038: Test TABLESAMPLE") { + if (supportsTableSample) { + withTable(s"$catalogName.new_table") { + sql(s"CREATE TABLE $catalogName.new_table (col1 INT, col2 INT)") + spark.range(10).select($"id" * 2, $"id" * 2 + 1).write.insertInto(s"$catalogName.new_table") + + // sample push down + column pruning + val df1 = sql(s"SELECT col1 FROM $catalogName.new_table TABLESAMPLE (BUCKET 6 OUT OF 10)" + + " REPEATABLE (12345)") + checkSamplePushed(df1) + checkColumnPruned(df1, "col1") + assert(df1.collect().length < 10) + + // sample push down only + val df2 = sql(s"SELECT * FROM $catalogName.new_table TABLESAMPLE (50 PERCENT)" + + " REPEATABLE (12345)") + checkSamplePushed(df2) + assert(df2.collect().length < 10) + + // sample(BUCKET ... 
OUT OF) push down + limit push down + column pruning + val df3 = sql(s"SELECT col1 FROM $catalogName.new_table TABLESAMPLE (BUCKET 6 OUT OF 10)" + + " LIMIT 2") + checkSamplePushed(df3) + assert(limitPushed(df3, 2)) + checkColumnPruned(df3, "col1") + assert(df3.collect().length <= 2) + + // sample(... PERCENT) push down + limit push down + column pruning + val df4 = sql(s"SELECT col1 FROM $catalogName.new_table" + + " TABLESAMPLE (50 PERCENT) REPEATABLE (12345) LIMIT 2") + checkSamplePushed(df4) + assert(limitPushed(df4, 2)) + checkColumnPruned(df4, "col1") + assert(df4.collect().length <= 2) + + // sample push down + filter push down + limit push down + val df5 = sql(s"SELECT * FROM $catalogName.new_table" + + " TABLESAMPLE (BUCKET 6 OUT OF 10) WHERE col1 > 0 LIMIT 2") + checkSamplePushed(df5) + checkFilterPushed(df5) + assert(limitPushed(df5, 2)) + assert(df5.collect().length <= 2) + + // sample + filter + limit + column pruning + // sample pushed down, filter/limit not pushed down, column pruned + // TODO: push down filter/limit + val df6 = sql(s"SELECT col1 FROM $catalogName.new_table" + + " TABLESAMPLE (BUCKET 6 OUT OF 10) WHERE col1 > 0 LIMIT 2") + checkSamplePushed(df6) + checkFilterPushed(df6, false) + assert(!limitPushed(df6, 2)) + checkColumnPruned(df6, "col1") + assert(df6.collect().length <= 2) + + // sample + limit + // Push down order is sample -> filter -> limit + // only limit is pushed down because sample comes after limit in this test + val df7 = spark.read.table(s"$catalogName.new_table").limit(2).sample(0.5) + checkSamplePushed(df7, false) + assert(limitPushed(df7, 2)) + + // sample + filter + // Push down order is sample -> filter -> limit + // only filter is pushed down because sample comes after filter in this test + val df8 = spark.read.table(s"$catalogName.new_table").where($"col1" > 1).sample(0.5) + checkSamplePushed(df8, false) + checkFilterPushed(df8) + assert(df8.collect().length < 10) + } + } + } + + protected def checkAggregateRemoved(df: DataFrame): Unit = { + val aggregates = df.queryExecution.optimizedPlan.collect { + case agg: Aggregate => agg + } + assert(aggregates.isEmpty) + } + + private def checkAggregatePushed(df: DataFrame, funcName: String): Unit = { + df.queryExecution.optimizedPlan.collect { + case DataSourceV2ScanRelation(_, scan, _, _, _) => + assert(scan.isInstanceOf[V1ScanWrapper]) + val wrapper = scan.asInstanceOf[V1ScanWrapper] + assert(wrapper.pushedDownOperators.aggregation.isDefined) + val aggregationExpressions = + wrapper.pushedDownOperators.aggregation.get.aggregateExpressions() + assert(aggregationExpressions.length == 1) + assert(aggregationExpressions(0).isInstanceOf[GeneralAggregateFunc]) + assert(aggregationExpressions(0).asInstanceOf[GeneralAggregateFunc].name() == funcName) + } + } + + protected def caseConvert(tableName: String): String = tableName + + private def withOrWithout(isDistinct: Boolean): String = if (isDistinct) "with" else "without" + + protected def testVarPop(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: VAR_POP ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql(s"SELECT VAR_POP(${distinct}bonus) FROM $catalogAndNamespace." 
+ + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "VAR_POP") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 10000.0) + assert(row(1).getDouble(0) === 2500.0) + assert(row(2).getDouble(0) === 0.0) + } + } + + protected def testVarSamp(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: VAR_SAMP ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT VAR_SAMP(${distinct}bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "VAR_SAMP") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 20000.0) + assert(row(1).getDouble(0) === 5000.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testStddevPop(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: STDDEV_POP ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT STDDEV_POP(${distinct}bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "STDDEV_POP") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 100.0) + assert(row(1).getDouble(0) === 50.0) + assert(row(2).getDouble(0) === 0.0) + } + } + + protected def testStddevSamp(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: STDDEV_SAMP ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT STDDEV_SAMP(${distinct}bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "STDDEV_SAMP") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 141.4213562373095) + assert(row(1).getDouble(0) === 70.71067811865476) + assert(row(2).isNullAt(0)) + } + } + + protected def testCovarPop(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: COVAR_POP ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT COVAR_POP(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "COVAR_POP") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 10000.0) + assert(row(1).getDouble(0) === 2500.0) + assert(row(2).getDouble(0) === 0.0) + } + } + + protected def testCovarSamp(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: COVAR_SAMP ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT COVAR_SAMP(${distinct}bonus, bonus) FROM $catalogAndNamespace." 
+ + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "COVAR_SAMP") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 20000.0) + assert(row(1).getDouble(0) === 5000.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testCorr(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: CORR ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT CORR(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "CORR") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 1.0) + assert(row(1).getDouble(0) === 1.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrIntercept(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_INTERCEPT ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_INTERCEPT(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_INTERCEPT") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 0.0) + assert(row(1).getDouble(0) === 0.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrSlope(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_SLOPE ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_SLOPE(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_SLOPE") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 1.0) + assert(row(1).getDouble(0) === 1.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrR2(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_R2 ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_R2(${distinct}bonus, bonus) FROM $catalogAndNamespace." + + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_R2") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 1.0) + assert(row(1).getDouble(0) === 1.0) + assert(row(2).isNullAt(0)) + } + } + + protected def testRegrSXY(isDistinct: Boolean = false): Unit = { + val distinct = if (isDistinct) "DISTINCT " else "" + test(s"scan with aggregate push-down: REGR_SXY ${withOrWithout(isDistinct)} DISTINCT") { + val df = sql( + s"SELECT REGR_SXY(${distinct}bonus, bonus) FROM $catalogAndNamespace." 
+ + s"${caseConvert("employee")} WHERE dept > 0 GROUP BY dept ORDER BY dept") + checkFilterPushed(df) + checkAggregateRemoved(df) + checkAggregatePushed(df, "REGR_SXY") + val row = df.collect() + assert(row.length === 3) + assert(row(0).getDouble(0) === 20000.0) + assert(row(1).getDouble(0) === 5000.0) + assert(row(2).getDouble(0) === 0.0) + } + } +} diff --git a/external/docker-integration-tests/src/test/scala/org/apache/spark/util/DockerUtils.scala b/connector/docker-integration-tests/src/test/scala/org/apache/spark/util/DockerUtils.scala similarity index 100% rename from external/docker-integration-tests/src/test/scala/org/apache/spark/util/DockerUtils.scala rename to connector/docker-integration-tests/src/test/scala/org/apache/spark/util/DockerUtils.scala diff --git a/external/docker/README.md b/connector/docker/README.md similarity index 100% rename from external/docker/README.md rename to connector/docker/README.md diff --git a/connector/docker/build b/connector/docker/build new file mode 100755 index 0000000000000..de83c7d7611dc --- /dev/null +++ b/connector/docker/build @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +docker images > /dev/null || { echo Please install docker in non-sudo mode. ; exit; } + +./spark-test/build \ No newline at end of file diff --git a/external/docker/spark-test/README.md b/connector/docker/spark-test/README.md similarity index 100% rename from external/docker/spark-test/README.md rename to connector/docker/spark-test/README.md diff --git a/external/docker/spark-test/base/Dockerfile b/connector/docker/spark-test/base/Dockerfile similarity index 100% rename from external/docker/spark-test/base/Dockerfile rename to connector/docker/spark-test/base/Dockerfile diff --git a/connector/docker/spark-test/build b/connector/docker/spark-test/build new file mode 100755 index 0000000000000..55dff4754b000 --- /dev/null +++ b/connector/docker/spark-test/build @@ -0,0 +1,22 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# + +docker build -t spark-test-base spark-test/base/ +docker build -t spark-test-master spark-test/master/ +docker build -t spark-test-worker spark-test/worker/ diff --git a/external/docker/spark-test/master/Dockerfile b/connector/docker/spark-test/master/Dockerfile similarity index 100% rename from external/docker/spark-test/master/Dockerfile rename to connector/docker/spark-test/master/Dockerfile diff --git a/connector/docker/spark-test/master/default_cmd b/connector/docker/spark-test/master/default_cmd new file mode 100755 index 0000000000000..6865ca41b894f --- /dev/null +++ b/connector/docker/spark-test/master/default_cmd @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') +echo "CONTAINER_IP=$IP" +export SPARK_LOCAL_IP=$IP +export SPARK_PUBLIC_DNS=$IP + +/opt/spark/bin/spark-class org.apache.spark.deploy.master.Master -i $IP diff --git a/external/docker/spark-test/worker/Dockerfile b/connector/docker/spark-test/worker/Dockerfile similarity index 100% rename from external/docker/spark-test/worker/Dockerfile rename to connector/docker/spark-test/worker/Dockerfile diff --git a/connector/docker/spark-test/worker/default_cmd b/connector/docker/spark-test/worker/default_cmd new file mode 100755 index 0000000000000..1f2aac95ed699 --- /dev/null +++ b/connector/docker/spark-test/worker/default_cmd @@ -0,0 +1,25 @@ +#!/usr/bin/env bash + +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
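#
# What the lines below do: `ip -o -4 addr list eth0` prints the IPv4 address
# assigned to eth0 on a single line, and the perl one-liner captures the
# address that precedes the "/<prefix>" part. For example, an illustrative
# output line such as
#   2: eth0    inet 172.17.0.2/16 brd 172.17.255.255 scope global eth0
# yields IP=172.17.0.2. That address is exported as SPARK_LOCAL_IP and
# SPARK_PUBLIC_DNS so the standalone Master advertises the container's own IP
# when it is started with `-i $IP`.
#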
+# + +IP=$(ip -o -4 addr list eth0 | perl -n -e 'if (m{inet\s([\d\.]+)\/\d+\s}xms) { print $1 }') +echo "CONTAINER_IP=$IP" +export SPARK_LOCAL_IP=$IP +export SPARK_PUBLIC_DNS=$IP + +/opt/spark/bin/spark-class org.apache.spark.deploy.worker.Worker $1 diff --git a/connector/kafka-0-10-assembly/pom.xml b/connector/kafka-0-10-assembly/pom.xml new file mode 100644 index 0000000000000..b7223fb99ccc6 --- /dev/null +++ b/connector/kafka-0-10-assembly/pom.xml @@ -0,0 +1,180 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + spark-streaming-kafka-0-10-assembly_2.12 + jar + Spark Integration for Kafka 0.10 Assembly + https://spark.apache.org/ + + + streaming-kafka-0-10-assembly + + + + + org.apache.spark + spark-streaming-kafka-0-10_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + provided + + + + commons-codec + commons-codec + provided + + + commons-lang + commons-lang + provided + + + com.google.protobuf + protobuf-java + provided + + + org.lz4 + lz4-java + provided + + + org.apache.hadoop + ${hadoop-client-api.artifact} + ${hadoop.version} + provided + + + org.apache.hadoop + ${hadoop-client-runtime.artifact} + ${hadoop.version} + + + org.apache.avro + avro-mapred + provided + + + org.apache.curator + curator-recipes + provided + + + org.apache.zookeeper + zookeeper + provided + + + org.apache.logging.log4j + log4j-api + provided + + + org.apache.logging.log4j + log4j-core + provided + + + org.apache.logging.log4j + log4j-1.2-api + provided + + + org.scala-lang + scala-library + provided + + + org.slf4j + slf4j-api + provided + + + org.xerial.snappy + snappy-java + provided + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.apache.maven.plugins + maven-shade-plugin + + false + + + *:* + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + package + + shade + + + + + + reference.conf + + + log4j2.properties + + + + + + + + + + + + diff --git a/connector/kafka-0-10-sql/pom.xml b/connector/kafka-0-10-sql/pom.xml new file mode 100644 index 0000000000000..9a2186213de78 --- /dev/null +++ b/connector/kafka-0-10-sql/pom.xml @@ -0,0 +1,183 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + org.apache.spark + spark-sql-kafka-0-10_2.12 + + sql-kafka-0-10 + + jar + Kafka 0.10+ Source for Structured Streaming + http://spark.apache.org/ + + + + org.apache.spark + spark-token-provider-kafka-0-10_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + provided + + + org.apache.spark + spark-token-provider-kafka-0-10_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + com.github.luben + zstd-jni + + + + + com.google.code.findbugs + jsr305 + + + org.apache.commons + commons-pool2 + ${commons-pool2.version} + + + org.apache.kafka + kafka_${scala.binary.version} + ${kafka.version} + test + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + 
jackson-annotations + + + + + org.apache.hadoop + hadoop-minikdc + + + + org.apache.zookeeper + zookeeper + 3.5.7 + test + + + net.sf.jopt-simple + jopt-simple + 3.2 + test + + + org.eclipse.jetty + jetty-servlet + ${jetty.version} + test + + + org.mockito + mockito-core + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.apache.spark + spark-tags_${scala.binary.version} + + + org.jmock + jmock-junit4 + test + + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + diff --git a/external/kafka-0-10-sql/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister b/connector/kafka-0-10-sql/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister similarity index 100% rename from external/kafka-0-10-sql/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister rename to connector/kafka-0-10-sql/src/main/resources/META-INF/services/org.apache.spark.sql.sources.DataSourceRegister diff --git a/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-classes.json b/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-classes.json new file mode 100644 index 0000000000000..ea7ffb592a555 --- /dev/null +++ b/connector/kafka-0-10-sql/src/main/resources/error/kafka-error-classes.json @@ -0,0 +1,26 @@ +{ + "MISMATCHED_TOPIC_PARTITIONS_BETWEEN_END_OFFSET_AND_PREFETCHED" : { + "message" : [ + "Kafka data source in Trigger.AvailableNow should provide the same topic partitions in pre-fetched offset to end offset for each microbatch. The error could be transient - restart your query, and report if you still see the same issue.", + "topic-partitions for pre-fetched offset: , topic-partitions for end offset: ." + ] + }, + "END_OFFSET_HAS_GREATER_OFFSET_FOR_TOPIC_PARTITION_THAN_PREFETCHED" : { + "message" : [ + "For Kafka data source with Trigger.AvailableNow, end offset should have lower or equal offset per each topic partition than pre-fetched offset. The error could be transient - restart your query, and report if you still see the same issue.", + "pre-fetched offset: , end offset: ." + ] + }, + "LOST_TOPIC_PARTITIONS_IN_END_OFFSET_WITH_TRIGGER_AVAILABLENOW" : { + "message" : [ + "Some of partitions in Kafka topic(s) have been lost during running query with Trigger.AvailableNow. The error could be transient - restart your query, and report if you still see the same issue.", + "topic-partitions for latest offset: , topic-partitions for end offset: " + ] + }, + "END_OFFSET_HAS_GREATER_OFFSET_FOR_TOPIC_PARTITION_THAN_LATEST_WITH_TRIGGER_AVAILABLENOW" : { + "message" : [ + "Some of partitions in Kafka topic(s) report available offset which is less than end offset during running query with Trigger.AvailableNow. 
The error could be transient - restart your query, and report if you still see the same issue.", + "latest offset: , end offset: " + ] + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/ConsumerStrategy.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/JsonUtils.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatch.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchPartitionReader.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchWrite.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchWrite.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchWrite.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaBatchWrite.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaContinuousStream.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataWriter.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataWriter.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataWriter.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaDataWriter.scala diff --git a/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala new file mode 100644 index 0000000000000..b0e30f37af51f --- /dev/null +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaExceptions.scala @@ -0,0 +1,80 @@ +/* + * Licensed to the 
Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.kafka010 + +import org.apache.kafka.common.TopicPartition + +import org.apache.spark.{ErrorClassesJsonReader, SparkException} + +object KafkaExceptions { + private val errorClassesJsonReader: ErrorClassesJsonReader = + new ErrorClassesJsonReader( + Seq(getClass.getClassLoader.getResource("error/kafka-error-classes.json"))) + + def mismatchedTopicPartitionsBetweenEndOffsetAndPrefetched( + tpsForPrefetched: Set[TopicPartition], + tpsForEndOffset: Set[TopicPartition]): SparkException = { + val errMsg = errorClassesJsonReader.getErrorMessage( + "MISMATCHED_TOPIC_PARTITIONS_BETWEEN_END_OFFSET_AND_PREFETCHED", + Map( + "tpsForPrefetched" -> tpsForPrefetched.toString(), + "tpsForEndOffset" -> tpsForEndOffset.toString() + ) + ) + new SparkException(errMsg) + } + + def endOffsetHasGreaterOffsetForTopicPartitionThanPrefetched( + prefetchedOffset: Map[TopicPartition, Long], + endOffset: Map[TopicPartition, Long]): SparkException = { + val errMsg = errorClassesJsonReader.getErrorMessage( + "END_OFFSET_HAS_GREATER_OFFSET_FOR_TOPIC_PARTITION_THAN_PREFETCHED", + Map( + "prefetchedOffset" -> prefetchedOffset.toString(), + "endOffset" -> endOffset.toString() + ) + ) + new SparkException(errMsg) + } + + def lostTopicPartitionsInEndOffsetWithTriggerAvailableNow( + tpsForLatestOffset: Set[TopicPartition], + tpsForEndOffset: Set[TopicPartition]): SparkException = { + val errMsg = errorClassesJsonReader.getErrorMessage( + "LOST_TOPIC_PARTITIONS_IN_END_OFFSET_WITH_TRIGGER_AVAILABLENOW", + Map( + "tpsForLatestOffset" -> tpsForLatestOffset.toString(), + "tpsForEndOffset" -> tpsForEndOffset.toString() + ) + ) + new SparkException(errMsg) + } + + def endOffsetHasGreaterOffsetForTopicPartitionThanLatestWithTriggerAvailableNow( + latestOffset: Map[TopicPartition, Long], + endOffset: Map[TopicPartition, Long]): SparkException = { + val errMsg = errorClassesJsonReader.getErrorMessage( + "END_OFFSET_HAS_GREATER_OFFSET_FOR_TOPIC_PARTITION_THAN_LATEST_WITH_TRIGGER_AVAILABLENOW", + Map( + "latestOffset" -> latestOffset.toString(), + "endOffset" -> endOffset.toString() + ) + ) + new SparkException(errMsg) + } +} diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala similarity index 88% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala index 77bc658a1ef20..53063fe4d1f5b 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala +++ 
b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchStream.scala @@ -85,8 +85,6 @@ private[kafka010] class KafkaMicroBatchStream( private val includeHeaders = options.getBoolean(INCLUDE_HEADERS, false) - private var endPartitionOffsets: KafkaSourceOffset = _ - private var latestPartitionOffsets: PartitionOffsetMap = _ private var allDataForTriggerAvailableNow: PartitionOffsetMap = _ @@ -114,7 +112,7 @@ private[kafka010] class KafkaMicroBatchStream( } override def reportLatestOffset(): Offset = { - KafkaSourceOffset(latestPartitionOffsets) + Option(KafkaSourceOffset(latestPartitionOffsets)).filterNot(_.partitionToOffsets.isEmpty).orNull } override def latestOffset(): Offset = { @@ -163,8 +161,7 @@ private[kafka010] class KafkaMicroBatchStream( }.getOrElse(latestPartitionOffsets) } - endPartitionOffsets = KafkaSourceOffset(offsets) - endPartitionOffsets + Option(KafkaSourceOffset(offsets)).filterNot(_.partitionToOffsets.isEmpty).orNull } /** Checks if we need to skip this trigger based on minOffsetsPerTrigger & maxTriggerDelay */ @@ -194,6 +191,10 @@ private[kafka010] class KafkaMicroBatchStream( val startPartitionOffsets = start.asInstanceOf[KafkaSourceOffset].partitionToOffsets val endPartitionOffsets = end.asInstanceOf[KafkaSourceOffset].partitionToOffsets + if (allDataForTriggerAvailableNow != null) { + verifyEndOffsetForTriggerAvailableNow(endPartitionOffsets) + } + val offsetRanges = kafkaOffsetReader.getOffsetRangesFromResolvedOffsets( startPartitionOffsets, endPartitionOffsets, @@ -316,6 +317,50 @@ private[kafka010] class KafkaMicroBatchStream( } } + private def verifyEndOffsetForTriggerAvailableNow( + endPartitionOffsets: Map[TopicPartition, Long]): Unit = { + val tpsForPrefetched = allDataForTriggerAvailableNow.keySet + val tpsForEndOffset = endPartitionOffsets.keySet + + if (tpsForPrefetched != tpsForEndOffset) { + throw KafkaExceptions.mismatchedTopicPartitionsBetweenEndOffsetAndPrefetched( + tpsForPrefetched, tpsForEndOffset) + } + + val endOffsetHasGreaterThanPrefetched = { + allDataForTriggerAvailableNow.keySet.exists { tp => + val offsetFromPrefetched = allDataForTriggerAvailableNow(tp) + val offsetFromEndOffset = endPartitionOffsets(tp) + offsetFromEndOffset > offsetFromPrefetched + } + } + if (endOffsetHasGreaterThanPrefetched) { + throw KafkaExceptions.endOffsetHasGreaterOffsetForTopicPartitionThanPrefetched( + allDataForTriggerAvailableNow, endPartitionOffsets) + } + + val latestOffsets = kafkaOffsetReader.fetchLatestOffsets(Some(endPartitionOffsets)) + val tpsForLatestOffsets = latestOffsets.keySet + + if (!tpsForEndOffset.subsetOf(tpsForLatestOffsets)) { + throw KafkaExceptions.lostTopicPartitionsInEndOffsetWithTriggerAvailableNow( + tpsForLatestOffsets, tpsForEndOffset) + } + + val endOffsetHasGreaterThenLatest = { + tpsForEndOffset.exists { tp => + val offsetFromLatest = latestOffsets(tp) + val offsetFromEndOffset = endPartitionOffsets(tp) + offsetFromEndOffset > offsetFromLatest + } + } + if (endOffsetHasGreaterThenLatest) { + throw KafkaExceptions + .endOffsetHasGreaterOffsetForTopicPartitionThanLatestWithTriggerAvailableNow( + latestOffsets, endPartitionOffsets) + } + } + override def prepareForTriggerAvailableNow(): Unit = { allDataForTriggerAvailableNow = kafkaOffsetReader.fetchLatestOffsets( Some(getOrCreateInitialPartitionOffsets())) diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala 
b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculator.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeLimit.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeLimit.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeLimit.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeLimit.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReader.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala similarity index 99% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala index 25c8cb8d518e9..b443bbcee0fc3 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderAdmin.scala @@ -243,7 +243,7 @@ private[kafka010] class KafkaOffsetReaderAdmin( } tp -> offset - }.toMap + } } private def fetchSpecificOffsets0( diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala similarity index 99% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala index cdd269216874a..10c7488de8968 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderConsumer.scala @@ -291,7 +291,7 @@ private[kafka010] class KafkaOffsetReaderConsumer( } tp -> offset - }.toMap + } } private def fetchSpecificOffsets0( diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRecordToRowConverter.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRecordToRowConverter.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRecordToRowConverter.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRecordToRowConverter.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala 
b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaRelation.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSink.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala similarity index 88% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala index c82fda85eb4e8..f5d4abb569a31 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSource.scala @@ -162,7 +162,7 @@ private[kafka010] class KafkaSource( } override def reportLatestOffset(): streaming.Offset = { - latestPartitionOffsets.map(KafkaSourceOffset(_)).getOrElse(null) + latestPartitionOffsets.map(KafkaSourceOffset(_)).orNull } override def latestOffset(startOffset: streaming.Offset, limit: ReadLimit): streaming.Offset = { @@ -177,7 +177,7 @@ private[kafka010] class KafkaSource( kafkaReader.fetchLatestOffsets(currentOffsets) } - latestPartitionOffsets = Some(latest) + latestPartitionOffsets = if (latest.isEmpty) None else Some(latest) val limits: Seq[ReadLimit] = limit match { case rows: CompositeReadLimit => rows.getReadLimits @@ -213,7 +213,7 @@ private[kafka010] class KafkaSource( } currentPartitionOffsets = Some(offsets) logDebug(s"GetOffset: ${offsets.toSeq.map(_.toString).sorted}") - KafkaSourceOffset(offsets) + Option(KafkaSourceOffset(offsets)).filterNot(_.partitionToOffsets.isEmpty).orNull } /** Checks if we need to skip this trigger based on minOffsetsPerTrigger & maxTriggerDelay */ @@ -293,6 +293,11 @@ private[kafka010] class KafkaSource( logInfo(s"GetBatch called with start = $start, end = $end") val untilPartitionOffsets = KafkaSourceOffset.getPartitionOffsets(end) + + if (allDataForTriggerAvailableNow != null) { + verifyEndOffsetForTriggerAvailableNow(untilPartitionOffsets) + } + // On recovery, getBatch will get called before getOffset if (currentPartitionOffsets.isEmpty) { currentPartitionOffsets = Some(untilPartitionOffsets) @@ -349,6 +354,50 @@ private[kafka010] class KafkaSource( } } + private def verifyEndOffsetForTriggerAvailableNow( + endPartitionOffsets: Map[TopicPartition, Long]): Unit = { + val tpsForPrefetched = allDataForTriggerAvailableNow.keySet + val tpsForEndOffset = endPartitionOffsets.keySet + + if (tpsForPrefetched != tpsForEndOffset) { + throw KafkaExceptions.mismatchedTopicPartitionsBetweenEndOffsetAndPrefetched( + tpsForPrefetched, tpsForEndOffset) + } + + val endOffsetHasGreaterThanPrefetched = { + allDataForTriggerAvailableNow.keySet.exists { tp => + val offsetFromPrefetched = allDataForTriggerAvailableNow(tp) + val offsetFromEndOffset = endPartitionOffsets(tp) + offsetFromEndOffset > offsetFromPrefetched + } + 
} + if (endOffsetHasGreaterThanPrefetched) { + throw KafkaExceptions.endOffsetHasGreaterOffsetForTopicPartitionThanPrefetched( + allDataForTriggerAvailableNow, endPartitionOffsets) + } + + val latestOffsets = kafkaReader.fetchLatestOffsets(Some(endPartitionOffsets)) + val tpsForLatestOffsets = latestOffsets.keySet + + if (!tpsForEndOffset.subsetOf(tpsForLatestOffsets)) { + throw KafkaExceptions.lostTopicPartitionsInEndOffsetWithTriggerAvailableNow( + tpsForLatestOffsets, tpsForEndOffset) + } + + val endOffsetHasGreaterThenLatest = { + tpsForEndOffset.exists { tp => + val offsetFromLatest = latestOffsets(tp) + val offsetFromEndOffset = endPartitionOffsets(tp) + offsetFromEndOffset > offsetFromLatest + } + } + if (endOffsetHasGreaterThenLatest) { + throw KafkaExceptions + .endOffsetHasGreaterOffsetForTopicPartitionThanLatestWithTriggerAvailableNow( + latestOffsets, endPartitionOffsets) + } + } + override def prepareForTriggerAvailableNow(): Unit = { allDataForTriggerAvailableNow = kafkaReader.fetchLatestOffsets(Some(initialPartitionOffsets)) } diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceInitialOffsetWriter.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceInitialOffsetWriter.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceInitialOffsetWriter.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceInitialOffsetWriter.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceOffset.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceProvider.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaSourceRDD.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWrite.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWrite.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWrite.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaStreamingWrite.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWrite.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWrite.scala similarity index 100% rename from 
external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWrite.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWrite.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriteTask.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/KafkaWriter.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPool.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPool.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala similarity index 96% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala index 37fe38ea94ece..d88e9821489cf 100644 --- a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala +++ b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumer.scala @@ -267,20 +267,18 @@ private[kafka010] class KafkaDataConsumer( * within [offset, untilOffset). * * This method also will try its best to detect data loss. If `failOnDataLoss` is `true`, it will - * throw an exception when we detect an unavailable offset. If `failOnDataLoss` is `false`, this - * method will try to fetch next available record within [offset, untilOffset). - * - * When this method tries to skip offsets due to either invisible messages or data loss and - * reaches `untilOffset`, it will return `null`. + * throw an exception when it detects an unavailable offset. If `failOnDataLoss` is `false`, this + * method will try to fetch next available record within [offset, untilOffset). When this method + * reaches `untilOffset` and still can't find an available record, it will return `null`. * * @param offset the offset to fetch. 
* @param untilOffset the max offset to fetch. Exclusive. + * @param pollTimeoutMs timeout in milliseconds to poll data from Kafka. + * @param failOnDataLoss When `failOnDataLoss` is `true`, this method will either return record at - * offset if available, or throw exception.when `failOnDataLoss` is `false`, - * this method will either return record at offset if available, or return - * the next earliest available record less than untilOffset, or null. It - * will not throw any exception. + * offset if available, or throw an exception. When `failOnDataLoss` is + * `false`, this method will return the record at offset if available, or return + * the record at the next earliest available offset that is less than + * untilOffset, or null otherwise. */ def get( offset: Long, @@ -298,9 +296,10 @@ private[kafka010] class KafkaDataConsumer( s"requested $offset") // The following loop is basically for `failOnDataLoss = false`. When `failOnDataLoss` is - // `false`, first, we will try to fetch the record at `offset`. If no such record exists, then - // we will move to the next available offset within `[offset, untilOffset)` and retry. - // If `failOnDataLoss` is `true`, the loop body will be executed only once. + // `false`, we will first try to fetch the record at `offset`; if no such record exists, we will + // try to fetch the next available record within [offset, untilOffset). + // If `failOnDataLoss` is `true`, the loop body is executed only once: it either returns the + // record at `offset` or throws an exception when the record does not exist. var toFetchOffset = offset + var fetchedRecord: FetchedRecord = null + // We want to break out of the while loop on a successful fetch to avoid using "return" @@ -452,7 +451,7 @@ private[kafka010] class KafkaDataConsumer( /** * Get the fetched record for the given offset if available. * - * If the record is invisible (either a transaction message, or an aborted message when the + * If the record is invisible (either a transaction message, or an aborted message when the * consumer's `isolation.level` is `read_committed`), it will return a `FetchedRecord` with the * next offset to fetch. 
* diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package-info.java b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package-info.java similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package-info.java rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package-info.java diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/package.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/CachedKafkaProducer.scala diff --git a/external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala b/connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala similarity index 100% rename from external/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala rename to connector/kafka-0-10-sql/src/main/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPool.scala diff --git a/external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-future-version.bin b/connector/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-future-version.bin similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-future-version.bin rename to connector/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-future-version.bin diff --git a/external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin b/connector/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin rename to connector/kafka-0-10-sql/src/test/resources/kafka-source-initial-offset-version-2.1.0.bin diff --git a/external/kafka-0-10-sql/src/test/resources/kafka-source-offset-version-2.1.0.txt b/connector/kafka-0-10-sql/src/test/resources/kafka-source-offset-version-2.1.0.txt similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/kafka-source-offset-version-2.1.0.txt rename to connector/kafka-0-10-sql/src/test/resources/kafka-source-offset-version-2.1.0.txt diff --git a/external/kafka-0-10-sql/src/test/resources/log4j2.properties b/connector/kafka-0-10-sql/src/test/resources/log4j2.properties similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/log4j2.properties rename to connector/kafka-0-10-sql/src/test/resources/log4j2.properties diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/commits/0 
b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/commits/0 similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/commits/0 rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/commits/0 diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/metadata b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/metadata similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/metadata rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/metadata diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/offsets/0 b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/offsets/0 similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/offsets/0 rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/offsets/0 diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/sources/0/0 b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/sources/0/0 similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/sources/0/0 rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/sources/0/0 diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/0/1.delta b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/0/1.delta similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/0/1.delta rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/0/1.delta diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/1/1.delta b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/1/1.delta similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/1/1.delta rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/1/1.delta diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/2/1.delta 
b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/2/1.delta similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/2/1.delta rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/2/1.delta diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/3/1.delta b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/3/1.delta similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/3/1.delta rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/3/1.delta diff --git a/external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/4/1.delta b/connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/4/1.delta similarity index 100% rename from external/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/4/1.delta rename to connector/kafka-0-10-sql/src/test/resources/structured-streaming/checkpoint-version-2.4.3-kafka-include-headers-default/state/0/4/1.delta diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/ConsumerStrategySuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/JsonUtilsSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousSourceSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaContinuousTest.scala diff --git 
a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDelegationTokenSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaDontFailOnDataLossSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala similarity index 91% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala index db71f0fd9184a..d63b9805e5530 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaMicroBatchSourceSuite.scala @@ -25,6 +25,7 @@ import java.util.concurrent.ConcurrentLinkedQueue import java.util.concurrent.atomic.AtomicInteger import scala.collection.JavaConverters._ +import scala.collection.mutable.ListBuffer import scala.io.Source import scala.util.Random @@ -32,19 +33,22 @@ import org.apache.commons.io.FileUtils import org.apache.kafka.clients.producer.{ProducerRecord, RecordMetadata} import org.apache.kafka.common.TopicPartition import org.scalatest.concurrent.PatienceConfiguration.Timeout +import org.scalatest.matchers.should._ import org.scalatest.time.SpanSugar._ +import org.apache.spark.TestUtils import org.apache.spark.sql.{Dataset, ForeachWriter, Row, SparkSession} import org.apache.spark.sql.catalyst.util.CaseInsensitiveMap import org.apache.spark.sql.connector.read.streaming.SparkDataStream import org.apache.spark.sql.execution.datasources.v2.StreamingDataSourceV2Relation import org.apache.spark.sql.execution.exchange.ReusedExchangeExec import org.apache.spark.sql.execution.streaming._ +import org.apache.spark.sql.execution.streaming.AsyncProgressTrackingMicroBatchExecution.{ASYNC_PROGRESS_TRACKING_CHECKPOINTING_INTERVAL_MS, ASYNC_PROGRESS_TRACKING_ENABLED} import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution -import org.apache.spark.sql.functions.{count, window} +import org.apache.spark.sql.functions.{count, expr, window} import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.kafka010.KafkaSourceProvider._ -import org.apache.spark.sql.streaming.{StreamingQuery, StreamTest, Trigger} +import org.apache.spark.sql.streaming.{StreamingQuery, StreamingQueryException, StreamTest, Trigger} import org.apache.spark.sql.streaming.util.StreamManualClock import org.apache.spark.sql.test.SharedSparkSession import org.apache.spark.sql.util.CaseInsensitiveStringMap @@ -120,7 +124,7 @@ abstract class KafkaSourceTest extends 
StreamTest with SharedSparkSession with K val sources: Seq[SparkDataStream] = { query.get.logicalPlan.collect { - case StreamingExecutionRelation(source: KafkaSource, _) => source + case StreamingExecutionRelation(source: KafkaSource, _, _) => source case r: StreamingDataSourceV2Relation if r.stream.isInstanceOf[KafkaMicroBatchStream] || r.stream.isInstanceOf[KafkaContinuousStream] => r.stream @@ -179,7 +183,7 @@ abstract class KafkaSourceTest extends StreamTest with SharedSparkSession with K protected def newTopic(): String = s"topic-${topicId.getAndIncrement()}" } -abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { +abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase with Matchers { import testImplicits._ @@ -195,6 +199,89 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { true } + /** + * Test async progress tracking capability with Kafka source and sink + */ + test("async progress tracking") { + val inputTopic = newTopic() + testUtils.createTopic(inputTopic, partitions = 5) + + val dataSent = new ListBuffer[String]() + testUtils.sendMessages(inputTopic, (0 until 15).map { case x => + val m = s"foo-$x" + dataSent += m + m + }.toArray, Some(0)) + + val outputTopic = newTopic() + testUtils.createTopic(outputTopic, partitions = 5) + + withTempDir { dir => + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 5) + .option("subscribe", inputTopic) + .option("startingOffsets", "earliest") + .load() + + def startQuery(): StreamingQuery = { + reader.writeStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.max.block.ms", "5000") + .option("topic", outputTopic) + .option("checkpointLocation", dir.getCanonicalPath) + .option(ASYNC_PROGRESS_TRACKING_ENABLED, true) + .option(ASYNC_PROGRESS_TRACKING_CHECKPOINTING_INTERVAL_MS, 1000) + .queryName("kafkaStream") + .start() + } + + def readResults(): List[String] = { + spark.read + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("startingOffsets", "earliest") + .option("subscribe", outputTopic) + .load() + .select(expr("CAST(value AS string)")) + .toDF + .collect().map(_.getAs[String]("value")).toList + } + + val query = startQuery() + try { + query.processAllAvailable() + } finally { + query.stop() + } + + val data = readResults() + data should equal (dataSent) + + // Restart query + + testUtils.sendMessages(inputTopic, (15 until 30).map { case x => + val m = s"foo-$x" + dataSent += m + m + }.toArray, Some(0)) + + val query2 = startQuery() + try { + query2.processAllAvailable() + } finally { + query2.stop() + } + + val data2 = readResults() + data2.toSet should equal (dataSent.toSet) + } + } + test("Trigger.AvailableNow") { val topic = newTopic() testUtils.createTopic(topic, partitions = 5) @@ -234,6 +321,114 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { assert(index == 3) } + test("Query with Trigger.AvailableNow should throw error when topic partitions got unavailable " + + "during subsequent batches") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + + testUtils.sendMessages(topic, (0 until 15).map { case x => + s"foo-$x" + }.toArray, Some(0)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + 
.option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 5) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + // the query should fail regardless of this option + .option("failOnDataLoss", "true") + .load() + + def startTriggerAvailableNowQuery(): StreamingQuery = { + reader.writeStream + .foreachBatch((_: Dataset[Row], batchId: Long) => { + testUtils.deleteTopic(topic) + // create partitions less than the kafka data source figured out as an end state + testUtils.createTopic(topic, partitions = 3) + // offset will keep the same + testUtils.sendMessages(topic, (0 until 15).map { case x => + s"foo-$x" + }.toArray, Some(0)) + null.asInstanceOf[Unit] + }) + .trigger(Trigger.AvailableNow) + .start() + } + + // SPARK-41996 - Increase query termination timeout to ensure that + // Kafka operations can be completed + val queryTimeout = 300.seconds + val exc = intercept[Exception] { + val query = startTriggerAvailableNowQuery() + try { + assert(query.awaitTermination(queryTimeout.toMillis)) + } finally { + query.stop() + } + } + TestUtils.assertExceptionMsg(exc, "Some of partitions in Kafka topic(s) have been lost " + + "during running query with Trigger.AvailableNow.") + TestUtils.assertExceptionMsg(exc, "topic-partitions for latest offset: ") + TestUtils.assertExceptionMsg(exc, "topic-partitions for end offset: ") + } + + test("Query with Trigger.AvailableNow should throw error when offset(s) in planned topic " + + "partitions got unavailable during subsequent batches") { + val topic = newTopic() + testUtils.createTopic(topic, partitions = 5) + + testUtils.sendMessages(topic, (0 until 15).map { case x => + s"foo-$x" + }.toArray, Some(0)) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("maxOffsetsPerTrigger", 5) + .option("subscribe", topic) + .option("startingOffsets", "earliest") + // the query should fail regardless of this option + .option("failOnDataLoss", "true") + .load() + + def startTriggerAvailableNowQuery(): StreamingQuery = { + reader.writeStream + .foreachBatch((_: Dataset[Row], batchId: Long) => { + testUtils.deleteTopic(topic) + // the number of topic partitions remain the same + testUtils.createTopic(topic, partitions = 5) + // the number of available records will change to lower than the end state + testUtils.sendMessages(topic, (0 until 10).map { case x => + s"foo-$x" + }.toArray, Some(0)) + null.asInstanceOf[Unit] + }) + .trigger(Trigger.AvailableNow) + .start() + } + + // SPARK-41996 - Increase query termination timeout to ensure that + // Kafka operations can be completed + val queryTimeout = 300.seconds + val exc = intercept[StreamingQueryException] { + val query = startTriggerAvailableNowQuery() + try { + assert(query.awaitTermination(queryTimeout.toMillis)) + } finally { + query.stop() + } + } + TestUtils.assertExceptionMsg(exc, "Some of partitions in Kafka topic(s) report available" + + " offset which is less than end offset during running query with Trigger.AvailableNow.") + TestUtils.assertExceptionMsg(exc, "latest offset: ") + TestUtils.assertExceptionMsg(exc, "end offset: ") + } + test("(de)serialization of initial offsets") { val topic = newTopic() testUtils.createTopic(topic, partitions = 5) @@ -338,6 +533,7 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { ) // When Trigger.Once() is used, the read limit should be ignored + // NOTE: the test uses the deprecated 
Trigger.Once() by intention, do not change. val allData = Seq(1) ++ (10 to 20) ++ (100 to 200) withTempDir { dir => testStream(mapped)( @@ -435,6 +631,7 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { 13, 14, 15, 16, 17, 18, 19, 2, 20, 21, 22, 23, 24, 25) ) // When Trigger.Once() is used, the read limit should be ignored + // NOTE: the test uses the deprecated Trigger.Once() by intention, do not change. val allData = Seq(1, 2) ++ (10 to 25) ++ (100 to 125) withTempDir { dir => testStream(mapped)( @@ -537,6 +734,7 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { ) // When Trigger.Once() is used, the read limit should be ignored + // NOTE: the test uses the deprecated Trigger.Once() by intention, do not change. val allData = Seq(1, 2) ++ (10 to 30) ++ (100 to 128) withTempDir { dir => testStream(mapped)( @@ -624,6 +822,45 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { ) } + test("SPARK-41375: empty partitions should not record to latest offset") { + val topicPrefix = newTopic() + val topic = topicPrefix + "-good" + testUtils.createTopic(topic, partitions = 5) + testUtils.sendMessages(topic, Array("-1")) + require(testUtils.getLatestOffsets(Set(topic)).size === 5) + + val reader = spark + .readStream + .format("kafka") + .option("kafka.bootstrap.servers", testUtils.brokerAddress) + .option("kafka.metadata.max.age.ms", "1") + .option("kafka.request.timeout.ms", "3000") + .option("kafka.default.api.timeout.ms", "3000") + .option("subscribePattern", s"$topicPrefix-.*") + .option("failOnDataLoss", "false") + + val kafka = reader.load() + .selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)") + .as[(String, String)] + val mapped = kafka.map(kv => kv._2.toInt + 1) + + testStream(mapped)( + makeSureGetOffsetCalled, + AddKafkaData(Set(topic), 1, 2, 3), + CheckAnswer(2, 3, 4), + Assert { + testUtils.deleteTopic(topic) + true + }, + AssertOnQuery { q => + val latestOffset: Option[(Long, OffsetSeq)] = q.offsetLog.getLatest + latestOffset.exists { offset => + !offset._2.offsets.exists(_.exists(_.json == "{}")) + } + } + ) + } + test("subscribe topic by pattern with topic recreation between batches") { val topicPrefix = newTopic() val topic = topicPrefix + "-good" @@ -792,9 +1029,9 @@ abstract class KafkaMicroBatchSourceSuiteBase extends KafkaSourceSuiteBase { val windowedAggregation = kafka .withWatermark("timestamp", "10 seconds") - .groupBy(window($"timestamp", "5 seconds") as 'window) - .agg(count("*") as 'count) - .select($"window".getField("start") as 'window, $"count") + .groupBy(window($"timestamp", "5 seconds") as Symbol("window")) + .agg(count("*") as Symbol("count")) + .select($"window".getField("start") as Symbol("window"), $"count") val query = windowedAggregation .writeStream @@ -1392,7 +1629,7 @@ class KafkaMicroBatchV1SourceSuite extends KafkaMicroBatchSourceSuiteBase { makeSureGetOffsetCalled, AssertOnQuery { query => query.logicalPlan.collect { - case StreamingExecutionRelation(_: KafkaSource, _) => true + case StreamingExecutionRelation(_: KafkaSource, _, _) => true }.nonEmpty } ) @@ -1448,7 +1685,8 @@ class KafkaMicroBatchV2SourceSuite extends KafkaMicroBatchSourceSuiteBase { val inputPartitions = stream.planInputPartitions( KafkaSourceOffset(Map(tp -> 0L)), KafkaSourceOffset(Map(tp -> 100L))).map(_.asInstanceOf[KafkaBatchInputPartition]) - withClue(s"minPartitions = $minPartitions generated factories $inputPartitions\n\t") { + withClue(s"minPartitions = $minPartitions generated 
factories " + + s"${inputPartitions.mkString("inputPartitions(", ", ", ")")}\n\t") { assert(inputPartitions.size == numPartitionsGenerated) } } @@ -2280,7 +2518,7 @@ abstract class KafkaSourceSuiteBase extends KafkaSourceTest { val headers = row.getList[Row](row.fieldIndex("headers")).asScala assert(headers.length === expected.length) - (0 until expected.length).foreach { idx => + expected.indices.foreach { idx => val key = headers(idx).getAs[String]("key") val value = headers(idx).getAs[Array[Byte]]("value") assert(key === expected(idx)._1) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculatorSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculatorSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculatorSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetRangeCalculatorSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaOffsetReaderSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaRelationSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala similarity index 99% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala index 4e808a5277a98..f54eff90a5e07 100644 --- a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSinkSuite.scala @@ -524,7 +524,7 @@ abstract class KafkaSinkBatchSuiteBase extends KafkaSinkSuiteBase { test("SPARK-20496: batch - enforce analyzed plans") { val inputEvents = spark.range(1, 1000) - .select(to_json(struct("*")) as 'value) + .select(to_json(struct("*")) as Symbol("value")) val topic = newTopic() testUtils.createTopic(topic) diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceOffsetSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceOffsetSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceOffsetSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceOffsetSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceProviderSuite.scala 
b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceProviderSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceProviderSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSourceProviderSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSparkConfSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSparkConfSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSparkConfSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaSparkConfSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTest.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTest.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTest.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTest.scala diff --git a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala new file mode 100644 index 0000000000000..7c9c40883a58f --- /dev/null +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/KafkaTestUtils.scala @@ -0,0 +1,687 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010 + +import java.io.{File, IOException} +import java.net.InetSocketAddress +import java.nio.charset.StandardCharsets +import java.util.{Collections, Properties, UUID} +import java.util.concurrent.TimeUnit +import javax.security.auth.login.Configuration + +import scala.collection.JavaConverters._ +import scala.io.Source +import scala.util.control.NonFatal + +import com.google.common.io.Files +import kafka.api.Request +import kafka.server.{HostedPartition, KafkaConfig, KafkaServer} +import kafka.server.checkpoints.OffsetCheckpointFile +import kafka.zk.KafkaZkClient +import org.apache.hadoop.minikdc.MiniKdc +import org.apache.hadoop.security.UserGroupInformation +import org.apache.kafka.clients.CommonClientConfigs +import org.apache.kafka.clients.admin._ +import org.apache.kafka.clients.producer._ +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.config.SaslConfigs +import org.apache.kafka.common.network.ListenerName +import org.apache.kafka.common.security.auth.SecurityProtocol.{PLAINTEXT, SASL_PLAINTEXT} +import org.apache.kafka.common.serialization.StringSerializer +import org.apache.kafka.common.utils.SystemTime +import org.apache.zookeeper.client.ZKClientConfig +import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} +import org.apache.zookeeper.server.auth.SASLAuthenticationProvider +import org.scalatest.Assertions._ +import org.scalatest.concurrent.Eventually._ +import org.scalatest.time.SpanSugar._ + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.internal.Logging +import org.apache.spark.kafka010.KafkaTokenUtil +import org.apache.spark.util.{SecurityUtils, ShutdownHookManager, Utils} + +/** + * This is a helper class for Kafka test suites. This has the functionality to set up + * and tear down local Kafka servers, and to push data using Kafka producers. + * + * The reason to put Kafka test utility class in src is to test Python related Kafka APIs. + */ +class KafkaTestUtils( + withBrokerProps: Map[String, Object] = Map.empty, + secure: Boolean = false) extends Logging { + + private val JAVA_AUTH_CONFIG = "java.security.auth.login.config" + + private val localHostNameForURI = Utils.localHostNameForURI() + logInfo(s"Local host name is $localHostNameForURI") + + // MiniKDC uses canonical host name on host part, hence we need to provide canonical host name + // on the 'host' part of the principal. 
+ private val localCanonicalHostName = Utils.localCanonicalHostName() + logInfo(s"Local canonical host name is $localCanonicalHostName") + + private var kdc: MiniKdc = _ + + // Zookeeper related configurations + private val zkHost = localHostNameForURI + private var zkPort: Int = 0 + private val zkConnectionTimeout = 60000 + private val zkSessionTimeout = 10000 + + private var zookeeper: EmbeddedZookeeper = _ + private var zkClient: KafkaZkClient = _ + + // Kafka broker related configurations + private val brokerHost = localHostNameForURI + private var brokerPort = 0 + private var brokerConf: KafkaConfig = _ + + private val brokerServiceName = "kafka" + private val clientUser = s"client/$localCanonicalHostName" + private var clientKeytabFile: File = _ + + // Kafka broker server + private var server: KafkaServer = _ + private var adminClient: AdminClient = _ + + // Kafka producer + private var producer: Producer[String, String] = _ + + // Flag to test whether the system is correctly started + private var kdcReady = false + private var zkReady = false + private var brokerReady = false + private var leakDetector: AnyRef = null + + def zkAddress: String = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper address") + s"$zkHost:$zkPort" + } + + def brokerAddress: String = { + assert(brokerReady, "Kafka not setup yet or already torn down, cannot get broker address") + s"$brokerHost:$brokerPort" + } + + def zookeeperClient: KafkaZkClient = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper client") + Option(zkClient).getOrElse( + throw new IllegalStateException("Zookeeper client is not yet initialized")) + } + + def clientPrincipal: String = { + assert(kdcReady, "KDC should be set up beforehand") + clientUser + "@" + kdc.getRealm() + } + + def clientKeytab: String = { + assert(kdcReady, "KDC should be set up beforehand") + clientKeytabFile.getAbsolutePath() + } + + private def setUpMiniKdc(): Unit = { + val kdcDir = Utils.createTempDir() + val kdcConf = MiniKdc.createConf() + kdcConf.setProperty(MiniKdc.DEBUG, "true") + // The port for MiniKdc service gets selected in the constructor, but will be bound + // to it later in MiniKdc.start() -> MiniKdc.initKDCServer() -> KdcServer.start(). + // In meantime, when some other service might capture the port during this progress, and + // cause BindException. + // This makes our tests which have dedicated JVMs and rely on MiniKDC being flaky + // + // https://issues.apache.org/jira/browse/HADOOP-12656 get fixed in Hadoop 2.8.0. + // + // The workaround here is to periodically repeat this process with a timeout , since we are + // using Hadoop 2.7.4 as default. 
+ // https://issues.apache.org/jira/browse/SPARK-31631 + eventually(timeout(60.seconds), interval(1.second)) { + try { + kdc = new MiniKdc(kdcConf, kdcDir) + kdc.start() + } catch { + case NonFatal(e) => + if (kdc != null) { + kdc.stop() + kdc = null + } + throw e + } + } + // TODO https://issues.apache.org/jira/browse/SPARK-30037 + // Need to build spark's own MiniKDC and customize krb5.conf like Kafka + rewriteKrb5Conf() + kdcReady = true + } + + /** + * In this method we rewrite krb5.conf to make kdc and client use the same enctypes + */ + private def rewriteKrb5Conf(): Unit = { + val krb5Conf = Utils + .tryWithResource(Source.fromFile(kdc.getKrb5conf, "UTF-8"))(_.getLines().toList) + var rewritten = false + val addedConfig = + addedKrb5Config("default_tkt_enctypes", "aes128-cts-hmac-sha1-96") + + addedKrb5Config("default_tgs_enctypes", "aes128-cts-hmac-sha1-96") + val rewriteKrb5Conf = krb5Conf.map(s => + if (s.contains("libdefaults")) { + rewritten = true + s + addedConfig + } else { + s + }).filter(!_.trim.startsWith("#")).mkString(System.lineSeparator()) + + val krb5confStr = if (!rewritten) { + "[libdefaults]" + addedConfig + System.lineSeparator() + + System.lineSeparator() + rewriteKrb5Conf + } else { + rewriteKrb5Conf + } + + kdc.getKrb5conf.delete() + Files.write(krb5confStr, kdc.getKrb5conf, StandardCharsets.UTF_8) + logDebug(s"krb5.conf file content: $krb5confStr") + } + + private def addedKrb5Config(key: String, value: String): String = { + System.lineSeparator() + s" $key=$value" + } + + private def createKeytabsAndJaasConfigFile(): String = { + assert(kdcReady, "KDC should be set up beforehand") + val baseDir = Utils.createTempDir() + + val zkServerUser = s"zookeeper/$localCanonicalHostName" + val zkServerKeytabFile = new File(baseDir, "zookeeper.keytab") + kdc.createPrincipal(zkServerKeytabFile, zkServerUser) + logDebug(s"Created keytab file: ${zkServerKeytabFile.getAbsolutePath()}") + + val zkClientUser = s"zkclient/$localCanonicalHostName" + val zkClientKeytabFile = new File(baseDir, "zkclient.keytab") + kdc.createPrincipal(zkClientKeytabFile, zkClientUser) + logDebug(s"Created keytab file: ${zkClientKeytabFile.getAbsolutePath()}") + + val kafkaServerUser = s"kafka/$localCanonicalHostName" + val kafkaServerKeytabFile = new File(baseDir, "kafka.keytab") + kdc.createPrincipal(kafkaServerKeytabFile, kafkaServerUser) + logDebug(s"Created keytab file: ${kafkaServerKeytabFile.getAbsolutePath()}") + + clientKeytabFile = new File(baseDir, "client.keytab") + kdc.createPrincipal(clientKeytabFile, clientUser) + logDebug(s"Created keytab file: ${clientKeytabFile.getAbsolutePath()}") + + val file = new File(baseDir, "jaas.conf"); + val realm = kdc.getRealm() + val content = + s""" + |Server { + | ${SecurityUtils.getKrb5LoginModuleName()} required + | useKeyTab=true + | storeKey=true + | useTicketCache=false + | refreshKrb5Config=true + | keyTab="${zkServerKeytabFile.getAbsolutePath()}" + | principal="$zkServerUser@$realm"; + |}; + | + |Client { + | ${SecurityUtils.getKrb5LoginModuleName()} required + | useKeyTab=true + | storeKey=true + | useTicketCache=false + | refreshKrb5Config=true + | keyTab="${zkClientKeytabFile.getAbsolutePath()}" + | principal="$zkClientUser@$realm"; + |}; + | + |KafkaServer { + | ${SecurityUtils.getKrb5LoginModuleName()} required + | serviceName="$brokerServiceName" + | useKeyTab=true + | storeKey=true + | keyTab="${kafkaServerKeytabFile.getAbsolutePath()}" + | principal="$kafkaServerUser@$realm"; + |}; + """.stripMargin.trim + Files.write(content, file, 
StandardCharsets.UTF_8) + logDebug(s"Created JAAS file: ${file.getPath}") + logDebug(s"JAAS file content: $content") + file.getAbsolutePath() + } + + // Set up the Embedded Zookeeper server and get the proper Zookeeper port + private def setupEmbeddedZookeeper(): Unit = { + // Zookeeper server startup + zookeeper = new EmbeddedZookeeper(s"$zkHost:$zkPort") + // Get the actual zookeeper binding port + zkPort = zookeeper.actualPort + zkClient = KafkaZkClient(s"$zkHost:$zkPort", isSecure = false, zkSessionTimeout, + zkConnectionTimeout, 1, new SystemTime(), "test", new ZKClientConfig) + zkReady = true + } + + // Set up the Embedded Kafka server + private def setupEmbeddedKafkaServer(): Unit = { + assert(zkReady, "Zookeeper should be set up beforehand") + + val protocolName = if (!secure) PLAINTEXT.name else SASL_PLAINTEXT.name + + // Kafka broker startup + Utils.startServiceOnPort(brokerPort, port => { + brokerPort = port + brokerConf = new KafkaConfig(brokerConfiguration, doLog = false) + server = new KafkaServer(brokerConf) + server.startup() + brokerPort = server.boundPort(new ListenerName(protocolName)) + (server, brokerPort) + }, new SparkConf(), "KafkaBroker") + + adminClient = AdminClient.create(adminClientConfiguration) + brokerReady = true + } + + /** setup the whole embedded servers, including Zookeeper and Kafka brokers */ + def setup(): Unit = { + // Set up a KafkaTestUtils leak detector so that we can see where the leak KafkaTestUtils is + // created. + val exception = new SparkException("It was created at: ") + leakDetector = ShutdownHookManager.addShutdownHook { () => + logError("Found a leak KafkaTestUtils.", exception) + } + + if (secure) { + SecurityUtils.setGlobalKrbDebug(true) + setUpMiniKdc() + val jaasConfigFile = createKeytabsAndJaasConfigFile() + System.setProperty(JAVA_AUTH_CONFIG, jaasConfigFile) + Configuration.getConfiguration.refresh() + } else { + System.clearProperty(JAVA_AUTH_CONFIG) + } + setupEmbeddedZookeeper() + setupEmbeddedKafkaServer() + eventually(timeout(1.minute)) { + assert(zkClient.getAllBrokersInCluster.nonEmpty, "Broker was not up in 60 seconds") + } + } + + /** Teardown the whole servers, including Kafka broker and Zookeeper */ + def teardown(): Unit = { + if (leakDetector != null) { + ShutdownHookManager.removeShutdownHook(leakDetector) + } + brokerReady = false + zkReady = false + kdcReady = false + + if (producer != null) { + producer.close() + producer = null + } + + if (adminClient != null) { + adminClient.close() + adminClient = null + } + + if (server != null) { + server.shutdown() + server.awaitShutdown() + server = null + } + + // On Windows, `logDirs` is left open even after Kafka server above is completely shut down + // in some cases. It leads to test failures on Windows if the directory deletion failure + // throws an exception. 
+ brokerConf.logDirs.foreach { f => + try { + Utils.deleteRecursively(new File(f)) + } catch { + case e: IOException if Utils.isWindows => + logWarning(e.getMessage) + } + } + + if (zkClient != null) { + zkClient.close() + zkClient = null + } + + if (zookeeper != null) { + zookeeper.shutdown() + zookeeper = null + } + + System.clearProperty(JAVA_AUTH_CONFIG) + Configuration.getConfiguration.refresh() + if (kdc != null) { + kdc.stop() + kdc = null + } + UserGroupInformation.reset() + SecurityUtils.setGlobalKrbDebug(false) + } + + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String, partitions: Int, overwrite: Boolean = false): Unit = { + var created = false + while (!created) { + try { + val newTopic = new NewTopic(topic, partitions, 1.shortValue()) + adminClient.createTopics(Collections.singleton(newTopic)) + created = true + } catch { + // Workaround fact that TopicExistsException is in kafka.common in 0.10.0 and + // org.apache.kafka.common.errors in 0.10.1 (!) + case e: Exception if (e.getClass.getSimpleName == "TopicExistsException") && overwrite => + deleteTopic(topic) + } + } + // wait until metadata is propagated + (0 until partitions).foreach { p => + waitUntilMetadataIsPropagated(topic, p) + } + } + + def getAllTopicsAndPartitionSize(): Seq[(String, Int)] = { + zkClient.getPartitionsForTopics(zkClient.getAllTopicsInCluster()).mapValues(_.size).toSeq + } + + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String): Unit = { + createTopic(topic, 1) + } + + /** Delete a Kafka topic and wait until it is propagated to the whole cluster */ + def deleteTopic(topic: String): Unit = { + val partitions = zkClient.getPartitionsForTopics(Set(topic))(topic).size + adminClient.deleteTopics(Collections.singleton(topic)) + verifyTopicDeletionWithRetries(topic, partitions, List(this.server)) + } + + /** Add new partitions to a Kafka topic */ + def addPartitions(topic: String, partitions: Int): Unit = { + adminClient.createPartitions( + Map(topic -> NewPartitions.increaseTo(partitions)).asJava, + new CreatePartitionsOptions) + // wait until metadata is propagated + (0 until partitions).foreach { p => + waitUntilMetadataIsPropagated(topic, p) + } + } + + def sendMessages(topic: String, msgs: Array[String]): Seq[(String, RecordMetadata)] = { + sendMessages(topic, msgs, None) + } + + def sendMessages( + topic: String, + msgs: Array[String], + part: Option[Int]): Seq[(String, RecordMetadata)] = { + val records = msgs.map { msg => + val builder = new RecordBuilder(topic, msg) + part.foreach { p => builder.partition(p) } + builder.build() + } + sendMessages(records) + } + + def sendMessage(msg: ProducerRecord[String, String]): Seq[(String, RecordMetadata)] = { + sendMessages(Array(msg)) + } + + def sendMessages(msgs: Seq[ProducerRecord[String, String]]): Seq[(String, RecordMetadata)] = { + producer = new KafkaProducer[String, String](producerConfiguration) + val offsets = try { + msgs.map { msg => + val metadata = producer.send(msg).get(10, TimeUnit.SECONDS) + logInfo(s"\tSent ($msg) to partition ${metadata.partition}, offset ${metadata.offset}") + (msg.value(), metadata) + } + } finally { + if (producer != null) { + producer.close() + producer = null + } + } + offsets + } + + def cleanupLogs(): Unit = { + server.logManager.cleanupLogs() + } + + private def getOffsets(topics: Set[String], offsetSpec: OffsetSpec): Map[TopicPartition, Long] = { + val listOffsetsParams = 
adminClient.describeTopics(topics.asJava).all().get().asScala + .flatMap { topicDescription => + topicDescription._2.partitions().asScala.map { topicPartitionInfo => + new TopicPartition(topicDescription._1, topicPartitionInfo.partition()) + } + }.map(_ -> offsetSpec).toMap.asJava + val partitionOffsets = adminClient.listOffsets(listOffsetsParams).all().get().asScala + .map(result => result._1 -> result._2.offset()).toMap + partitionOffsets + } + + def getEarliestOffsets(topics: Set[String]): Map[TopicPartition, Long] = { + getOffsets(topics, OffsetSpec.earliest()) + } + + def getLatestOffsets(topics: Set[String]): Map[TopicPartition, Long] = { + getOffsets(topics, OffsetSpec.latest()) + } + + def listConsumerGroups(): ListConsumerGroupsResult = { + adminClient.listConsumerGroups() + } + + protected def brokerConfiguration: Properties = { + val props = new Properties() + props.put("broker.id", "0") + props.put("listeners", s"PLAINTEXT://$localHostNameForURI:$brokerPort") + props.put("log.dir", Utils.createTempDir().getAbsolutePath) + props.put("zookeeper.connect", zkAddress) + props.put("zookeeper.connection.timeout.ms", "60000") + props.put("log.flush.interval.messages", "1") + props.put("replica.socket.timeout.ms", "1500") + props.put("delete.topic.enable", "true") + props.put("group.initial.rebalance.delay.ms", "10") + + // Change the following settings as we have only 1 broker + props.put("offsets.topic.num.partitions", "1") + props.put("offsets.topic.replication.factor", "1") + props.put("transaction.state.log.replication.factor", "1") + props.put("transaction.state.log.min.isr", "1") + + if (secure) { + props.put("listeners", s"SASL_PLAINTEXT://$localHostNameForURI:0") + props.put("advertised.listeners", s"SASL_PLAINTEXT://$localHostNameForURI:0") + props.put("inter.broker.listener.name", "SASL_PLAINTEXT") + props.put("delegation.token.master.key", UUID.randomUUID().toString) + props.put("sasl.enabled.mechanisms", "GSSAPI,SCRAM-SHA-512") + } + + // Can not use properties.putAll(propsMap.asJava) in scala-2.12 + // See https://github.com/scala/bug/issues/10418 + withBrokerProps.foreach { case (k, v) => props.put(k, v) } + props + } + + private def adminClientConfiguration: Properties = { + val props = new Properties() + props.put(CommonClientConfigs.BOOTSTRAP_SERVERS_CONFIG, s"$brokerHost:$brokerPort") + setAuthenticationConfigIfNeeded(props) + props + } + + private def producerConfiguration: Properties = { + val props = new Properties() + props.put("bootstrap.servers", brokerAddress) + props.put("value.serializer", classOf[StringSerializer].getName) + props.put("key.serializer", classOf[StringSerializer].getName) + // wait for all in-sync replicas to ack sends + props.put("acks", "all") + props.put("partitioner.class", + classOf[org.apache.kafka.clients.producer.internals.DefaultPartitioner].getName) + setAuthenticationConfigIfNeeded(props) + props + } + + /** Call `f` with a `KafkaProducer` that has initialized transactions. 
*/ + def withTransactionalProducer(f: KafkaProducer[String, String] => Unit): Unit = { + val props = producerConfiguration + props.put("transactional.id", UUID.randomUUID().toString) + val producer = new KafkaProducer[String, String](props) + try { + producer.initTransactions() + f(producer) + } finally { + producer.close() + } + } + + private def setAuthenticationConfigIfNeeded(props: Properties): Unit = { + if (secure) { + val jaasParams = KafkaTokenUtil.getKeytabJaasParams( + clientKeytabFile.getAbsolutePath, clientPrincipal, brokerServiceName) + props.put(SaslConfigs.SASL_JAAS_CONFIG, jaasParams) + props.put(CommonClientConfigs.SECURITY_PROTOCOL_CONFIG, SASL_PLAINTEXT.name) + } + } + + /** Verify topic is deleted in all places, e.g, brokers, zookeeper. */ + private def verifyTopicDeletion( + topic: String, + numPartitions: Int, + servers: Seq[KafkaServer]): Unit = { + val topicAndPartitions = (0 until numPartitions).map(new TopicPartition(topic, _)) + + // wait until admin path for delete topic is deleted, signaling completion of topic deletion + assert(!zkClient.isTopicMarkedForDeletion(topic), "topic is still marked for deletion") + assert(!zkClient.topicExists(topic), "topic still exists") + // ensure that the topic-partition has been deleted from all brokers' replica managers + assert(servers.forall(server => topicAndPartitions.forall(tp => + server.replicaManager.getPartition(tp) == HostedPartition.None)), + s"topic $topic still exists in the replica manager") + // ensure that logs from all replicas are deleted if delete topic is marked successful + assert(servers.forall(server => topicAndPartitions.forall(tp => + server.getLogManager.getLog(tp).isEmpty)), + s"topic $topic still exists in log manager") + // ensure that topic is removed from all cleaner offsets + assert(servers.forall(server => topicAndPartitions.forall { tp => + val checkpoints = server.getLogManager.liveLogDirs.map { logDir => + new OffsetCheckpointFile(new File(logDir, "cleaner-offset-checkpoint")).read() + } + checkpoints.forall(checkpointsPerLogDir => !checkpointsPerLogDir.contains(tp)) + }), s"checkpoint for topic $topic still exists") + // ensure the topic is gone + assert( + !zkClient.getAllTopicsInCluster().contains(topic), + s"topic $topic still exists on zookeeper") + } + + /** Verify topic is deleted. Retry to delete the topic if not. */ + private def verifyTopicDeletionWithRetries( + topic: String, + numPartitions: Int, + servers: Seq[KafkaServer]): Unit = { + eventually(timeout(1.minute), interval(200.milliseconds)) { + try { + verifyTopicDeletion(topic, numPartitions, servers) + } catch { + case e: Throwable => + // As pushing messages into Kafka updates Zookeeper asynchronously, there is a small + // chance that a topic will be recreated after deletion due to the asynchronous update. + // Hence, delete the topic and retry. 
+ adminClient.deleteTopics(Collections.singleton(topic)) + throw e + } + } + } + + private def waitUntilMetadataIsPropagated(topic: String, partition: Int): Unit = { + def isPropagated = server.dataPlaneRequestProcessor.metadataCache + .getPartitionInfo(topic, partition) match { + case Some(partitionState) => + zkClient.getLeaderForPartition(new TopicPartition(topic, partition)).isDefined && + Request.isValidBrokerId(partitionState.leader) && + !partitionState.replicas.isEmpty + + case _ => + false + } + eventually(timeout(1.minute)) { + assert(isPropagated, s"Partition [$topic, $partition] metadata not propagated after timeout") + } + } + + /** + * Wait until the latest offset of the given `TopicPartition` is not less than `offset`. + */ + def waitUntilOffsetAppears(topicPartition: TopicPartition, offset: Long): Unit = { + eventually(timeout(1.minute)) { + val currentOffset = getLatestOffsets(Set(topicPartition.topic)).get(topicPartition) + assert(currentOffset.nonEmpty && currentOffset.get >= offset) + } + } + + private class EmbeddedZookeeper(val zkConnect: String) { + private val ZOOKEEPER_AUTH_PROVIDER = "zookeeper.authProvider.1" + + val snapshotDir = Utils.createTempDir() + val logDir = Utils.createTempDir() + + if (secure) { + System.setProperty(ZOOKEEPER_AUTH_PROVIDER, classOf[SASLAuthenticationProvider].getName) + } else { + System.clearProperty(ZOOKEEPER_AUTH_PROVIDER) + } + val zookeeper = new ZooKeeperServer(snapshotDir, logDir, 500) + val (ip, port) = { + val splits = zkConnect.split(":") + val port = splits(splits.length - 1) + (zkConnect.substring(0, zkConnect.length - port.length - 1), port.toInt) + } + val factory = new NIOServerCnxnFactory() + factory.configure(new InetSocketAddress(ip, port), 16) + factory.startup(zookeeper) + + val actualPort = factory.getLocalPort + + def shutdown(): Unit = { + factory.shutdown() + // The directories are not closed even if the ZooKeeper server is shut down. + // Please see ZOOKEEPER-1844, which is fixed in 3.4.6+. It leads to test failures + // on Windows if the directory deletion failure throws an exception. 
+ try { + Utils.deleteRecursively(snapshotDir) + } catch { + case e: IOException if Utils.isWindows => + logWarning(e.getMessage) + } + try { + Utils.deleteRecursively(logDir) + } catch { + case e: IOException if Utils.isWindows => + logWarning(e.getMessage) + } + System.clearProperty(ZOOKEEPER_AUTH_PROVIDER) + } + } +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/RecordBuilder.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/RecordBuilder.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/RecordBuilder.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/RecordBuilder.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPoolSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPoolSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPoolSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/FetchedDataPoolSuite.scala diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPoolSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPoolSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPoolSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/InternalKafkaConsumerPoolSuite.scala diff --git a/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumerSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumerSuite.scala new file mode 100644 index 0000000000000..30e8e348f74d2 --- /dev/null +++ b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/consumer/KafkaDataConsumerSuite.scala @@ -0,0 +1,391 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.sql.kafka010.consumer + +import java.{util => ju} +import java.nio.charset.StandardCharsets +import java.util.concurrent.{Executors, TimeUnit} + +import scala.collection.JavaConverters._ +import scala.collection.immutable +import scala.util.Random + +import org.apache.kafka.clients.consumer.ConsumerConfig._ +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.serialization.ByteArrayDeserializer +import org.scalatest.PrivateMethodTester + +import org.apache.spark.{TaskContext, TaskContextImpl} +import org.apache.spark.kafka010.KafkaDelegationTokenTest +import org.apache.spark.sql.kafka010.{KafkaTestUtils, RecordBuilder} +import org.apache.spark.sql.kafka010.consumer.KafkaDataConsumer.CacheKey +import org.apache.spark.sql.test.SharedSparkSession + +class KafkaDataConsumerSuite + extends SharedSparkSession + with PrivateMethodTester + with KafkaDelegationTokenTest { + + protected var testUtils: KafkaTestUtils = _ + private val topic = "topic" + Random.nextInt() + private val topicPartition = new TopicPartition(topic, 0) + private val groupId = "groupId" + + override def beforeAll(): Unit = { + super.beforeAll() + testUtils = new KafkaTestUtils(Map[String, Object]()) + testUtils.setup() + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + super.afterAll() + } + + private def getKafkaParams() = Map[String, Object]( + GROUP_ID_CONFIG -> "groupId", + BOOTSTRAP_SERVERS_CONFIG -> testUtils.brokerAddress, + KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer].getName, + VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer].getName, + AUTO_OFFSET_RESET_CONFIG -> "earliest", + ENABLE_AUTO_COMMIT_CONFIG -> "false" + ).asJava + private var fetchedDataPool: FetchedDataPool = _ + private var consumerPool: InternalKafkaConsumerPool = _ + + override def beforeEach(): Unit = { + super.beforeEach() + + fetchedDataPool = { + val fetchedDataPoolMethod = PrivateMethod[FetchedDataPool](Symbol("fetchedDataPool")) + KafkaDataConsumer.invokePrivate(fetchedDataPoolMethod()) + } + + consumerPool = { + val internalKafkaConsumerPoolMethod = + PrivateMethod[InternalKafkaConsumerPool](Symbol("consumerPool")) + KafkaDataConsumer.invokePrivate(internalKafkaConsumerPoolMethod()) + } + + fetchedDataPool.reset() + consumerPool.reset() + } + + test("SPARK-19886: Report error cause correctly in reportDataLoss") { + val cause = new Exception("D'oh!") + val reportDataLoss = PrivateMethod[Unit](Symbol("reportDataLoss0")) + val e = intercept[IllegalStateException] { + KafkaDataConsumer.invokePrivate(reportDataLoss(true, "message", cause)) + } + assert(e.getCause === cause) + } + + test("new KafkaDataConsumer instance in case of Task retry") { + try { + val kafkaParams = getKafkaParams() + val key = CacheKey(groupId, topicPartition) + + val context1 = new TaskContextImpl(0, 0, 0, 0, 0, 1, null, null, null) + TaskContext.setTaskContext(context1) + val consumer1Underlying = initSingleConsumer(kafkaParams, key) + + val context2 = new TaskContextImpl(0, 0, 0, 0, 1, 1, null, null, null) + TaskContext.setTaskContext(context2) + val consumer2Underlying = initSingleConsumer(kafkaParams, key) + + // here we expect different consumer as pool will invalidate for task reattempt + assert(consumer2Underlying.ne(consumer1Underlying)) + } finally { + TaskContext.unset() + } + } + + test("same KafkaDataConsumer instance in case of same token") { + try { + val kafkaParams = getKafkaParams() + val key = new 
CacheKey(groupId, topicPartition) + + val context = new TaskContextImpl(0, 0, 0, 0, 0, 1, null, null, null) + TaskContext.setTaskContext(context) + setSparkEnv( + Map( + s"spark.kafka.clusters.$identifier1.auth.bootstrap.servers" -> bootStrapServers + ) + ) + addTokenToUGI(tokenService1, tokenId1, tokenPassword1) + val consumer1Underlying = initSingleConsumer(kafkaParams, key) + val consumer2Underlying = initSingleConsumer(kafkaParams, key) + + assert(consumer2Underlying.eq(consumer1Underlying)) + } finally { + TaskContext.unset() + } + } + + test("new KafkaDataConsumer instance in case of token renewal") { + try { + val kafkaParams = getKafkaParams() + val key = new CacheKey(groupId, topicPartition) + + val context = new TaskContextImpl(0, 0, 0, 0, 0, 1, null, null, null) + TaskContext.setTaskContext(context) + setSparkEnv( + Map( + s"spark.kafka.clusters.$identifier1.auth.bootstrap.servers" -> bootStrapServers + ) + ) + addTokenToUGI(tokenService1, tokenId1, tokenPassword1) + val consumer1Underlying = initSingleConsumer(kafkaParams, key) + addTokenToUGI(tokenService1, tokenId2, tokenPassword2) + val consumer2Underlying = initSingleConsumer(kafkaParams, key) + + assert(consumer2Underlying.ne(consumer1Underlying)) + } finally { + TaskContext.unset() + } + } + + private def initSingleConsumer( + kafkaParams: ju.Map[String, Object], + key: CacheKey): InternalKafkaConsumer = { + val consumer = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + + // any method call which requires consumer is necessary + consumer.getOrRetrieveConsumer() + + val consumerUnderlying = consumer._consumer + assert(consumerUnderlying.isDefined) + + consumer.release() + + assert(consumerPool.size(key) === 1) + // check whether acquired object is available in pool + val pooledObj = consumerPool.borrowObject(key, kafkaParams) + assert(consumerUnderlying.get.eq(pooledObj)) + consumerPool.returnObject(pooledObj) + + consumerUnderlying.get + } + + test("SPARK-23623: concurrent use of KafkaDataConsumer") { + val data: immutable.IndexedSeq[(String, Seq[(String, Array[Byte])])] = + prepareTestTopicHavingTestMessages(topic) + + val topicPartition = new TopicPartition(topic, 0) + val kafkaParams = getKafkaParams() + val numThreads = 100 + val numConsumerUsages = 500 + + @volatile var error: Throwable = null + + def consume(i: Int): Unit = { + val taskContext = if (Random.nextBoolean) { + new TaskContextImpl(0, 0, 0, 0, attemptNumber = Random.nextInt(2), 1, + null, null, null) + } else { + null + } + TaskContext.setTaskContext(taskContext) + val consumer = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + try { + val range = consumer.getAvailableOffsetRange() + val rcvd = range.earliest until range.latest map { offset => + val record = consumer.get(offset, Long.MaxValue, 10000, failOnDataLoss = false) + val value = new String(record.value(), StandardCharsets.UTF_8) + val headers = record.headers().toArray.map(header => (header.key(), header.value())).toSeq + (value, headers) + } + data.zip(rcvd).foreach { case (expected, actual) => + // value + assert(expected._1 === actual._1) + // headers + expected._2.zip(actual._2).foreach { case (l, r) => + // header key + assert(l._1 === r._1) + // header value + assert(l._2 === r._2) + } + } + } catch { + case e: Throwable => + error = e + throw e + } finally { + consumer.release() + } + } + + val threadpool = Executors.newFixedThreadPool(numThreads) + try { + val futures = (1 to numConsumerUsages).map { i => + threadpool.submit(new Runnable { + override def run(): Unit = { 
consume(i) } + }) + } + futures.foreach(_.get(1, TimeUnit.MINUTES)) + assert(error == null) + } finally { + threadpool.shutdown() + } + } + + test("SPARK-25151 Handles multiple tasks in executor fetching same (topic, partition) pair") { + prepareTestTopicHavingTestMessages(topic) + val topicPartition = new TopicPartition(topic, 0) + + val kafkaParams = getKafkaParams() + + withTaskContext(TaskContext.empty()) { + // task A trying to fetch offset 0 to 100, and read 5 records + val consumer1 = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + val lastOffsetForConsumer1 = readAndGetLastOffset(consumer1, 0, 100, 5) + consumer1.release() + + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 1, expectedNumTotal = 1) + + // task B trying to fetch offset 300 to 500, and read 5 records + val consumer2 = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + val lastOffsetForConsumer2 = readAndGetLastOffset(consumer2, 300, 500, 5) + consumer2.release() + + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 2, expectedNumTotal = 2) + + // task A continue reading from the last offset + 1, with upper bound 100 again + val consumer1a = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + + consumer1a.get(lastOffsetForConsumer1 + 1, 100, 10000, failOnDataLoss = false) + consumer1a.release() + + // pool should succeed to provide cached data instead of creating one + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 2, expectedNumTotal = 2) + + // task B also continue reading from the last offset + 1, with upper bound 500 again + val consumer2a = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + + consumer2a.get(lastOffsetForConsumer2 + 1, 500, 10000, failOnDataLoss = false) + consumer2a.release() + + // same expectation: pool should succeed to provide cached data instead of creating one + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 2, expectedNumTotal = 2) + } + } + + test("SPARK-25151 Handles multiple tasks in executor fetching same (topic, partition) pair " + + "and same offset (edge-case) - data in use") { + prepareTestTopicHavingTestMessages(topic) + val topicPartition = new TopicPartition(topic, 0) + + val kafkaParams = getKafkaParams() + + withTaskContext(TaskContext.empty()) { + // task A trying to fetch offset 0 to 100, and read 5 records (still reading) + val consumer1 = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + val lastOffsetForConsumer1 = readAndGetLastOffset(consumer1, 0, 100, 5) + + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 1, expectedNumTotal = 1) + + // task B trying to fetch offset the last offset task A is reading so far + 1 to 500 + // this is a condition for edge case + val consumer2 = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + consumer2.get(lastOffsetForConsumer1 + 1, 100, 10000, failOnDataLoss = false) + + // Pool must create a new fetched data instead of returning existing on now in use even + // there's fetched data matching start offset. 
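+ // (consumer1 has not been released yet, so its fetched data is still checked out from the
+ // pool and cannot be reused here; the pool therefore has to create a second FetchedData
+ // object for consumer2, which is what the statistics below verify.)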
+ assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 2, expectedNumTotal = 2) + + consumer1.release() + consumer2.release() + } + } + + test("SPARK-25151 Handles multiple tasks in executor fetching same (topic, partition) pair " + + "and same offset (edge-case) - data not in use") { + prepareTestTopicHavingTestMessages(topic) + val topicPartition = new TopicPartition(topic, 0) + + val kafkaParams = getKafkaParams() + + withTaskContext(TaskContext.empty()) { + // task A trying to fetch offset 0 to 100, and read 5 records (still reading) + val consumer1 = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + val lastOffsetForConsumer1 = readAndGetLastOffset(consumer1, 0, 100, 5) + consumer1.release() + + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 1, expectedNumTotal = 1) + + // task B trying to fetch offset the last offset task A is reading so far + 1 to 500 + // this is a condition for edge case + val consumer2 = KafkaDataConsumer.acquire(topicPartition, kafkaParams) + consumer2.get(lastOffsetForConsumer1 + 1, 100, 10000, failOnDataLoss = false) + + // Pool cannot determine the origin task, so it has to just provide matching one. + // task A may come back and try to fetch, and cannot find previous data + // (or the data is in use). + // If then task A may have to fetch from Kafka, but we already avoided fetching from Kafka in + // task B, so it is not a big deal in overall. + assertFetchedDataPoolStatistic(fetchedDataPool, expectedNumCreated = 1, expectedNumTotal = 1) + + consumer2.release() + } + } + + private def assertFetchedDataPoolStatistic( + fetchedDataPool: FetchedDataPool, + expectedNumCreated: Long, + expectedNumTotal: Long): Unit = { + assert(fetchedDataPool.numCreated === expectedNumCreated) + assert(fetchedDataPool.numTotal === expectedNumTotal) + } + + private def readAndGetLastOffset( + consumer: KafkaDataConsumer, + startOffset: Long, + untilOffset: Long, + numToRead: Int): Long = { + var lastOffset: Long = startOffset - 1 + (0 until numToRead).foreach { _ => + val record = consumer.get(lastOffset + 1, untilOffset, 10000, failOnDataLoss = false) + // validation for fetched record is covered by other tests, so skip on validating + lastOffset = record.offset() + } + lastOffset + } + + private def prepareTestTopicHavingTestMessages(topic: String) = { + val data = (1 to 1000).map(i => (i.toString, Seq[(String, Array[Byte])]())) + testUtils.createTopic(topic, 1) + val messages = data.map { case (value, hdrs) => + new RecordBuilder(topic, value).headers(hdrs).build() + } + testUtils.sendMessages(messages) + data + } + + private def withTaskContext(context: TaskContext)(task: => Unit): Unit = { + try { + TaskContext.setTaskContext(context) + task + } finally { + TaskContext.unset() + } + } + +} diff --git a/external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPoolSuite.scala b/connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPoolSuite.scala similarity index 100% rename from external/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPoolSuite.scala rename to connector/kafka-0-10-sql/src/test/scala/org/apache/spark/sql/kafka010/producer/InternalKafkaProducerPoolSuite.scala diff --git a/connector/kafka-0-10-token-provider/pom.xml b/connector/kafka-0-10-token-provider/pom.xml new file mode 100644 index 0000000000000..364bd62deca22 --- /dev/null +++ b/connector/kafka-0-10-token-provider/pom.xml @@ -0,0 
+1,95 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + org.apache.spark + spark-token-provider-kafka-0-10_2.12 + + token-provider-kafka-0-10 + + jar + Kafka 0.10+ Token Provider for Streaming + https://spark.apache.org/ + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + provided + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + com.github.luben + zstd-jni + + + + + org.mockito + mockito-core + test + + + org.apache.hadoop + ${hadoop-client-runtime.artifact} + ${hadoop.deps.scope} + + + org.apache.spark + spark-tags_${scala.binary.version} + + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + diff --git a/external/kafka-0-10-token-provider/src/main/resources/META-INF/services/org.apache.spark.security.HadoopDelegationTokenProvider b/connector/kafka-0-10-token-provider/src/main/resources/META-INF/services/org.apache.spark.security.HadoopDelegationTokenProvider similarity index 100% rename from external/kafka-0-10-token-provider/src/main/resources/META-INF/services/org.apache.spark.security.HadoopDelegationTokenProvider rename to connector/kafka-0-10-token-provider/src/main/resources/META-INF/services/org.apache.spark.security.HadoopDelegationTokenProvider diff --git a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaConfigUpdater.scala b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaConfigUpdater.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaConfigUpdater.scala rename to connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaConfigUpdater.scala diff --git a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala rename to connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaDelegationTokenProvider.scala diff --git a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaRedactionUtil.scala b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaRedactionUtil.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaRedactionUtil.scala rename to connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaRedactionUtil.scala diff --git a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenSparkConf.scala b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenSparkConf.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenSparkConf.scala rename to connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenSparkConf.scala diff --git a/external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala 
b/connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala rename to connector/kafka-0-10-token-provider/src/main/scala/org/apache/spark/kafka010/KafkaTokenUtil.scala diff --git a/external/kafka-0-10-token-provider/src/test/resources/log4j2.properties b/connector/kafka-0-10-token-provider/src/test/resources/log4j2.properties similarity index 100% rename from external/kafka-0-10-token-provider/src/test/resources/log4j2.properties rename to connector/kafka-0-10-token-provider/src/test/resources/log4j2.properties diff --git a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaConfigUpdaterSuite.scala b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaConfigUpdaterSuite.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaConfigUpdaterSuite.scala rename to connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaConfigUpdaterSuite.scala diff --git a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaDelegationTokenTest.scala b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaDelegationTokenTest.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaDelegationTokenTest.scala rename to connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaDelegationTokenTest.scala diff --git a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaHadoopDelegationTokenManagerSuite.scala b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaHadoopDelegationTokenManagerSuite.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaHadoopDelegationTokenManagerSuite.scala rename to connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaHadoopDelegationTokenManagerSuite.scala diff --git a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaRedactionUtilSuite.scala b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaRedactionUtilSuite.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaRedactionUtilSuite.scala rename to connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaRedactionUtilSuite.scala diff --git a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenSparkConfSuite.scala b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenSparkConfSuite.scala similarity index 98% rename from external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenSparkConfSuite.scala rename to connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenSparkConfSuite.scala index 17caf96818e47..e42704b85909b 100644 --- a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenSparkConfSuite.scala +++ b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenSparkConfSuite.scala @@ -18,11 +18,10 @@ package org.apache.spark.kafka010 import 
org.apache.kafka.common.security.auth.SecurityProtocol.{SASL_SSL, SSL} -import org.scalatest.BeforeAndAfterEach import org.apache.spark.{SparkConf, SparkFunSuite} -class KafkaTokenSparkConfSuite extends SparkFunSuite with BeforeAndAfterEach { +class KafkaTokenSparkConfSuite extends SparkFunSuite { private val identifier1 = "cluster1" private val identifier2 = "cluster2" private val authBootStrapServers = "127.0.0.1:0" diff --git a/external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenUtilSuite.scala b/connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenUtilSuite.scala similarity index 100% rename from external/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenUtilSuite.scala rename to connector/kafka-0-10-token-provider/src/test/scala/org/apache/spark/kafka010/KafkaTokenUtilSuite.scala diff --git a/connector/kafka-0-10/pom.xml b/connector/kafka-0-10/pom.xml new file mode 100644 index 0000000000000..ad667971e4faa --- /dev/null +++ b/connector/kafka-0-10/pom.xml @@ -0,0 +1,150 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + spark-streaming-kafka-0-10_2.12 + + streaming-kafka-0-10 + + jar + Spark Integration for Kafka 0.10 + https://spark.apache.org/ + + + + org.apache.spark + spark-token-provider-kafka-0-10_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + provided + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + com.github.luben + zstd-jni + + + + + org.apache.kafka + kafka_${scala.binary.version} + ${kafka.version} + test + + + com.fasterxml.jackson.core + jackson-core + + + com.fasterxml.jackson.core + jackson-databind + + + com.fasterxml.jackson.core + jackson-annotations + + + + + + org.apache.zookeeper + zookeeper + 3.5.7 + test + + + net.sf.jopt-simple + jopt-simple + 3.2 + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.mockito + mockito-core + test + + + org.apache.spark + spark-tags_${scala.binary.version} + + + org.jmock + jmock-junit4 + test + + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/ConsumerStrategy.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/DirectKafkaInputDStream.scala diff --git 
a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumer.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDD.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDDPartition.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDDPartition.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDDPartition.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaRDDPartition.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/KafkaUtils.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/LocationStrategy.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/LocationStrategy.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/LocationStrategy.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/LocationStrategy.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/OffsetRange.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/OffsetRange.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/OffsetRange.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/OffsetRange.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/PerPartitionConfig.scala diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package-info.java b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package-info.java similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package-info.java rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package-info.java diff --git a/external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package.scala 
b/connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package.scala similarity index 100% rename from external/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package.scala rename to connector/kafka-0-10/src/main/scala/org/apache/spark/streaming/kafka010/package.scala diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java b/connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java similarity index 100% rename from external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java rename to connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaConsumerStrategySuite.java diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaDirectKafkaStreamSuite.java b/connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaDirectKafkaStreamSuite.java similarity index 100% rename from external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaDirectKafkaStreamSuite.java rename to connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaDirectKafkaStreamSuite.java diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java b/connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java similarity index 100% rename from external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java rename to connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaKafkaRDDSuite.java diff --git a/external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaLocationStrategySuite.java b/connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaLocationStrategySuite.java similarity index 100% rename from external/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaLocationStrategySuite.java rename to connector/kafka-0-10/src/test/java/org/apache/spark/streaming/kafka010/JavaLocationStrategySuite.java diff --git a/external/kafka-0-10/src/test/resources/log4j2.properties b/connector/kafka-0-10/src/test/resources/log4j2.properties similarity index 100% rename from external/kafka-0-10/src/test/resources/log4j2.properties rename to connector/kafka-0-10/src/test/resources/log4j2.properties diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala similarity index 100% rename from external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala rename to connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/DirectKafkaStreamSuite.scala diff --git a/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumerSuite.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumerSuite.scala new file mode 100644 index 0000000000000..c7712b1aaee02 --- /dev/null +++ b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaDataConsumerSuite.scala @@ -0,0 +1,162 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.streaming.kafka010 + +import java.util.concurrent.{Executors, TimeUnit} + +import scala.collection.JavaConverters._ +import scala.util.Random + +import org.apache.kafka.clients.consumer.ConsumerConfig._ +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.serialization.ByteArrayDeserializer +import org.mockito.Mockito.when +import org.scalatestplus.mockito.MockitoSugar + +import org.apache.spark._ + +class KafkaDataConsumerSuite extends SparkFunSuite with MockitoSugar { + private var testUtils: KafkaTestUtils = _ + private val topic = "topic" + Random.nextInt() + private val topicPartition = new TopicPartition(topic, 0) + private val groupId = "groupId" + + override def beforeAll(): Unit = { + super.beforeAll() + val conf = new SparkConf() + val env = mock[SparkEnv] + SparkEnv.set(env) + when(env.conf).thenReturn(conf) + + testUtils = new KafkaTestUtils + testUtils.setup() + KafkaDataConsumer.init(16, 64, 0.75f) + } + + override def afterAll(): Unit = { + if (testUtils != null) { + testUtils.teardown() + testUtils = null + } + SparkEnv.set(null) + super.afterAll() + } + + private def getKafkaParams() = Map[String, Object]( + GROUP_ID_CONFIG -> groupId, + BOOTSTRAP_SERVERS_CONFIG -> testUtils.brokerAddress, + KEY_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer].getName, + VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[ByteArrayDeserializer].getName, + AUTO_OFFSET_RESET_CONFIG -> "earliest", + ENABLE_AUTO_COMMIT_CONFIG -> "false" + ).asJava + + test("KafkaDataConsumer reuse in case of same groupId and TopicPartition") { + KafkaDataConsumer.cache.clear() + + val kafkaParams = getKafkaParams() + + val consumer1 = KafkaDataConsumer.acquire[Array[Byte], Array[Byte]]( + topicPartition, kafkaParams, null, true) + consumer1.release() + + val consumer2 = KafkaDataConsumer.acquire[Array[Byte], Array[Byte]]( + topicPartition, kafkaParams, null, true) + consumer2.release() + + assert(KafkaDataConsumer.cache.size() == 1) + val key = new CacheKey(groupId, topicPartition) + val existingInternalConsumer = KafkaDataConsumer.cache.get(key) + assert(existingInternalConsumer.eq(consumer1.internalConsumer)) + assert(existingInternalConsumer.eq(consumer2.internalConsumer)) + } + + test("new KafkaDataConsumer instance in case of Task retry") { + KafkaDataConsumer.cache.clear() + + val kafkaParams = getKafkaParams() + val key = new CacheKey(groupId, topicPartition) + + val context1 = new TaskContextImpl(0, 0, 0, 0, 0, 1, null, null, null) + val consumer1 = KafkaDataConsumer.acquire[Array[Byte], Array[Byte]]( + topicPartition, kafkaParams, context1, true) + consumer1.release() + + assert(KafkaDataConsumer.cache.size() == 1) + assert(KafkaDataConsumer.cache.get(key).eq(consumer1.internalConsumer)) + + val context2 = new TaskContextImpl(0, 0, 0, 0, 1, 1, null, null, null) + val consumer2 = KafkaDataConsumer.acquire[Array[Byte], Array[Byte]]( + topicPartition, kafkaParams, 
context2, true) + consumer2.release() + + // The first consumer should be removed from cache and new non-cached should be returned + assert(KafkaDataConsumer.cache.size() == 0) + assert(consumer1.internalConsumer.ne(consumer2.internalConsumer)) + } + + test("concurrent use of KafkaDataConsumer") { + val data = (1 to 1000).map(_.toString) + testUtils.createTopic(topic) + testUtils.sendMessages(topic, data.toArray) + + val kafkaParams = getKafkaParams() + + val numThreads = 100 + val numConsumerUsages = 500 + + @volatile var error: Throwable = null + + def consume(i: Int): Unit = { + val useCache = Random.nextBoolean + val taskContext = if (Random.nextBoolean) { + new TaskContextImpl(0, 0, 0, 0, attemptNumber = Random.nextInt(2), 1, null, null, null) + } else { + null + } + val consumer = KafkaDataConsumer.acquire[Array[Byte], Array[Byte]]( + topicPartition, kafkaParams, taskContext, useCache) + try { + val rcvd = data.indices.map { offset => + val bytes = consumer.get(offset, 10000).value() + new String(bytes) + } + assert(rcvd == data) + } catch { + case e: Throwable => + error = e + throw e + } finally { + consumer.release() + } + } + + val threadPool = Executors.newFixedThreadPool(numThreads) + try { + val futures = (1 to numConsumerUsages).map { i => + threadPool.submit(new Runnable { + override def run(): Unit = { consume(i) } + }) + } + futures.foreach(_.get(1, TimeUnit.MINUTES)) + assert(error == null) + } finally { + threadPool.shutdown() + } + } +} diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala similarity index 93% rename from external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala rename to connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala index b9ef16fb58cb9..b25e6c8e45928 100644 --- a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala +++ b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaRDDSuite.scala @@ -21,21 +21,22 @@ import java.{ util => ju } import java.io.File import scala.collection.JavaConverters._ +import scala.concurrent.duration._ import scala.util.Random -import kafka.log.{CleanerConfig, Log, LogCleaner, LogConfig, ProducerStateManager} +import kafka.log.{CleanerConfig, LogCleaner, LogConfig, UnifiedLog} import kafka.server.{BrokerTopicStats, LogDirFailureChannel} import kafka.utils.Pool import org.apache.kafka.common.TopicPartition import org.apache.kafka.common.record.{CompressionType, MemoryRecords, SimpleRecord} import org.apache.kafka.common.serialization.StringDeserializer -import org.scalatest.BeforeAndAfterAll +import org.scalatest.concurrent.Eventually.{eventually, interval, timeout} import org.apache.spark._ import org.apache.spark.scheduler.ExecutorCacheTaskLocation import org.apache.spark.streaming.kafka010.mocks.MockTime -class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { +class KafkaRDDSuite extends SparkFunSuite { private var kafkaTestUtils: KafkaTestUtils = _ @@ -84,7 +85,7 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { private def compactLogs(topic: String, partition: Int, messages: Array[(String, String)]): Unit = { val mockTime = new MockTime() - val logs = new Pool[TopicPartition, Log]() + val logs = new Pool[TopicPartition, UnifiedLog]() val logDir = kafkaTestUtils.brokerLogDir val dir = new File(logDir, topic + 
"-" + partition) dir.mkdirs() @@ -93,7 +94,7 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { logProps.put(LogConfig.MinCleanableDirtyRatioProp, java.lang.Float.valueOf(0.1f)) val logDirFailureChannel = new LogDirFailureChannel(1) val topicPartition = new TopicPartition(topic, partition) - val log = new Log( + val log = UnifiedLog( dir, LogConfig(logProps), 0L, @@ -101,11 +102,13 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { mockTime.scheduler, new BrokerTopicStats(), mockTime, + maxTransactionTimeoutMs = 5 * 60 * 1000, // KAFKA-13221 Int.MaxValue, Int.MaxValue, - topicPartition, - new ProducerStateManager(topicPartition, dir), - logDirFailureChannel + logDirFailureChannel, + lastShutdownClean = false, + topicId = None, + keepPartitionMetadataFile = false ) messages.foreach { case (k, v) => val record = new SimpleRecord(k.getBytes, v.getBytes) @@ -201,6 +204,11 @@ class KafkaRDDSuite extends SparkFunSuite with BeforeAndAfterAll { sc, kafkaParams, offsetRanges, preferredHosts ).map(m => m.key -> m.value) + // To make it sure that the compaction happens + eventually(timeout(20.second), interval(1.seconds)) { + val dir = new File(kafkaTestUtils.brokerLogDir, topic + "-0") + assert(dir.listFiles().exists(_.getName.endsWith(".deleted"))) + } val received = rdd.collect.toSet assert(received === compactedMessages.toSet) diff --git a/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala new file mode 100644 index 0000000000000..1c1174b3e311c --- /dev/null +++ b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/KafkaTestUtils.scala @@ -0,0 +1,350 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package org.apache.spark.streaming.kafka010 + +import java.io.{File, IOException} +import java.lang.{Integer => JInt} +import java.net.InetSocketAddress +import java.util.{Map => JMap, Properties} +import java.util.concurrent.{TimeoutException, TimeUnit} + +import scala.annotation.tailrec +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +import kafka.api.Request +import kafka.server.{KafkaConfig, KafkaServer} +import kafka.zk.{AdminZkClient, KafkaZkClient} +import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord} +import org.apache.kafka.common.TopicPartition +import org.apache.kafka.common.network.ListenerName +import org.apache.kafka.common.serialization.StringSerializer +import org.apache.kafka.common.utils.{Time => KTime} +import org.apache.zookeeper.server.{NIOServerCnxnFactory, ZooKeeperServer} + +import org.apache.spark.{SparkConf, SparkException} +import org.apache.spark.internal.Logging +import org.apache.spark.streaming.Time +import org.apache.spark.util.{ShutdownHookManager, Utils} + +/** + * This is a helper class for Kafka test suites. This has the functionality to set up + * and tear down local Kafka servers, and to push data using Kafka producers. + * + * The reason to put Kafka test utility class in src is to test Python related Kafka APIs. + */ +private[kafka010] class KafkaTestUtils extends Logging { + private val localHostNameForURI = Utils.localHostNameForURI() + + // Zookeeper related configurations + private val zkHost = localHostNameForURI + private var zkPort: Int = 0 + private val zkConnectionTimeout = 60000 + private val zkSessionTimeout = 10000 + + private var zookeeper: EmbeddedZookeeper = _ + + private var zkClient: KafkaZkClient = _ + private var admClient: AdminZkClient = _ + + // Kafka broker related configurations + private val brokerHost = localHostNameForURI + private var brokerPort = 0 + private var brokerConf: KafkaConfig = _ + + // Kafka broker server + private var server: KafkaServer = _ + + // Kafka producer + private var producer: KafkaProducer[String, String] = _ + + // Flag to test whether the system is correctly started + private var zkReady = false + private var brokerReady = false + private var leakDetector: AnyRef = null + + def zkAddress: String = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper address") + s"$zkHost:$zkPort" + } + + def brokerAddress: String = { + assert(brokerReady, "Kafka not setup yet or already torn down, cannot get broker address") + s"$brokerHost:$brokerPort" + } + + def zookeeperClient: KafkaZkClient = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper client") + Option(zkClient).getOrElse( + throw new IllegalStateException("Zookeeper client is not yet initialized")) + } + + def adminClient: AdminZkClient = { + assert(zkReady, "Zookeeper not setup yet or already torn down, cannot get zookeeper client") + Option(admClient).getOrElse( + throw new IllegalStateException("Admin client is not yet initialized")) + } + + // Set up the Embedded Zookeeper server and get the proper Zookeeper port + private def setupEmbeddedZookeeper(): Unit = { + // Zookeeper server startup + zookeeper = new EmbeddedZookeeper(s"$zkHost:$zkPort") + // Get the actual zookeeper binding port + zkPort = zookeeper.actualPort + zkClient = KafkaZkClient(s"$zkHost:$zkPort", isSecure = false, zkSessionTimeout, + zkConnectionTimeout, 1, KTime.SYSTEM) + admClient = new AdminZkClient(zkClient) + zkReady = true + } + + // Set up 
the Embedded Kafka server + private def setupEmbeddedKafkaServer(): Unit = { + assert(zkReady, "Zookeeper should be set up beforehand") + + // Kafka broker startup + Utils.startServiceOnPort(brokerPort, port => { + brokerPort = port + brokerConf = new KafkaConfig(brokerConfiguration, doLog = false) + server = new KafkaServer(brokerConf) + server.startup() + brokerPort = server.boundPort(new ListenerName("PLAINTEXT")) + (server, brokerPort) + }, new SparkConf(), "KafkaBroker") + + brokerReady = true + } + + /** setup the whole embedded servers, including Zookeeper and Kafka brokers */ + def setup(): Unit = { + // Set up a KafkaTestUtils leak detector so that we can see where the leak KafkaTestUtils is + // created. + val exception = new SparkException("It was created at: ") + leakDetector = ShutdownHookManager.addShutdownHook { () => + logError("Found a leak KafkaTestUtils.", exception) + } + + setupEmbeddedZookeeper() + setupEmbeddedKafkaServer() + } + + /** Teardown the whole servers, including Kafka broker and Zookeeper */ + def teardown(): Unit = { + if (leakDetector != null) { + ShutdownHookManager.removeShutdownHook(leakDetector) + } + brokerReady = false + zkReady = false + + if (producer != null) { + producer.close() + producer = null + } + + if (server != null) { + server.shutdown() + server.awaitShutdown() + server = null + } + + // On Windows, `logDirs` is left open even after Kafka server above is completely shut down + // in some cases. It leads to test failures on Windows if the directory deletion failure + // throws an exception. + brokerConf.logDirs.foreach { f => + try { + Utils.deleteRecursively(new File(f)) + } catch { + case e: IOException if Utils.isWindows => + logWarning(e.getMessage) + } + } + + if (zkClient != null) { + zkClient.close() + zkClient = null + } + + if (zookeeper != null) { + zookeeper.shutdown() + zookeeper = null + } + } + + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String, partitions: Int, config: Properties): Unit = { + adminClient.createTopic(topic, partitions, 1, config) + // wait until metadata is propagated + (0 until partitions).foreach { p => + waitUntilMetadataIsPropagated(topic, p) + } + } + + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String, partitions: Int): Unit = { + createTopic(topic, partitions, new Properties()) + } + + /** Create a Kafka topic and wait until it is propagated to the whole cluster */ + def createTopic(topic: String): Unit = { + createTopic(topic, 1, new Properties()) + } + + /** Java-friendly function for sending messages to the Kafka broker */ + def sendMessages(topic: String, messageToFreq: JMap[String, JInt]): Unit = { + sendMessages(topic, Map(messageToFreq.asScala.mapValues(_.intValue()).toSeq: _*)) + } + + /** Send the messages to the Kafka broker */ + def sendMessages(topic: String, messageToFreq: Map[String, Int]): Unit = { + val messages = messageToFreq.flatMap { case (s, freq) => Seq.fill(freq)(s) }.toArray + sendMessages(topic, messages) + } + + /** Send the array of messages to the Kafka broker */ + def sendMessages(topic: String, messages: Array[String]): Unit = { + producer = new KafkaProducer[String, String](producerConfiguration) + messages.foreach { message => + producer.send(new ProducerRecord[String, String](topic, message)) + } + producer.close() + producer = null + } + + /** Send the array of (key, value) messages to the Kafka broker */ + def sendMessages(topic: String, 
messages: Array[(String, String)]): Unit = { + producer = new KafkaProducer[String, String](producerConfiguration) + messages.foreach { message => + producer.send(new ProducerRecord[String, String](topic, message._1, message._2)) + } + producer.close() + producer = null + } + + val brokerLogDir = Utils.createTempDir().getAbsolutePath + + private def brokerConfiguration: Properties = { + val props = new Properties() + props.put("broker.id", "0") + props.put("listeners", s"PLAINTEXT://$localHostNameForURI:$brokerPort") + props.put("log.dir", brokerLogDir) + props.put("zookeeper.connect", zkAddress) + props.put("zookeeper.connection.timeout.ms", "60000") + props.put("log.flush.interval.messages", "1") + props.put("replica.socket.timeout.ms", "1500") + props.put("delete.topic.enable", "true") + props.put("offsets.topic.num.partitions", "1") + props.put("offsets.topic.replication.factor", "1") + props.put("group.initial.rebalance.delay.ms", "10") + props + } + + private def producerConfiguration: Properties = { + val props = new Properties() + props.put("bootstrap.servers", brokerAddress) + props.put("value.serializer", classOf[StringSerializer].getName) + // Key serializer is required. + props.put("key.serializer", classOf[StringSerializer].getName) + // wait for all in-sync replicas to ack sends + props.put("acks", "all") + props.put("partitioner.class", + classOf[org.apache.kafka.clients.producer.internals.DefaultPartitioner].getName) + props + } + + // A simplified version of scalatest eventually, rewritten here to avoid adding extra test + // dependency + def eventually[T](timeout: Time, interval: Time)(func: => T): T = { + def makeAttempt(): Either[Throwable, T] = { + try { + Right(func) + } catch { + case e if NonFatal(e) => Left(e) + } + } + + val startTimeNs = System.nanoTime() + @tailrec + def tryAgain(attempt: Int): T = { + makeAttempt() match { + case Right(result) => result + case Left(e) => + val durationMs = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - startTimeNs) + if (durationMs < timeout.milliseconds) { + Thread.sleep(interval.milliseconds) + } else { + throw new TimeoutException(e.getMessage) + } + + tryAgain(attempt + 1) + } + } + + tryAgain(1) + } + + private def waitUntilMetadataIsPropagated(topic: String, partition: Int): Unit = { + def isPropagated = server.dataPlaneRequestProcessor.metadataCache + .getPartitionInfo(topic, partition) match { + case Some(partitionState) => + val leader = partitionState.leader + val isr = partitionState.isr + zkClient.getLeaderForPartition(new TopicPartition(topic, partition)).isDefined && + Request.isValidBrokerId(leader) && !isr.isEmpty + case _ => + false + } + eventually(Time(10000), Time(100)) { + assert(isPropagated, s"Partition [$topic, $partition] metadata not propagated after timeout") + } + } + + private class EmbeddedZookeeper(val zkConnect: String) { + val snapshotDir = Utils.createTempDir() + val logDir = Utils.createTempDir() + + val zookeeper = new ZooKeeperServer(snapshotDir, logDir, 500) + val (ip, port) = { + val splits = zkConnect.split(":") + val port = splits(splits.length - 1) + (zkConnect.substring(0, zkConnect.length - port.length - 1), port.toInt) + } + val factory = new NIOServerCnxnFactory() + factory.configure(new InetSocketAddress(ip, port), 16) + factory.startup(zookeeper) + + val actualPort = factory.getLocalPort + + def shutdown(): Unit = { + factory.shutdown() + // The directories are not closed even if the ZooKeeper server is shut down. + // Please see ZOOKEEPER-1844, which is fixed in 3.4.6+. 
It leads to test failures + // on Windows if the directory deletion failure throws an exception. + try { + Utils.deleteRecursively(snapshotDir) + } catch { + case e: IOException if Utils.isWindows => + logWarning(e.getMessage) + } + try { + Utils.deleteRecursively(logDir) + } catch { + case e: IOException if Utils.isWindows => + logWarning(e.getMessage) + } + } + } +} diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala similarity index 100% rename from external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala rename to connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockScheduler.scala diff --git a/external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala b/connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala similarity index 100% rename from external/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala rename to connector/kafka-0-10/src/test/scala/org/apache/spark/streaming/kafka010/mocks/MockTime.scala diff --git a/connector/kinesis-asl-assembly/pom.xml b/connector/kinesis-asl-assembly/pom.xml new file mode 100644 index 0000000000000..68980757b54b4 --- /dev/null +++ b/connector/kinesis-asl-assembly/pom.xml @@ -0,0 +1,220 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + spark-streaming-kinesis-asl-assembly_2.12 + jar + Spark Project Kinesis Assembly + https://spark.apache.org/ + + + streaming-kinesis-asl-assembly + + + + + org.apache.spark + spark-streaming-kinesis-asl_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + provided + + + + com.fasterxml.jackson.core + jackson-databind + provided + + + commons-lang + commons-lang + provided + + + com.google.protobuf + protobuf-java + 2.6.1 + + + + org.glassfish.jersey.core + jersey-client + provided + + + org.glassfish.jersey.core + jersey-common + provided + + + org.glassfish.jersey.core + jersey-server + provided + + + org.apache.logging.log4j + log4j-api + provided + + + org.apache.logging.log4j + log4j-core + provided + + + org.apache.logging.log4j + log4j-1.2-api + provided + + + org.apache.hadoop + ${hadoop-client-api.artifact} + ${hadoop.version} + provided + + + org.apache.hadoop + ${hadoop-client-runtime.artifact} + ${hadoop.version} + + + org.apache.avro + avro-mapred + provided + + + org.apache.curator + curator-recipes + provided + + + org.apache.zookeeper + zookeeper + provided + + + org.slf4j + slf4j-api + provided + + + org.apache.logging.log4j + log4j-slf4j2-impl + ${log4j.version} + provided + + + org.xerial.snappy + snappy-java + provided + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + + org.apache.maven.plugins + maven-deploy-plugin + + true + + + + org.apache.maven.plugins + maven-install-plugin + + true + + + + org.apache.maven.plugins + maven-shade-plugin + + false + + + *:* + + + + + com.google.protobuf + kinesis.protobuf + + com.google.protobuf.** + + + + + + *:* + + META-INF/*.SF + META-INF/*.DSA + META-INF/*.RSA + + + + + + + package + + shade + + + + + + reference.conf + + + log4j2.properties + + + + + + + + + + + + diff --git a/connector/kinesis-asl/pom.xml b/connector/kinesis-asl/pom.xml new file mode 
100644 index 0000000000000..ce21709ea2e73 --- /dev/null +++ b/connector/kinesis-asl/pom.xml @@ -0,0 +1,110 @@ + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + + spark-streaming-kinesis-asl_2.12 + jar + Spark Kinesis Integration + + + streaming-kinesis-asl + + + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-streaming_${scala.binary.version} + ${project.version} + test-jar + test + + + com.amazonaws + amazon-kinesis-client + ${aws.kinesis.client.version} + + + com.amazonaws + aws-java-sdk-sts + ${aws.java.sdk.version} + + + com.amazonaws + amazon-kinesis-producer + ${aws.kinesis.producer.version} + test + + + + com.fasterxml.jackson.dataformat + jackson-dataformat-cbor + ${fasterxml.jackson.version} + + + org.mockito + mockito-core + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.apache.spark + spark-tags_${scala.binary.version} + + + + + org.apache.spark + spark-tags_${scala.binary.version} + test-jar + test + + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + diff --git a/external/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java b/connector/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java similarity index 100% rename from external/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java rename to connector/kinesis-asl/src/main/java/org/apache/spark/examples/streaming/JavaKinesisWordCountASL.java diff --git a/external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java b/connector/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java similarity index 100% rename from external/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java rename to connector/kinesis-asl/src/main/java/org/apache/spark/streaming/kinesis/KinesisInitialPositions.java diff --git a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py b/connector/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py similarity index 94% rename from external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py rename to connector/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py index e66763538d15a..53a6b69dc93a8 100644 --- a/external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py +++ b/connector/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py @@ -36,8 +36,8 @@ # run the example $ bin/spark-submit --jars \ - 'external/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar' \ - external/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py \ + 'connector/kinesis-asl-assembly/target/spark-streaming-kinesis-asl-assembly_*.jar' \ + connector/kinesis-asl/src/main/python/examples/streaming/kinesis_wordcount_asl.py \ myAppName mySparkStream https://kinesis.us-east-1.amazonaws.com us-east-1 There is a companion helper class called KinesisWordProducerASL which puts dummy data diff --git a/external/kinesis-asl/src/main/resources/log4j2.properties b/connector/kinesis-asl/src/main/resources/log4j2.properties similarity index 100% rename from 
external/kinesis-asl/src/main/resources/log4j2.properties rename to connector/kinesis-asl/src/main/resources/log4j2.properties diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisExampleUtils.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisExampleUtils.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisExampleUtils.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisExampleUtils.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/examples/streaming/KinesisWordCountASL.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDD.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointer.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisInputDStream.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReadConfigurations.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReadConfigurations.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReadConfigurations.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReadConfigurations.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisReceiver.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala similarity index 100% rename from 
external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisRecordProcessor.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisTestUtils.scala diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtilsPythonHelper.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtilsPythonHelper.scala similarity index 91% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtilsPythonHelper.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtilsPythonHelper.scala index 0056438c4eefb..8abaef6b834eb 100644 --- a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtilsPythonHelper.scala +++ b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/KinesisUtilsPythonHelper.scala @@ -17,6 +17,7 @@ package org.apache.spark.streaming.kinesis import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream +import com.amazonaws.services.kinesis.metrics.interfaces.MetricsLevel import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.Duration @@ -37,6 +38,7 @@ private class KinesisUtilsPythonHelper { regionName: String, initialPositionInStream: Int, checkpointInterval: Duration, + metricsLevel: Int, storageLevel: StorageLevel, awsAccessKeyId: String, awsSecretKey: String, @@ -64,6 +66,13 @@ private class KinesisUtilsPythonHelper { "InitialPositionInStream.LATEST or InitialPositionInStream.TRIM_HORIZON") } + val cloudWatchMetricsLevel = metricsLevel match { + case 0 => MetricsLevel.DETAILED + case 1 => MetricsLevel.SUMMARY + case 2 => MetricsLevel.NONE + case _ => MetricsLevel.DETAILED + } + val builder = KinesisInputDStream.builder. streamingContext(jssc). checkpointAppName(kinesisAppName). @@ -72,6 +81,7 @@ private class KinesisUtilsPythonHelper { regionName(regionName). initialPosition(KinesisInitialPositions.fromKinesisInitialPosition(kinesisInitialPosition)). checkpointInterval(checkpointInterval). + metricsLevel(cloudWatchMetricsLevel). 
storageLevel(storageLevel) if (stsAssumeRoleArn != null && stsSessionName != null && stsExternalId != null) { diff --git a/external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentials.scala b/connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentials.scala similarity index 100% rename from external/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentials.scala rename to connector/kinesis-asl/src/main/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentials.scala diff --git a/external/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java b/connector/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java similarity index 100% rename from external/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java rename to connector/kinesis-asl/src/test/java/org/apache/spark/streaming/kinesis/JavaKinesisInputDStreamBuilderSuite.java diff --git a/external/kinesis-asl/src/test/resources/log4j2.properties b/connector/kinesis-asl/src/test/resources/log4j2.properties similarity index 100% rename from external/kinesis-asl/src/test/resources/log4j2.properties rename to connector/kinesis-asl/src/test/resources/log4j2.properties diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KPLBasedKinesisTestUtils.scala diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala similarity index 99% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala index 12d950096b4c2..4b3b7454b861b 100644 --- a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala +++ b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisBackedBlockRDDSuite.scala @@ -96,7 +96,7 @@ abstract class KinesisBackedBlockRDDTests(aggregateTestData: Boolean) allRanges.map { range => SequenceNumberRanges(Array(range)) }.toArray ).map { bytes => new String(bytes).toInt }.collectPartitions() assert(receivedData3.length === allRanges.size) - for (i <- 0 until allRanges.size) { + for (i <- allRanges.indices) { assert(receivedData3(i).toSeq === shardIdToData(allRanges(i).shardId)) } } diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisCheckpointerSuite.scala diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala 
b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisFunSuite.scala diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisInputDStreamBuilderSuite.scala diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisReceiverSuite.scala diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/KinesisStreamSuite.scala diff --git a/external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentialsBuilderSuite.scala b/connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentialsBuilderSuite.scala similarity index 100% rename from external/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentialsBuilderSuite.scala rename to connector/kinesis-asl/src/test/scala/org/apache/spark/streaming/kinesis/SparkAWSCredentialsBuilderSuite.scala diff --git a/connector/protobuf/README.md b/connector/protobuf/README.md new file mode 100644 index 0000000000000..9dd0a2457db6f --- /dev/null +++ b/connector/protobuf/README.md @@ -0,0 +1,36 @@ +# Spark Protobuf - Developer Documentation + +## Getting Started + +### Build + +```bash +./build/mvn clean package +``` + +or + +```bash +./build/sbt clean package +``` + +### Build with user-defined `protoc` + +When the official `protoc` binaries cannot be used to build the `protobuf` module in the compilation environment, +for example when compiling the `protobuf` module on CentOS 6 or CentOS 7, where the default `glibc` version is less than 2.14, we can compile and test by +specifying a user-defined `protoc` binary as follows: + +```bash +export SPARK_PROTOC_EXEC_PATH=/path-to-protoc-exe +./build/mvn -Phive -Puser-defined-protoc clean package +``` + +or + +```bash +export SPARK_PROTOC_EXEC_PATH=/path-to-protoc-exe +./build/sbt -Puser-defined-protoc clean package +``` + +A user-defined `protoc` binary can be produced in the compilation environment by building protobuf from source; +for the compilation steps, please refer to [protobuf](https://github.com/protocolbuffers/protobuf).
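For orientation, here is a minimal usage sketch of the `from_protobuf` / `to_protobuf` functions introduced later in this patch (in `connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/functions.scala`). The descriptor path, message name, and input DataFrame are placeholder assumptions, not part of the patch itself:

```scala
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.protobuf.functions.{from_protobuf, to_protobuf}

object SparkProtobufSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("spark-protobuf-sketch").getOrCreate()
    import spark.implicits._

    // Assumed inputs: a descriptor set generated with `protoc --descriptor_set_out=...`
    // and the name of a message defined in it. Both values are placeholders.
    val descFilePath = "/tmp/example.desc"
    val messageName = "ExampleEvent"

    // A DataFrame with a binary column holding serialized protobuf records.
    val df = Seq.empty[Array[Byte]].toDF("payload")

    // Binary -> Catalyst struct, resolved against the descriptor file.
    val parsed = df.select(from_protobuf($"payload", messageName, descFilePath).as("event"))

    // Catalyst struct -> binary, round-tripping through the same message type.
    val serialized = parsed.select(to_protobuf($"event", messageName, descFilePath).as("payload"))

    serialized.printSchema()
    spark.stop()
  }
}
```

Both functions also have overloads that take an options map; judging from `ProtobufDataToCatalyst` and `ProtobufOptions` later in this patch, `"mode" -> "PERMISSIVE"` produces null rows for malformed records instead of failing, and `"recursive.fields.max.depth"` bounds recursion when converting the Protobuf schema.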
diff --git a/connector/protobuf/pom.xml b/connector/protobuf/pom.xml new file mode 100644 index 0000000000000..0154b0ff15bb8 --- /dev/null +++ b/connector/protobuf/pom.xml @@ -0,0 +1,189 @@ + + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + spark-protobuf_2.12 + + protobuf + + jar + Spark Protobuf + https://spark.apache.org/ + + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + provided + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-catalyst_${scala.binary.version} + ${project.version} + test-jar + test + + + org.apache.spark + spark-sql_${scala.binary.version} + ${project.version} + test-jar + test + + + org.scalacheck + scalacheck_${scala.binary.version} + test + + + org.apache.spark + spark-tags_${scala.binary.version} + + + + com.google.protobuf + protobuf-java + ${protobuf.version} + compile + + + + target/scala-${scala.binary.version}/classes + target/scala-${scala.binary.version}/test-classes + + + org.apache.maven.plugins + maven-shade-plugin + + false + true + + + com.google.protobuf:* + + + + + com.google.protobuf + ${spark.shade.packageName}.spark_protobuf.protobuf + + com.google.protobuf.** + + + + + + *:* + + google/protobuf/** + + + + + + + + + + default-protoc + + true + + + + + com.github.os72 + protoc-jar-maven-plugin + ${protoc-jar-maven-plugin.version} + + + + generate-test-sources + + run + + + com.google.protobuf:protoc:${protobuf.version} + ${protobuf.version} + + src/test/resources/protobuf + + test + + + + + + + + + user-defined-protoc + + ${env.SPARK_PROTOC_EXEC_PATH} + + + + + com.github.os72 + protoc-jar-maven-plugin + ${protoc-jar-maven-plugin.version} + + + + generate-test-sources + + run + + + com.google.protobuf:protoc:${protobuf.version} + ${protobuf.version} + ${spark.protoc.executable.path} + + src/test/resources/protobuf + + test + + + + + + + + + diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/CatalystDataToProtobuf.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/CatalystDataToProtobuf.scala new file mode 100644 index 0000000000000..12561fe51e655 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/CatalystDataToProtobuf.scala @@ -0,0 +1,55 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.protobuf + +import com.google.protobuf.DynamicMessage + +import org.apache.spark.sql.catalyst.expressions.{Expression, UnaryExpression} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} +import org.apache.spark.sql.protobuf.utils.ProtobufUtils +import org.apache.spark.sql.types.{BinaryType, DataType} + +private[protobuf] case class CatalystDataToProtobuf( + child: Expression, + messageName: String, + descFilePath: Option[String] = None, + options: Map[String, String] = Map.empty) + extends UnaryExpression { + + override def dataType: DataType = BinaryType + + @transient private lazy val protoDescriptor = + ProtobufUtils.buildDescriptor(messageName, descFilePathOpt = descFilePath) + + @transient private lazy val serializer = + new ProtobufSerializer(child.dataType, protoDescriptor, child.nullable) + + override def nullSafeEval(input: Any): Any = { + val dynamicMessage = serializer.serialize(input).asInstanceOf[DynamicMessage] + dynamicMessage.toByteArray + } + + override def prettyName: String = "to_protobuf" + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val expr = ctx.addReferenceObj("this", this) + defineCodeGen(ctx, ev, input => s"(byte[]) $expr.nullSafeEval($input)") + } + + override protected def withNewChildInternal(newChild: Expression): CatalystDataToProtobuf = + copy(child = newChild) +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala new file mode 100644 index 0000000000000..da44f94d5eac2 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDataToCatalyst.scala @@ -0,0 +1,154 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.protobuf + +import scala.collection.JavaConverters._ +import scala.util.control.NonFatal + +import com.google.protobuf.DynamicMessage + +import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, SpecificInternalRow, UnaryExpression} +import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode} +import org.apache.spark.sql.catalyst.util.{FailFastMode, ParseMode, PermissiveMode} +import org.apache.spark.sql.errors.{QueryCompilationErrors, QueryExecutionErrors} +import org.apache.spark.sql.protobuf.utils.{ProtobufOptions, ProtobufUtils, SchemaConverters} +import org.apache.spark.sql.types.{AbstractDataType, BinaryType, DataType, StructType} + +private[protobuf] case class ProtobufDataToCatalyst( + child: Expression, + messageName: String, + descFilePath: Option[String] = None, + options: Map[String, String] = Map.empty) + extends UnaryExpression + with ExpectsInputTypes { + + override def inputTypes: Seq[AbstractDataType] = Seq(BinaryType) + + override lazy val dataType: DataType = { + val dt = SchemaConverters.toSqlType(messageDescriptor, protobufOptions).dataType + parseMode match { + // With PermissiveMode, the output Catalyst row might contain columns of null values for + // corrupt records, even if some of the columns are not nullable in the user-provided schema. + // Therefore we force the schema to be all nullable here. + case PermissiveMode => dt.asNullable + case _ => dt + } + } + + override def nullable: Boolean = true + + private lazy val protobufOptions = ProtobufOptions(options) + + @transient private lazy val messageDescriptor = + ProtobufUtils.buildDescriptor(messageName, descFilePath) + // TODO: Avoid carrying the file name. Read the contents of descriptor file only once + // at the start. Rest of the runs should reuse the buffer. Otherwise, it could + // cause inconsistencies if the file contents are changed the user after a few days. + // Same for the write side in [[CatalystDataToProtobuf]]. + + @transient private lazy val fieldsNumbers = + messageDescriptor.getFields.asScala.map(f => f.getNumber).toSet + + @transient private lazy val deserializer = new ProtobufDeserializer(messageDescriptor, dataType) + + @transient private var result: DynamicMessage = _ + + @transient private lazy val parseMode: ParseMode = { + val mode = protobufOptions.parseMode + if (mode != PermissiveMode && mode != FailFastMode) { + throw QueryCompilationErrors.parseModeUnsupportedError(prettyName, mode) + } + mode + } + + @transient private lazy val nullResultRow: Any = dataType match { + case st: StructType => + val resultRow = new SpecificInternalRow(st.map(_.dataType)) + for (i <- 0 until st.length) { + resultRow.setNullAt(i) + } + resultRow + + case _ => + null + } + + private def handleException(e: Throwable): Any = { + parseMode match { + case PermissiveMode => + nullResultRow + case FailFastMode => + throw QueryExecutionErrors.malformedProtobufMessageDetectedInMessageParsingError(e) + case _ => + throw QueryCompilationErrors.parseModeUnsupportedError(prettyName, parseMode) + } + } + + override def nullSafeEval(input: Any): Any = { + val binary = input.asInstanceOf[Array[Byte]] + try { + result = DynamicMessage.parseFrom(messageDescriptor, binary) + // If the Java class is available, it is likely more efficient to parse with it than using + // DynamicMessage. Can consider it in the future if parsing overhead is noticeable. 
+ + result.getUnknownFields.asMap().keySet().asScala.find(fieldsNumbers.contains(_)) match { + case Some(number) => + // Unknown fields contain a field with same number as a known field. Must be due to + // mismatch of schema between writer and reader here. + throw QueryCompilationErrors.protobufFieldTypeMismatchError( + messageDescriptor.getFields.get(number).toString) + case None => + } + + val deserialized = deserializer.deserialize(result) + assert( + deserialized.isDefined, + "Protobuf deserializer cannot return an empty result because filters are not pushed down") + deserialized.get + } catch { + // There could be multiple possible exceptions here, e.g. java.io.IOException, + // ProtoRuntimeException, ArrayIndexOutOfBoundsException, etc. + // To make it simple, catch all the exceptions here. + case NonFatal(e) => + handleException(e) + } + } + + override def prettyName: String = "from_protobuf" + + override protected def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { + val expr = ctx.addReferenceObj("this", this) + nullSafeCodeGen( + ctx, + ev, + eval => { + val result = ctx.freshName("result") + val dt = CodeGenerator.boxedType(dataType) + s""" + $dt $result = ($dt) $expr.nullSafeEval($eval); + if ($result == null) { + ${ev.isNull} = true; + } else { + ${ev.value} = $result; + } + """ + }) + } + + override protected def withNewChildInternal(newChild: Expression): ProtobufDataToCatalyst = + copy(child = newChild) +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala new file mode 100644 index 0000000000000..7723687a4d95c --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufDeserializer.scala @@ -0,0 +1,358 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.protobuf + +import java.util.concurrent.TimeUnit + +import com.google.protobuf.{ByteString, DynamicMessage, Message} +import com.google.protobuf.Descriptors._ +import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._ + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters, StructFilters} +import org.apache.spark.sql.catalyst.expressions.{SpecificInternalRow, UnsafeArrayData} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, ArrayData, DateTimeUtils, GenericArrayData} +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.protobuf.utils.ProtobufUtils +import org.apache.spark.sql.protobuf.utils.ProtobufUtils.ProtoMatchedField +import org.apache.spark.sql.protobuf.utils.ProtobufUtils.toFieldStr +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +private[sql] class ProtobufDeserializer( + rootDescriptor: Descriptor, + rootCatalystType: DataType, + filters: StructFilters) { + + def this(rootDescriptor: Descriptor, rootCatalystType: DataType) = { + this(rootDescriptor, rootCatalystType, new NoopFilters) + } + + private val converter: Any => Option[InternalRow] = + try { + rootCatalystType match { + // A shortcut for empty schema. + case st: StructType if st.isEmpty => + (_: Any) => Some(InternalRow.empty) + + case st: StructType => + val resultRow = new SpecificInternalRow(st.map(_.dataType)) + val fieldUpdater = new RowUpdater(resultRow) + val applyFilters = filters.skipRow(resultRow, _) + val writer = getRecordWriter(rootDescriptor, st, Nil, Nil, applyFilters) + (data: Any) => { + val record = data.asInstanceOf[DynamicMessage] + val skipRow = writer(fieldUpdater, record) + if (skipRow) None else Some(resultRow) + } + } + } catch { + case ise: AnalysisException => + throw QueryCompilationErrors.cannotConvertProtobufTypeToCatalystTypeError( + rootDescriptor.getName, + rootCatalystType, + ise) + } + + def deserialize(data: Message): Option[InternalRow] = converter(data) + + private def newArrayWriter( + protoField: FieldDescriptor, + protoPath: Seq[String], + catalystPath: Seq[String], + elementType: DataType, + containsNull: Boolean): (CatalystDataUpdater, Int, Any) => Unit = { + + val protoElementPath = protoPath :+ "element" + val elementWriter = + newWriter(protoField, elementType, protoElementPath, catalystPath :+ "element") + (updater, ordinal, value) => + val collection = value.asInstanceOf[java.util.Collection[Any]] + val result = createArrayData(elementType, collection.size()) + val elementUpdater = new ArrayDataUpdater(result) + + var i = 0 + val iterator = collection.iterator() + while (iterator.hasNext) { + val element = iterator.next() + if (element == null) { + if (!containsNull) { + throw QueryCompilationErrors.notNullConstraintViolationArrayElementError( + protoElementPath) + } else { + elementUpdater.setNullAt(i) + } + } else { + elementWriter(elementUpdater, i, element) + } + i += 1 + } + + updater.set(ordinal, result) + } + + private def newMapWriter( + protoType: FieldDescriptor, + protoPath: Seq[String], + catalystPath: Seq[String], + keyType: DataType, + valueType: DataType, + valueContainsNull: Boolean): (CatalystDataUpdater, Int, Any) => Unit = { + val keyField = protoType.getMessageType.getFields.get(0) + val valueField = protoType.getMessageType.getFields.get(1) + val keyWriter = newWriter(keyField, keyType, protoPath :+ "key", catalystPath :+ "key") + val valueWriter = + 
newWriter(valueField, valueType, protoPath :+ "value", catalystPath :+ "value") + (updater, ordinal, value) => + if (value != null) { + val messageList = value.asInstanceOf[java.util.List[com.google.protobuf.Message]] + val valueArray = createArrayData(valueType, messageList.size()) + val valueUpdater = new ArrayDataUpdater(valueArray) + val keyArray = createArrayData(keyType, messageList.size()) + val keyUpdater = new ArrayDataUpdater(keyArray) + var i = 0 + messageList.forEach { field => + { + keyWriter(keyUpdater, i, field.getField(keyField)) + if (field.getField(valueField) == null) { + if (!valueContainsNull) { + throw QueryCompilationErrors.notNullConstraintViolationMapValueError(protoPath) + } else { + valueUpdater.setNullAt(i) + } + } else { + valueWriter(valueUpdater, i, field.getField(valueField)) + } + } + i += 1 + } + updater.set(ordinal, new ArrayBasedMapData(keyArray, valueArray)) + } + } + + /** + * Creates a writer to write Protobuf values to Catalyst values at the given ordinal with the + * given updater. + */ + private def newWriter( + protoType: FieldDescriptor, + catalystType: DataType, + protoPath: Seq[String], + catalystPath: Seq[String]): (CatalystDataUpdater, Int, Any) => Unit = { + + (protoType.getJavaType, catalystType) match { + + case (null, NullType) => (updater, ordinal, _) => updater.setNullAt(ordinal) + + // TODO: we can avoid boxing if future version of Protobuf provide primitive accessors. + case (BOOLEAN, BooleanType) => + (updater, ordinal, value) => updater.setBoolean(ordinal, value.asInstanceOf[Boolean]) + + case (INT, IntegerType) => + (updater, ordinal, value) => updater.setInt(ordinal, value.asInstanceOf[Int]) + + case (INT, ByteType) => + (updater, ordinal, value) => updater.setByte(ordinal, value.asInstanceOf[Byte]) + + case (INT, ShortType) => + (updater, ordinal, value) => updater.setShort(ordinal, value.asInstanceOf[Short]) + + case ( + MESSAGE | BOOLEAN | INT | FLOAT | DOUBLE | LONG | STRING | ENUM | BYTE_STRING, + ArrayType(dataType: DataType, containsNull)) if protoType.isRepeated => + newArrayWriter(protoType, protoPath, catalystPath, dataType, containsNull) + + case (LONG, LongType) => + (updater, ordinal, value) => updater.setLong(ordinal, value.asInstanceOf[Long]) + + case (FLOAT, FloatType) => + (updater, ordinal, value) => updater.setFloat(ordinal, value.asInstanceOf[Float]) + + case (DOUBLE, DoubleType) => + (updater, ordinal, value) => updater.setDouble(ordinal, value.asInstanceOf[Double]) + + case (STRING, StringType) => + (updater, ordinal, value) => + val str = value match { + case s: String => UTF8String.fromString(s) + } + updater.set(ordinal, str) + + case (BYTE_STRING, BinaryType) => + (updater, ordinal, value) => + val byte_array = value match { + case s: ByteString => s.toByteArray + case unsupported => + throw QueryCompilationErrors.invalidByteStringFormatError(unsupported) + } + updater.set(ordinal, byte_array) + + case (MESSAGE, MapType(keyType, valueType, valueContainsNull)) => + newMapWriter(protoType, protoPath, catalystPath, keyType, valueType, valueContainsNull) + + case (MESSAGE, TimestampType) => + (updater, ordinal, value) => + val secondsField = protoType.getMessageType.getFields.get(0) + val nanoSecondsField = protoType.getMessageType.getFields.get(1) + val message = value.asInstanceOf[DynamicMessage] + val seconds = message.getField(secondsField).asInstanceOf[Long] + val nanoSeconds = message.getField(nanoSecondsField).asInstanceOf[Int] + val micros = DateTimeUtils.millisToMicros(seconds * 1000) + 
updater.setLong(ordinal, micros + TimeUnit.NANOSECONDS.toMicros(nanoSeconds)) + + case (MESSAGE, DayTimeIntervalType(startField, endField)) => + (updater, ordinal, value) => + val secondsField = protoType.getMessageType.getFields.get(0) + val nanoSecondsField = protoType.getMessageType.getFields.get(1) + val message = value.asInstanceOf[DynamicMessage] + val seconds = message.getField(secondsField).asInstanceOf[Long] + val nanoSeconds = message.getField(nanoSecondsField).asInstanceOf[Int] + val micros = DateTimeUtils.millisToMicros(seconds * 1000) + updater.setLong(ordinal, micros + TimeUnit.NANOSECONDS.toMicros(nanoSeconds)) + + case (MESSAGE, st: StructType) => + val writeRecord = getRecordWriter( + protoType.getMessageType, + st, + protoPath, + catalystPath, + applyFilters = _ => false) + (updater, ordinal, value) => + val row = new SpecificInternalRow(st) + writeRecord(new RowUpdater(row), value.asInstanceOf[DynamicMessage]) + updater.set(ordinal, row) + + case (ENUM, StringType) => + (updater, ordinal, value) => updater.set(ordinal, UTF8String.fromString(value.toString)) + + case _ => + throw QueryCompilationErrors.cannotConvertProtobufTypeToSqlTypeError( + toFieldStr(protoPath), + catalystPath, + s"${protoType} ${protoType.toProto.getLabel} ${protoType.getJavaType}" + + s" ${protoType.getType}", + catalystType) + } + } + + private def getRecordWriter( + protoType: Descriptor, + catalystType: StructType, + protoPath: Seq[String], + catalystPath: Seq[String], + applyFilters: Int => Boolean): (CatalystDataUpdater, DynamicMessage) => Boolean = { + + val protoSchemaHelper = + new ProtobufUtils.ProtoSchemaHelper(protoType, catalystType, protoPath, catalystPath) + + // TODO revisit validation of protobuf-catalyst fields. + // protoSchemaHelper.validateNoExtraCatalystFields(ignoreNullable = true) + + var i = 0 + val (validFieldIndexes, fieldWriters) = protoSchemaHelper.matchedFields + .map { case ProtoMatchedField(catalystField, ordinal, protoField) => + val baseWriter = newWriter( + protoField, + catalystField.dataType, + protoPath :+ protoField.getName, + catalystPath :+ catalystField.name) + val fieldWriter = (fieldUpdater: CatalystDataUpdater, value: Any) => { + if (value == null) { + fieldUpdater.setNullAt(ordinal) + } else { + baseWriter(fieldUpdater, ordinal, value) + } + } + i += 1 + (protoField, fieldWriter) + } + .toArray + .unzip + + (fieldUpdater, record) => { + var i = 0 + var skipRow = false + while (i < validFieldIndexes.length && !skipRow) { + val field = validFieldIndexes(i) + val value = if (field.isRepeated || field.hasDefaultValue || record.hasField(field)) { + record.getField(field) + } else null + fieldWriters(i)(fieldUpdater, value) + skipRow = applyFilters(i) + i += 1 + } + skipRow + } + } + + // TODO: All of the code below this line is same between protobuf and avro, it can be shared. 
+ private def createArrayData(elementType: DataType, length: Int): ArrayData = elementType match { + case BooleanType => UnsafeArrayData.fromPrimitiveArray(new Array[Boolean](length)) + case ByteType => UnsafeArrayData.fromPrimitiveArray(new Array[Byte](length)) + case ShortType => UnsafeArrayData.fromPrimitiveArray(new Array[Short](length)) + case IntegerType => UnsafeArrayData.fromPrimitiveArray(new Array[Int](length)) + case LongType => UnsafeArrayData.fromPrimitiveArray(new Array[Long](length)) + case FloatType => UnsafeArrayData.fromPrimitiveArray(new Array[Float](length)) + case DoubleType => UnsafeArrayData.fromPrimitiveArray(new Array[Double](length)) + case _ => new GenericArrayData(new Array[Any](length)) + } + + /** + * A base interface for updating values inside catalyst data structure like `InternalRow` and + * `ArrayData`. + */ + sealed trait CatalystDataUpdater { + def set(ordinal: Int, value: Any): Unit + def setNullAt(ordinal: Int): Unit = set(ordinal, null) + def setBoolean(ordinal: Int, value: Boolean): Unit = set(ordinal, value) + def setByte(ordinal: Int, value: Byte): Unit = set(ordinal, value) + def setShort(ordinal: Int, value: Short): Unit = set(ordinal, value) + def setInt(ordinal: Int, value: Int): Unit = set(ordinal, value) + def setLong(ordinal: Int, value: Long): Unit = set(ordinal, value) + def setDouble(ordinal: Int, value: Double): Unit = set(ordinal, value) + def setFloat(ordinal: Int, value: Float): Unit = set(ordinal, value) + def setDecimal(ordinal: Int, value: Decimal): Unit = set(ordinal, value) + } + + final class RowUpdater(row: InternalRow) extends CatalystDataUpdater { + override def set(ordinal: Int, value: Any): Unit = row.update(ordinal, value) + override def setNullAt(ordinal: Int): Unit = row.setNullAt(ordinal) + override def setBoolean(ordinal: Int, value: Boolean): Unit = row.setBoolean(ordinal, value) + override def setByte(ordinal: Int, value: Byte): Unit = row.setByte(ordinal, value) + override def setShort(ordinal: Int, value: Short): Unit = row.setShort(ordinal, value) + override def setInt(ordinal: Int, value: Int): Unit = row.setInt(ordinal, value) + override def setLong(ordinal: Int, value: Long): Unit = row.setLong(ordinal, value) + override def setDouble(ordinal: Int, value: Double): Unit = row.setDouble(ordinal, value) + override def setFloat(ordinal: Int, value: Float): Unit = row.setFloat(ordinal, value) + override def setDecimal(ordinal: Int, value: Decimal): Unit = + row.setDecimal(ordinal, value, value.precision) + } + + final class ArrayDataUpdater(array: ArrayData) extends CatalystDataUpdater { + override def set(ordinal: Int, value: Any): Unit = array.update(ordinal, value) + override def setNullAt(ordinal: Int): Unit = array.setNullAt(ordinal) + override def setBoolean(ordinal: Int, value: Boolean): Unit = array.setBoolean(ordinal, value) + override def setByte(ordinal: Int, value: Byte): Unit = array.setByte(ordinal, value) + override def setShort(ordinal: Int, value: Short): Unit = array.setShort(ordinal, value) + override def setInt(ordinal: Int, value: Int): Unit = array.setInt(ordinal, value) + override def setLong(ordinal: Int, value: Long): Unit = array.setLong(ordinal, value) + override def setDouble(ordinal: Int, value: Double): Unit = array.setDouble(ordinal, value) + override def setFloat(ordinal: Int, value: Float): Unit = array.setFloat(ordinal, value) + override def setDecimal(ordinal: Int, value: Decimal): Unit = array.update(ordinal, value) + } + +} diff --git 
a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala new file mode 100644 index 0000000000000..0f87c640b194b --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/ProtobufSerializer.scala @@ -0,0 +1,271 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.protobuf + +import scala.collection.JavaConverters._ + +import com.google.protobuf.{Duration, DynamicMessage, Timestamp} +import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} +import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._ + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.InternalRow +import org.apache.spark.sql.catalyst.expressions.SpecializedGetters +import org.apache.spark.sql.catalyst.util.{DateTimeUtils, IntervalUtils} +import org.apache.spark.sql.catalyst.util.IntervalStringStyles.ANSI_STYLE +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.protobuf.utils.ProtobufUtils +import org.apache.spark.sql.protobuf.utils.ProtobufUtils.{toFieldStr, ProtoMatchedField} +import org.apache.spark.sql.types._ + +/** + * A serializer to serialize data in catalyst format to data in Protobuf format. 
+ */ +private[sql] class ProtobufSerializer( + rootCatalystType: DataType, + rootDescriptor: Descriptor, + nullable: Boolean) + extends Logging { + + def serialize(catalystData: Any): Any = { + converter.apply(catalystData) + } + + private val converter: Any => Any = { + val baseConverter = + try { + rootCatalystType match { + case st: StructType => + newStructConverter(st, rootDescriptor, Nil, Nil).asInstanceOf[Any => Any] + } + } catch { + case ise: AnalysisException => + throw QueryCompilationErrors.cannotConvertSqlTypeToProtobufError( + rootDescriptor.getName, + rootCatalystType, + ise) + } + if (nullable) { (data: Any) => + if (data == null) { + null + } else { + baseConverter.apply(data) + } + } else { + baseConverter + } + } + + private type Converter = (SpecializedGetters, Int) => Any + + private def newConverter( + catalystType: DataType, + fieldDescriptor: FieldDescriptor, + catalystPath: Seq[String], + protoPath: Seq[String]): Converter = { + (catalystType, fieldDescriptor.getJavaType) match { + case (NullType, _) => + (getter, ordinal) => null + case (BooleanType, BOOLEAN) => + (getter, ordinal) => getter.getBoolean(ordinal) + case (ByteType, INT) => + (getter, ordinal) => getter.getByte(ordinal).toInt + case (ShortType, INT) => + (getter, ordinal) => getter.getShort(ordinal).toInt + case (IntegerType, INT) => + (getter, ordinal) => { + getter.getInt(ordinal) + } + case (LongType, LONG) => + (getter, ordinal) => getter.getLong(ordinal) + case (FloatType, FLOAT) => + (getter, ordinal) => getter.getFloat(ordinal) + case (DoubleType, DOUBLE) => + (getter, ordinal) => getter.getDouble(ordinal) + case (StringType, ENUM) => + val enumSymbols: Set[String] = + fieldDescriptor.getEnumType.getValues.asScala.map(e => e.toString).toSet + (getter, ordinal) => + val data = getter.getUTF8String(ordinal).toString + if (!enumSymbols.contains(data)) { + throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufEnumTypeError( + catalystPath, + toFieldStr(protoPath), + data, + enumSymbols.mkString("\"", "\", \"", "\"")) + } + fieldDescriptor.getEnumType.findValueByName(data) + case (StringType, STRING) => + (getter, ordinal) => { + String.valueOf(getter.getUTF8String(ordinal)) + } + + case (BinaryType, BYTE_STRING) => + (getter, ordinal) => getter.getBinary(ordinal) + + case (DateType, INT) => + (getter, ordinal) => getter.getInt(ordinal) + + case (TimestampType, MESSAGE) => + (getter, ordinal) => + val millis = DateTimeUtils.microsToMillis(getter.getLong(ordinal)) + Timestamp + .newBuilder() + .setSeconds((millis / 1000)) + .setNanos(((millis % 1000) * 1000000).toInt) + .build() + + case (ArrayType(et, containsNull), _) => + val elementConverter = + newConverter(et, fieldDescriptor, catalystPath :+ "element", protoPath :+ "element") + (getter, ordinal) => { + val arrayData = getter.getArray(ordinal) + val len = arrayData.numElements() + val result = new Array[Any](len) + var i = 0 + while (i < len) { + if (containsNull && arrayData.isNullAt(i)) { + result(i) = null + } else { + result(i) = elementConverter(arrayData, i) + } + i += 1 + } + // Protobuf writer is expecting a Java Collection, so we convert it into + // `ArrayList` backed by the specified array without data copying. 
+ java.util.Arrays.asList(result: _*) + } + + case (st: StructType, MESSAGE) => + val structConverter = + newStructConverter(st, fieldDescriptor.getMessageType, catalystPath, protoPath) + val numFields = st.length + (getter, ordinal) => structConverter(getter.getStruct(ordinal, numFields)) + + case (MapType(kt, vt, valueContainsNull), MESSAGE) => + var keyField: FieldDescriptor = null + var valueField: FieldDescriptor = null + fieldDescriptor.getMessageType.getFields.asScala.map { field => + field.getName match { + case "key" => + keyField = field + case "value" => + valueField = field + } + } + + val keyConverter = newConverter(kt, keyField, catalystPath :+ "key", protoPath :+ "key") + val valueConverter = + newConverter(vt, valueField, catalystPath :+ "value", protoPath :+ "value") + + (getter, ordinal) => + val mapData = getter.getMap(ordinal) + val len = mapData.numElements() + val list = new java.util.ArrayList[DynamicMessage]() + val keyArray = mapData.keyArray() + val valueArray = mapData.valueArray() + var i = 0 + while (i < len) { + val result = DynamicMessage.newBuilder(fieldDescriptor.getMessageType) + if (valueContainsNull && valueArray.isNullAt(i)) { + result.setField(keyField, keyConverter(keyArray, i)) + result.setField(valueField, valueField.getDefaultValue) + } else { + result.setField(keyField, keyConverter(keyArray, i)) + result.setField(valueField, valueConverter(valueArray, i)) + } + list.add(result.build()) + i += 1 + } + list + + case (DayTimeIntervalType(startField, endField), MESSAGE) => + (getter, ordinal) => + val dayTimeIntervalString = + IntervalUtils.toDayTimeIntervalString(getter.getLong(ordinal) + , ANSI_STYLE, startField, endField) + val calendarInterval = IntervalUtils.fromIntervalString(dayTimeIntervalString) + + val millis = DateTimeUtils.microsToMillis(calendarInterval.microseconds) + val duration = Duration + .newBuilder() + .setSeconds((millis / 1000)) + .setNanos(((millis % 1000) * 1000000).toInt) + + if (duration.getSeconds < 0 && duration.getNanos > 0) { + duration.setSeconds(duration.getSeconds + 1) + duration.setNanos(duration.getNanos - 1000000000) + } else if (duration.getSeconds > 0 && duration.getNanos < 0) { + duration.setSeconds(duration.getSeconds - 1) + duration.setNanos(duration.getNanos + 1000000000) + } + duration.build() + + case _ => + throw QueryCompilationErrors.cannotConvertCatalystTypeToProtobufTypeError( + catalystPath, + toFieldStr(protoPath), + catalystType, + s"${fieldDescriptor} ${fieldDescriptor.toProto.getLabel} ${fieldDescriptor.getJavaType}" + + s" ${fieldDescriptor.getType}") + } + } + + private def newStructConverter( + catalystStruct: StructType, + descriptor: Descriptor, + catalystPath: Seq[String], + protoPath: Seq[String]): InternalRow => DynamicMessage = { + + val protoSchemaHelper = + new ProtobufUtils.ProtoSchemaHelper(descriptor, catalystStruct, protoPath, catalystPath) + + protoSchemaHelper.validateNoExtraCatalystFields(ignoreNullable = false) + protoSchemaHelper.validateNoExtraRequiredProtoFields() + + val (protoIndices, fieldConverters: Array[Converter]) = protoSchemaHelper.matchedFields + .map { case ProtoMatchedField(catalystField, _, protoField) => + val converter = newConverter( + catalystField.dataType, + protoField, + catalystPath :+ catalystField.name, + protoPath :+ protoField.getName) + (protoField, converter) + } + .toArray + .unzip + + val numFields = catalystStruct.length + row: InternalRow => + val result = DynamicMessage.newBuilder(descriptor) + var i = 0 + while (i < numFields) { + if 
(row.isNullAt(i)) { + if (!protoIndices(i).isRepeated() && + protoIndices(i).getJavaType() != FieldDescriptor.JavaType.MESSAGE && + protoIndices(i).isRequired()) { + result.setField(protoIndices(i), protoIndices(i).getDefaultValue()) + } + } else { + result.setField(protoIndices(i), fieldConverters(i).apply(row, i)) + } + i += 1 + } + result.build() + } +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/functions.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/functions.scala new file mode 100644 index 0000000000000..8056082c66ff7 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/functions.scala @@ -0,0 +1,197 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.protobuf + +import scala.collection.JavaConverters._ + +import org.apache.spark.annotation.Experimental +import org.apache.spark.sql.Column + +// scalastyle:off: object.name +object functions { +// scalastyle:on: object.name + + /** + * Converts a binary column of Protobuf format into its corresponding catalyst value. The + * Protobuf definition is provided through Protobuf descriptor file. + * + * @param data + * the binary column. + * @param messageName + * the protobuf message name to look for in descriptor file. + * @param descFilePath + * the protobuf descriptor file. + * @param options + * @since 3.4.0 + */ + @Experimental + def from_protobuf( + data: Column, + messageName: String, + descFilePath: String, + options: java.util.Map[String, String]): Column = { + new Column( + ProtobufDataToCatalyst(data.expr, messageName, Some(descFilePath), options.asScala.toMap) + ) + } + + /** + * Converts a binary column of Protobuf format into its corresponding catalyst value. The + * Protobuf definition is provided through Protobuf descriptor file. + * + * @param data + * the binary column. + * @param messageName + * the protobuf MessageName to look for in descriptor file. + * @param descFilePath + * the protobuf descriptor file. + * @since 3.4.0 + */ + @Experimental + def from_protobuf(data: Column, messageName: String, descFilePath: String): Column = { + new Column(ProtobufDataToCatalyst(data.expr, messageName, descFilePath = Some(descFilePath))) + // TODO: Add an option for user to provide descriptor file content as a buffer. This + // gives flexibility in how the content is fetched. + } + + /** + * Converts a binary column of Protobuf format into its corresponding catalyst value. + * `messageClassName` points to Protobuf Java class. The jar containing Java class should be + * shaded. Specifically, `com.google.protobuf.*` should be shaded to + * `org.sparkproject.spark_protobuf.protobuf.*`. 
+ * https://github.com/rangadi/shaded-protobuf-classes is useful to create shaded jar from + * Protobuf files. + * + * @param data + * the binary column. + * @param messageClassName + * The full name for Protobuf Java class. E.g. com.example.protos.ExampleEvent. + * The jar with these classes needs to be shaded as described above. + * @since 3.4.0 + */ + @Experimental + def from_protobuf(data: Column, messageClassName: String): Column = { + new Column(ProtobufDataToCatalyst(data.expr, messageClassName)) + } + + /** + * Converts a binary column of Protobuf format into its corresponding catalyst value. + * `messageClassName` points to Protobuf Java class. The jar containing Java class should be + * shaded. Specifically, `com.google.protobuf.*` should be shaded to + * `org.sparkproject.spark_protobuf.protobuf.*`. + * https://github.com/rangadi/shaded-protobuf-classes is useful to create shaded jar from + * Protobuf files. + * + * @param data + * the binary column. + * @param messageClassName + * The full name for Protobuf Java class. E.g. com.example.protos.ExampleEvent. + * The jar with these classes needs to be shaded as described above. + * @param options + * @since 3.4.0 + */ + @Experimental + def from_protobuf( + data: Column, + messageClassName: String, + options: java.util.Map[String, String]): Column = { + new Column(ProtobufDataToCatalyst(data.expr, messageClassName, None, options.asScala.toMap)) + } + + /** + * Converts a column into binary of protobuf format. The Protobuf definition is provided + * through Protobuf descriptor file. + * + * @param data + * the data column. + * @param messageName + * the protobuf MessageName to look for in descriptor file. + * @param descFilePath + * the protobuf descriptor file. + * @since 3.4.0 + */ + @Experimental + def to_protobuf(data: Column, messageName: String, descFilePath: String): Column = { + new Column(CatalystDataToProtobuf(data.expr, messageName, Some(descFilePath))) + } + + /** + * Converts a column into binary of protobuf format. The Protobuf definition is provided + * through Protobuf descriptor file. + * + * @param data + * the data column. + * @param messageName + * the protobuf MessageName to look for in descriptor file. + * @param descFilePath + * the protobuf descriptor file. + * @param options + * @since 3.4.0 + */ + @Experimental + def to_protobuf( + data: Column, + messageName: String, + descFilePath: String, + options: java.util.Map[String, String]): Column = { + new Column( + CatalystDataToProtobuf(data.expr, messageName, Some(descFilePath), options.asScala.toMap) + ) + } + + /** + * Converts a column into binary of protobuf format. + * `messageClassName` points to Protobuf Java class. The jar containing Java class should be + * shaded. Specifically, `com.google.protobuf.*` should be shaded to + * `org.sparkproject.spark_protobuf.protobuf.*`. + * https://github.com/rangadi/shaded-protobuf-classes is useful to create shaded jar from + * Protobuf files. + * + * @param data + * the data column. + * @param messageClassName + * The full name for Protobuf Java class. E.g. com.example.protos.ExampleEvent. + * The jar with these classes needs to be shaded as described above. + * @since 3.4.0 + */ + @Experimental + def to_protobuf(data: Column, messageClassName: String): Column = { + new Column(CatalystDataToProtobuf(data.expr, messageClassName)) + } + + /** + * Converts a column into binary of protobuf format. + * `messageClassName` points to Protobuf Java class. The jar containing Java class should be + * shaded. 
Specifically, `com.google.protobuf.*` should be shaded to + * `org.sparkproject.spark_protobuf.protobuf.*`. + * https://github.com/rangadi/shaded-protobuf-classes is useful to create shaded jar from + * Protobuf files. + * + * @param data + * the data column. + * @param messageClassName + * The full name for Protobuf Java class. E.g. com.example.protos.ExampleEvent. + * The jar with these classes needs to be shaded as described above. + * @param options + * @since 3.4.0 + */ + @Experimental + def to_protobuf(data: Column, messageClassName: String, options: java.util.Map[String, String]) + : Column = { + new Column(CatalystDataToProtobuf(data.expr, messageClassName, None, options.asScala.toMap)) + } +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/package.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/package.scala new file mode 100644 index 0000000000000..82cdc6b9c5816 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/package.scala @@ -0,0 +1,21 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql + +package object protobuf { + protected[protobuf] object ScalaReflectionLock +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala new file mode 100644 index 0000000000000..53036668ebf59 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufOptions.scala @@ -0,0 +1,58 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +package org.apache.spark.sql.protobuf.utils + +import org.apache.hadoop.conf.Configuration + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.SparkSession +import org.apache.spark.sql.catalyst.FileSourceOptions +import org.apache.spark.sql.catalyst.util.{CaseInsensitiveMap, FailFastMode, ParseMode} + +/** + * Options for Protobuf Reader and Writer stored in a case-insensitive manner. + */ +private[sql] class ProtobufOptions( + @transient val parameters: CaseInsensitiveMap[String], + @transient val conf: Configuration) + extends FileSourceOptions(parameters) + with Logging { + + def this(parameters: Map[String, String], conf: Configuration) = { + this(CaseInsensitiveMap(parameters), conf) + } + + val parseMode: ParseMode = + parameters.get("mode").map(ParseMode.fromString).getOrElse(FailFastMode) + + // Setting `recursive.fields.max.depth` to 1 allows a field to recurse once, + // and 2 allows it to recurse twice, and so on. A value of `recursive.fields.max.depth` + // greater than 10 is not permitted. If it is not specified, the default value is -1; + // a value of 0 or below disallows any recursive fields. If a protobuf + // record has more depth than the allowed value for recursive fields, it will be truncated + // and the corresponding fields are ignored (dropped). + val recursiveFieldMaxDepth: Int = parameters.getOrElse("recursive.fields.max.depth", "-1").toInt +} + +private[sql] object ProtobufOptions { + def apply(parameters: Map[String, String]): ProtobufOptions = { + val hadoopConf = SparkSession.getActiveSession + .map(_.sessionState.newHadoopConf()) + .getOrElse(new Configuration()) + new ProtobufOptions(CaseInsensitiveMap(parameters), hadoopConf) + } +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala new file mode 100644 index 0000000000000..bf207d6068f73 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/ProtobufUtils.scala @@ -0,0 +1,286 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License.
+ */ + +package org.apache.spark.sql.protobuf.utils + +import java.io.{BufferedInputStream, FileInputStream, IOException} +import java.util.Locale + +import scala.collection.JavaConverters._ + +import com.google.protobuf.{DescriptorProtos, Descriptors, InvalidProtocolBufferException, Message} +import com.google.protobuf.DescriptorProtos.{FileDescriptorProto, FileDescriptorSet} +import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} + +import org.apache.spark.internal.Logging +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.internal.SQLConf +import org.apache.spark.sql.types._ +import org.apache.spark.util.Utils + +private[sql] object ProtobufUtils extends Logging { + + /** Wrapper for a pair of matched fields, one Catalyst and one corresponding Protobuf field. */ + private[sql] case class ProtoMatchedField( + catalystField: StructField, + catalystPosition: Int, + fieldDescriptor: FieldDescriptor) + + /** + * Helper class to perform field lookup/matching on Protobuf schemas. + * + * This will match `descriptor` against `catalystSchema`, attempting to find a matching field in + * the Protobuf descriptor for each field in the Catalyst schema and vice-versa, respecting + * settings for case sensitivity. The match results can be accessed using the getter methods. + * + * @param descriptor + * The descriptor in which to search for fields. Must be of type Descriptor. + * @param catalystSchema + * The Catalyst schema to use for matching. + * @param protoPath + * The seq of parent field names leading to `protoSchema`. + * @param catalystPath + * The seq of parent field names leading to `catalystSchema`. + */ + class ProtoSchemaHelper( + descriptor: Descriptor, + catalystSchema: StructType, + protoPath: Seq[String], + catalystPath: Seq[String]) { + if (descriptor.getName == null) { + throw QueryCompilationErrors.unknownProtobufMessageTypeError( + descriptor.getName, + descriptor.getContainingType().getName) + } + + private[this] val protoFieldArray = descriptor.getFields.asScala.toArray + private[this] val fieldMap = descriptor.getFields.asScala + .groupBy(_.getName.toLowerCase(Locale.ROOT)) + .mapValues(_.toSeq) // toSeq needed for scala 2.13 + + /** The fields which have matching equivalents in both Protobuf and Catalyst schemas. */ + val matchedFields: Seq[ProtoMatchedField] = catalystSchema.zipWithIndex.flatMap { + case (sqlField, sqlPos) => + getFieldByName(sqlField.name).map(ProtoMatchedField(sqlField, sqlPos, _)) + } + + /** + * Validate that there are no Catalyst fields which don't have a matching Protobuf field, + * throwing [[AnalysisException]] if such extra fields are found. If `ignoreNullable` is + * false, consider nullable Catalyst fields to be eligible to be an extra field; otherwise, + * ignore nullable Catalyst fields when checking for extras. + */ + def validateNoExtraCatalystFields(ignoreNullable: Boolean): Unit = + catalystSchema.fields.foreach { sqlField => + if (getFieldByName(sqlField.name).isEmpty && + (!ignoreNullable || !sqlField.nullable)) { + throw QueryCompilationErrors.cannotFindCatalystTypeInProtobufSchemaError( + toFieldStr(catalystPath :+ sqlField.name)) + } + } + + /** + * Validate that there are no Protobuf fields which don't have a matching Catalyst field, + * throwing [[AnalysisException]] if such extra fields are found. Only required (non-nullable) + * fields are checked; nullable fields are ignored. 
+ */
+    def validateNoExtraRequiredProtoFields(): Unit = {
+      val extraFields = protoFieldArray.toSet -- matchedFields.map(_.fieldDescriptor)
+      extraFields.filter(_.isRequired).foreach { extraField =>
+        throw QueryCompilationErrors.cannotFindProtobufFieldInCatalystError(
+          toFieldStr(protoPath :+ extraField.getName()))
+      }
+    }
+
+    /**
+     * Extract a single field from the contained Protobuf schema which has the desired field name,
+     * performing the matching with proper case sensitivity according to SQLConf.resolver.
+     *
+     * @param name
+     *   The name of the field to search for.
+     * @return
+     *   `Some(match)` if a matching Protobuf field is found, otherwise `None`.
+     */
+    private[protobuf] def getFieldByName(name: String): Option[FieldDescriptor] = {
+
+      // get candidates, ignoring case of field name
+      val candidates = fieldMap.getOrElse(name.toLowerCase(Locale.ROOT), Seq.empty)
+
+      // search candidates, taking into account case sensitivity settings
+      candidates.filter(f => SQLConf.get.resolver(f.getName(), name)) match {
+        case Seq(protoField) => Some(protoField)
+        case Seq() => None
+        case matches =>
+          throw QueryCompilationErrors.protobufFieldMatchError(
+            name,
+            toFieldStr(protoPath),
+            s"${matches.size}",
+            matches.map(_.getName()).mkString("[", ", ", "]"))
+      }
+    }
+  }
+
+  /**
+   * Builds the Protobuf message descriptor either from the Java class or from the serialized
+   * descriptor read from the file.
+   * @param messageName
+   *   Protobuf message name or Java class name.
+   * @param descFilePathOpt
+   *   When the file path is set, the descriptor and its dependencies are read from the file.
+   *   Otherwise, the `messageName` is treated as a Java class name.
+   * @return
+   *   The descriptor for the given message.
+   */
+  def buildDescriptor(messageName: String, descFilePathOpt: Option[String]): Descriptor = {
+    descFilePathOpt match {
+      case Some(filePath) => buildDescriptor(descFilePath = filePath, messageName)
+      case None => buildDescriptorFromJavaClass(messageName)
+    }
+  }
+
+  /**
+   * Loads the given Protobuf class and returns the Protobuf descriptor for it.
+   */
+  def buildDescriptorFromJavaClass(protobufClassName: String): Descriptor = {
+
+    // Default 'Message' class here is shaded while using the package (as in production).
+    // The incoming classes might not be shaded. Check both.
+    val shadedMessageClass = classOf[Message] // Shaded in prod, not in unit tests.
+    val missingShadingErrorMessage = "The jar with Protobuf classes needs to be shaded " +
+      s"(com.google.protobuf.* --> ${shadedMessageClass.getPackage.getName}.*)"
+
+    val protobufClass = try {
+      Utils.classForName(protobufClassName)
+    } catch {
+      case e: ClassNotFoundException =>
+        val explanation =
+          if (protobufClassName.contains(".")) "Ensure the class is included in the jar"
+          else "Ensure the class name includes the package prefix"
+        throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, explanation, e)
+
+      case e: NoClassDefFoundError if e.getMessage.matches("com/google/proto.*Generated.*") =>
+        // This indicates the Java classes are not shaded.
+        throw QueryCompilationErrors.protobufClassLoadError(
+          protobufClassName, missingShadingErrorMessage, e)
+    }
+
+    if (!shadedMessageClass.isAssignableFrom(protobufClass)) {
+      // Check if this extends the 2.x Message class included in Spark; that does not work.
+      val unshadedMessageClass = Utils.classForName(
+        // Generate "com.google.protobuf.Message". Using join() is a trick to escape from
+        // jar shader. Otherwise, it will be replaced with 'org.sparkproject...'.
+ String.join(".", "com", "google", "protobuf", "Message") + ) + val explanation = + if (unshadedMessageClass.isAssignableFrom(protobufClass)) { + s"$protobufClassName does not extend shaded Protobuf Message class " + + s"${shadedMessageClass.getName}. $missingShadingErrorMessage" + } else s"$protobufClassName is not a Protobuf Message type" + throw QueryCompilationErrors.protobufClassLoadError(protobufClassName, explanation) + } + + // Extract the descriptor from Protobuf message. + val getDescriptorMethod = try { + protobufClass + .getDeclaredMethod("getDescriptor") + } catch { + case e: NoSuchMethodError => // This is usually not expected. + throw QueryCompilationErrors.protobufClassLoadError( + protobufClassName, "Could not find getDescriptor() method", e) + } + + getDescriptorMethod + .invoke(null) + .asInstanceOf[Descriptor] + } + + def buildDescriptor(descFilePath: String, messageName: String): Descriptor = { + // Find the first message descriptor that matches the name. + val descriptorOpt = parseFileDescriptorSet(descFilePath) + .flatMap { fileDesc => + fileDesc.getMessageTypes.asScala.find { desc => + desc.getName == messageName || desc.getFullName == messageName + } + }.headOption + + descriptorOpt match { + case Some(d) => d + case None => throw QueryCompilationErrors.unableToLocateProtobufMessageError(messageName) + } + } + + private def parseFileDescriptorSet(descFilePath: String): List[Descriptors.FileDescriptor] = { + var fileDescriptorSet: DescriptorProtos.FileDescriptorSet = null + try { + val dscFile = new BufferedInputStream(new FileInputStream(descFilePath)) + fileDescriptorSet = DescriptorProtos.FileDescriptorSet.parseFrom(dscFile) + } catch { + case ex: InvalidProtocolBufferException => + throw QueryCompilationErrors.descriptorParseError(descFilePath, ex) + case ex: IOException => + throw QueryCompilationErrors.cannotFindDescriptorFileError(descFilePath, ex) + } + try { + val fileDescriptorProtoIndex = createDescriptorProtoMap(fileDescriptorSet) + val fileDescriptorList: List[Descriptors.FileDescriptor] = + fileDescriptorSet.getFileList.asScala.map( fileDescriptorProto => + buildFileDescriptor(fileDescriptorProto, fileDescriptorProtoIndex) + ).toList + fileDescriptorList + } catch { + case e: Exception => + throw QueryCompilationErrors.failedParsingDescriptorError(descFilePath, e) + } + } + + /** + * Recursively constructs file descriptors for all dependencies for given + * FileDescriptorProto and return. + */ + private def buildFileDescriptor( + fileDescriptorProto: FileDescriptorProto, + fileDescriptorProtoMap: Map[String, FileDescriptorProto]): Descriptors.FileDescriptor = { + val fileDescriptorList = fileDescriptorProto.getDependencyList().asScala.map { dependency => + fileDescriptorProtoMap.get(dependency) match { + case Some(dependencyProto) => + buildFileDescriptor(dependencyProto, fileDescriptorProtoMap) + case None => + throw QueryCompilationErrors.protobufDescriptorDependencyError(dependency) + } + } + Descriptors.FileDescriptor.buildFrom(fileDescriptorProto, fileDescriptorList.toArray) + } + + /** + * Returns a map from descriptor proto name as found inside the descriptors to protos. 
+ */ + private def createDescriptorProtoMap( + fileDescriptorSet: FileDescriptorSet): Map[String, FileDescriptorProto] = { + fileDescriptorSet.getFileList().asScala.map { descriptorProto => + descriptorProto.getName() -> descriptorProto + }.toMap[String, FileDescriptorProto] + } + + /** + * Convert a sequence of hierarchical field names (like `Seq(foo, bar)`) into a human-readable + * string representing the field, like "field 'foo.bar'". If `names` is empty, the string + * "top-level record" is returned. + */ + private[protobuf] def toFieldStr(names: Seq[String]): String = names match { + case Seq() => "top-level record" + case n => s"field '${n.mkString(".")}'" + } +} diff --git a/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala new file mode 100644 index 0000000000000..e277f2999e434 --- /dev/null +++ b/connector/protobuf/src/main/scala/org/apache/spark/sql/protobuf/utils/SchemaConverters.scala @@ -0,0 +1,177 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.protobuf.utils + +import scala.collection.JavaConverters._ + +import com.google.protobuf.Descriptors.{Descriptor, FieldDescriptor} + +import org.apache.spark.annotation.DeveloperApi +import org.apache.spark.internal.Logging +import org.apache.spark.sql.errors.QueryCompilationErrors +import org.apache.spark.sql.types._ + +@DeveloperApi +object SchemaConverters extends Logging { + + /** + * Internal wrapper for SQL data type and nullability. + * + * @since 3.4.0 + */ + case class SchemaType(dataType: DataType, nullable: Boolean) + + /** + * Converts an Protobuf schema to a corresponding Spark SQL schema. + * + * @since 3.4.0 + */ + def toSqlType( + descriptor: Descriptor, + protobufOptions: ProtobufOptions = ProtobufOptions(Map.empty)): SchemaType = { + toSqlTypeHelper(descriptor, protobufOptions) + } + + def toSqlTypeHelper( + descriptor: Descriptor, + protobufOptions: ProtobufOptions): SchemaType = { + SchemaType( + StructType(descriptor.getFields.asScala.flatMap( + structFieldFor(_, + Map(descriptor.getFullName -> 1), + protobufOptions: ProtobufOptions)).toArray), + nullable = true) + } + + // existingRecordNames: Map[String, Int] used to track the depth of recursive fields and to + // ensure that the conversion of the protobuf message to a Spark SQL StructType object does not + // exceed the maximum recursive depth specified by the recursiveFieldMaxDepth option. + // A return of None implies the field has reached the maximum allowed recursive depth and + // should be dropped. 
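+  //
+  // Illustrative example (a sketch, the message name is only for exposition): for a
+  // self-referential message such as `message Person { string name = 1; Person bff = 2; }`,
+  // toSqlTypeHelper seeds the map with the full name of Person mapped to 1. When structFieldFor
+  // reaches the nested `bff` field it finds that entry, compares the recorded depth against
+  // recursiveFieldMaxDepth, and either descends with the depth incremented to 2 or returns None
+  // so that the field is dropped from the resulting schema.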
+  def structFieldFor(
+      fd: FieldDescriptor,
+      existingRecordNames: Map[String, Int],
+      protobufOptions: ProtobufOptions): Option[StructField] = {
+    import com.google.protobuf.Descriptors.FieldDescriptor.JavaType._
+    val dataType = fd.getJavaType match {
+      case INT => Some(IntegerType)
+      case LONG => Some(LongType)
+      case FLOAT => Some(FloatType)
+      case DOUBLE => Some(DoubleType)
+      case BOOLEAN => Some(BooleanType)
+      case STRING => Some(StringType)
+      case BYTE_STRING => Some(BinaryType)
+      case ENUM => Some(StringType)
+      case MESSAGE
+          if (fd.getMessageType.getName == "Duration" &&
+            fd.getMessageType.getFields.size() == 2 &&
+            fd.getMessageType.getFields.get(0).getName.equals("seconds") &&
+            fd.getMessageType.getFields.get(1).getName.equals("nanos")) =>
+        Some(DayTimeIntervalType.defaultConcreteType)
+      case MESSAGE
+          if (fd.getMessageType.getName == "Timestamp" &&
+            fd.getMessageType.getFields.size() == 2 &&
+            fd.getMessageType.getFields.get(0).getName.equals("seconds") &&
+            fd.getMessageType.getFields.get(1).getName.equals("nanos")) =>
+        Some(TimestampType)
+      case MESSAGE if fd.isRepeated && fd.getMessageType.getOptions.hasMapEntry =>
+        var keyType: Option[DataType] = None
+        var valueType: Option[DataType] = None
+        fd.getMessageType.getFields.forEach { field =>
+          field.getName match {
+            case "key" =>
+              keyType =
+                structFieldFor(
+                  field,
+                  existingRecordNames,
+                  protobufOptions).map(_.dataType)
+            case "value" =>
+              valueType =
+                structFieldFor(
+                  field,
+                  existingRecordNames,
+                  protobufOptions).map(_.dataType)
+          }
+        }
+        (keyType, valueType) match {
+          case (None, _) =>
+            // This is probably never expected. Protobuf does not allow complex types for keys.
+            log.info(s"Dropping map field ${fd.getFullName}. Key reached max recursive depth.")
+            None
+          case (_, None) =>
+            log.info(s"Dropping map field ${fd.getFullName}. Value reached max recursive depth.")
+            None
+          case (Some(kt), Some(vt)) => Some(MapType(kt, vt, valueContainsNull = false))
+        }
+      case MESSAGE =>
+        // If the `recursive.fields.max.depth` value is not specified, it defaults to -1,
+        // and recursive fields are not permitted. Setting it to 0 drops all recursive fields,
+        // 1 allows it to recurse once, 2 allows it to recurse twice, and so on.
+        // A value greater than 10 is not allowed, and if a protobuf record has more depth for
+        // recursive fields than the allowed value, it will be truncated and some fields may be
+        // discarded.
+        // The SQL schema for the protobuf `message Person { string name = 1; Person bff = 2;}`
+        // will vary based on the value of "recursive.fields.max.depth":
+        // 1: struct<name: string>
+        // 2: struct<name: string, bff: struct<name: string>>
+        // 3: struct<name: string, bff: struct<name: string, bff: struct<name: string>>>
+        // and so on.
+        // TODO(rangadi): A better way to terminate would be to replace the remaining recursive
+        // struct with the byte array of the corresponding protobuf, so that no information is
+        // lost. I.e. with max depth 2, the above would look like this:
+        // struct<name: string, bff: struct<name: string, bff: binary>>
+        val recordName = fd.getMessageType.getFullName
+        val recursiveDepth = existingRecordNames.getOrElse(recordName, 0)
+        val recursiveFieldMaxDepth = protobufOptions.recursiveFieldMaxDepth
+        if (existingRecordNames.contains(recordName) && (recursiveFieldMaxDepth <= 0 ||
+          recursiveFieldMaxDepth > 10)) {
+          throw QueryCompilationErrors.foundRecursionInProtobufSchema(fd.toString())
+        } else if (existingRecordNames.contains(recordName) &&
+          recursiveDepth >= recursiveFieldMaxDepth) {
+          // Recursive depth limit is reached. This field is dropped.
+          // If it is inside a container like map or array, the containing field is dropped.
+ log.info( + s"The field ${fd.getFullName} of type $recordName is dropped " + + s"at recursive depth $recursiveDepth" + ) + None + } else { + val newRecordNames = existingRecordNames + (recordName -> (recursiveDepth + 1)) + val fields = fd.getMessageType.getFields.asScala.flatMap( + structFieldFor(_, newRecordNames, protobufOptions) + ).toSeq + fields match { + case Nil => + log.info( + s"Dropping ${fd.getFullName} as it does not have any fields left " + + "likely due to recursive depth limit." + ) + None + case fds => Some(StructType(fds)) + } + } + case other => + throw QueryCompilationErrors.protobufTypeUnsupportedYetError(other.toString) + } + dataType.map { + case dt: MapType => StructField(fd.getName, dt) + case dt if fd.isRepeated => + StructField(fd.getName, ArrayType(dt, containsNull = false)) + case dt => StructField(fd.getName, dt, nullable = !fd.isRequired) + } + } +} diff --git a/connector/protobuf/src/test/resources/log4j2.properties b/connector/protobuf/src/test/resources/log4j2.properties new file mode 100644 index 0000000000000..ab02104c69697 --- /dev/null +++ b/connector/protobuf/src/test/resources/log4j2.properties @@ -0,0 +1,39 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +# Set everything to be logged to the file target/unit-tests.log +rootLogger.level = info +rootLogger.appenderRef.file.ref = ${sys:test.appender:-File} + +appender.file.type = File +appender.file.name = File +appender.file.fileName = target/unit-tests.log +appender.file.layout.type = PatternLayout +appender.file.layout.pattern = %d{yy/MM/dd HH:mm:ss.SSS} %t %p %c{1}: %m%n%ex + +# Tests that launch java subprocesses can set the "test.appender" system property to +# "console" to avoid having the child process's logs overwrite the unit test's +# log file. +appender.console.type = Console +appender.console.name = console +appender.console.target = SYSTEM_ERR +appender.console.layout.type = PatternLayout +appender.console.layout.pattern = %t: %m%n%ex + +# Ignore messages below warning level from Jetty, because it's a bit verbose +logger.jetty.name = org.sparkproject.jetty +logger.jetty.level = warn diff --git a/connector/protobuf/src/test/resources/protobuf/basicmessage.proto b/connector/protobuf/src/test/resources/protobuf/basicmessage.proto new file mode 100644 index 0000000000000..8f4c1bb8eae42 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/basicmessage.proto @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// cd connector/protobuf/src/test/resources/protobuf +// protoc --java_out=./ basicmessage.proto +// protoc --include_imports --descriptor_set_out=basicmessage.desc --java_out=org/apache/spark/sql/protobuf/ basicmessage.proto +// protoc --descriptor_set_out=basicmessage_noimports.desc --java_out=org/apache/spark/sql/protobuf/ basicmessage.proto + +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; + +import "nestedenum.proto"; + +option java_outer_classname = "BasicMessageProto"; + +message BasicMessage { + int64 id = 1; + string string_value = 2; + int32 int32_value = 3; + int64 int64_value = 4; + double double_value = 5; + float float_value = 6; + bool bool_value = 7; + bytes bytes_value = 8; + NestedEnum rnested_enum = 9; +} diff --git a/connector/protobuf/src/test/resources/protobuf/basicmessage_noimports.desc b/connector/protobuf/src/test/resources/protobuf/basicmessage_noimports.desc new file mode 100644 index 0000000000000..26ba6552cb01d --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/basicmessage_noimports.desc @@ -0,0 +1,18 @@ + +� +basicmessage.proto$org.apache.spark.sql.protobuf.protosnestedenum.proto"� + BasicMessage +id (Rid! + string_value ( R stringValue + int32_value (R +int32Value + int64_value (R +int64Value! + double_value (R doubleValue + float_value (R +floatValue + +bool_value (R boolValue + bytes_value ( R +bytesValueS + rnested_enum (20.org.apache.spark.sql.protobuf.protos.NestedEnumR rnestedEnumBBBasicMessageProtobproto3 \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/catalyst_types.desc b/connector/protobuf/src/test/resources/protobuf/catalyst_types.desc new file mode 100644 index 0000000000000..59255b488a03d --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/catalyst_types.desc @@ -0,0 +1,48 @@ + +� +Cconnector/protobuf/src/test/resources/protobuf/catalyst_types.protoorg.apache.spark.sql.protobuf") + +BooleanMsg + bool_type (RboolType"+ + +IntegerMsg + +int32_type (R int32Type", + DoubleMsg + double_type (R +doubleType") +FloatMsg + +float_type (R floatType") +BytesMsg + +bytes_type ( R bytesType", + StringMsg + string_type ( R +stringType". 
+Person +name ( Rname +age (Rage"n +Bad +col_0 ( Rcol0 +col_1 (Rcol1 +col_2 ( Rcol2 +col_3 (Rcol3 +col_4 (Rcol4"q +Actual +col_0 ( Rcol0 +col_1 (Rcol1 +col_2 (Rcol2 +col_3 (Rcol3 +col_4 (Rcol4" + oldConsumer +key ( Rkey"5 + newProducer +key ( Rkey +value (Rvalue"t + newConsumer +key ( Rkey +value (Rvalue= +actual ( 2%.org.apache.spark.sql.protobuf.ActualRactual" + oldProducer +key ( RkeyBB CatalystTypesbproto3 \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/catalyst_types.proto b/connector/protobuf/src/test/resources/protobuf/catalyst_types.proto new file mode 100644 index 0000000000000..0732de1085895 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/catalyst_types.proto @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +// protoc --java_out=connector/protobuf/src/test/resources/protobuf/ connector/protobuf/src/test/resources/protobuf/catalyst_types.proto +// protoc --descriptor_set_out=connector/protobuf/src/test/resources/protobuf/catalyst_types.desc --java_out=connector/protobuf/src/test/resources/protobuf/org/apache/spark/sql/protobuf/ connector/protobuf/src/test/resources/protobuf/catalyst_types.proto + +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; +option java_outer_classname = "CatalystTypes"; + +message BooleanMsg { + bool bool_type = 1; +} +message IntegerMsg { + int32 int32_type = 1; +} +message DoubleMsg { + double double_type = 1; +} +message FloatMsg { + float float_type = 1; +} +message BytesMsg { + bytes bytes_type = 1; +} +message StringMsg { + string string_type = 1; +} + +message Person { + string name = 1; + int32 age = 2; +} + +message Bad { + bytes col_0 = 1; + double col_1 = 2; + string col_2 = 3; + float col_3 = 4; + int64 col_4 = 5; +} + +message Actual { + string col_0 = 1; + int32 col_1 = 2; + float col_2 = 3; + bool col_3 = 4; + double col_4 = 5; +} + +message oldConsumer { + string key = 1; +} + +message newProducer { + string key = 1; + int32 value = 2; +} + +message newConsumer { + string key = 1; + int32 value = 2; + Actual actual = 3; +} + +message oldProducer { + string key = 1; +} \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/duration.proto b/connector/protobuf/src/test/resources/protobuf/duration.proto new file mode 100644 index 0000000000000..2e89a8db5b7be --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/duration.proto @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; + +option java_outer_classname = "DurationProto"; + +message Duration { + int64 seconds = 1; + int32 nanos = 2; +} diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.desc b/connector/protobuf/src/test/resources/protobuf/functions_suite.desc new file mode 100644 index 0000000000000..467b9cac969ba Binary files /dev/null and b/connector/protobuf/src/test/resources/protobuf/functions_suite.desc differ diff --git a/connector/protobuf/src/test/resources/protobuf/functions_suite.proto b/connector/protobuf/src/test/resources/protobuf/functions_suite.proto new file mode 100644 index 0000000000000..d83ba6a4f6e29 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/functions_suite.proto @@ -0,0 +1,285 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// To compile and create test class: +// cd connector/protobuf/src/test/resources/protobuf +// protoc --java_out=./ functions_suite.proto +// protoc --include_imports --descriptor_set_out=functions_suite.desc --java_out=org/apache/spark/sql/protobuf/ functions_suite.proto + +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; + +import "timestamp.proto"; +import "duration.proto"; +import "basicmessage.proto"; + +option java_outer_classname = "SimpleMessageProtos"; + +message SimpleMessageJavaTypes { + int64 id = 1; + string string_value = 2; + int32 int32_value = 3; + int64 int64_value = 4; + double double_value = 5; + float float_value = 6; + bool bool_value = 7; + bytes bytes_value = 8; +} + +message SimpleMessage { + int64 id = 1; + string string_value = 2; + int32 int32_value = 3; + uint32 uint32_value = 4; + sint32 sint32_value = 5; + fixed32 fixed32_value = 6; + sfixed32 sfixed32_value = 7; + int64 int64_value = 8; + uint64 uint64_value = 9; + sint64 sint64_value = 10; + fixed64 fixed64_value = 11; + sfixed64 sfixed64_value = 12; + double double_value = 13; + float float_value = 14; + bool bool_value = 15; + bytes bytes_value = 16; +} + +message SimpleMessageRepeated { + string key = 1; + string value = 2; + enum NestedEnum { + NESTED_NOTHING = 0; + NESTED_FIRST = 1; + NESTED_SECOND = 2; + } + repeated string rstring_value = 3; + repeated int32 rint32_value = 4; + repeated bool rbool_value = 5; + repeated int64 rint64_value = 6; + repeated float rfloat_value = 7; + repeated double rdouble_value = 8; + repeated bytes rbytes_value = 9; + repeated NestedEnum rnested_enum = 10; +} + +message RepeatedMessage { + repeated BasicMessage basic_message = 1; +} + +message SimpleMessageMap { + string key = 1; + string value = 2; + map string_mapdata = 3; + map int32_mapdata = 4; + map uint32_mapdata = 5; + map sint32_mapdata = 6; + map float32_mapdata = 7; + map sfixed32_mapdata = 8; + map int64_mapdata = 9; + map uint64_mapdata = 10; + map sint64_mapdata = 11; + map fixed64_mapdata = 12; + map sfixed64_mapdata = 13; + map double_mapdata = 14; + map float_mapdata = 15; + map bool_mapdata = 16; + map bytes_mapdata = 17; +} + +message BasicEnumMessage { + enum BasicEnum { + NOTHING = 0; + FIRST = 1; + SECOND = 2; + } +} + +message SimpleMessageEnum { + string key = 1; + string value = 2; + enum NestedEnum { + NESTED_NOTHING = 0; + NESTED_FIRST = 1; + NESTED_SECOND = 2; + } + BasicEnumMessage.BasicEnum basic_enum = 3; + NestedEnum nested_enum = 4; +} + + +message OtherExample { + string other = 1; +} + +message IncludedExample { + string included = 1; + OtherExample other = 2; +} + +message MultipleExample { + IncludedExample included_example = 1; +} + +message recursiveA { + string keyA = 1; + recursiveB messageB = 2; +} + +message recursiveB { + string keyB = 1; + recursiveA messageA = 2; +} + +message recursiveC { + string keyC = 1; + recursiveD messageD = 2; +} + +message recursiveD { + string keyD = 1; + repeated recursiveC messageC = 2; +} + +message requiredMsg { + string key = 1; + int32 col_1 = 2; + string col_2 = 3; + int32 col_3 = 4; +} + +message timeStampMsg { + string key = 1; + Timestamp stmp = 2; +} + +message durationMsg { + string key = 1; + Duration duration = 2; +} + +message OneOfEvent { + string key = 1; + oneof payload { + int32 col_1 = 2; + string col_2 = 3; + int64 col_3 = 4; + } + repeated string col_4 = 5; +} + +message EventWithRecursion { + int32 key = 1; + messageA a = 2; +} +message messageA { + EventWithRecursion a = 1; + messageB b = 2; +} 
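+// Note: messageA above and messageB/messageC below, together with EventWithRecursion, form a
+// mutually recursive chain (EventWithRecursion -> messageA -> messageB -> messageC -> ...).
+// They are presumably exercised with the `recursive.fields.max.depth` option, which truncates
+// the generated Catalyst schema once the configured depth is reached.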
+message messageB { + EventWithRecursion aa = 1; + messageC c = 2; +} +message messageC { + EventWithRecursion aaa = 1; + int32 key= 2; +} + +message Employee { + string firstName = 1; + string lastName = 2; + oneof role { + IC ic = 3; + EM em = 4; + EM2 em2 = 5; + } +} + +message IC { + repeated string skills = 1; + Employee icManager = 2; +} + +message EM { + int64 teamsize = 1; + Employee emManager = 2; +} + +message EM2 { + int64 teamsize = 1; + Employee em2Manager = 2; +} + +message EventPerson { // Used for simple recursive field testing. + string name = 1; + EventPerson bff = 2; +} + +message EventPersonWrapper { + EventPerson person = 1; +} + +message PersonWithRecursiveArray { + // A protobuf with recursive repeated field + string name = 1; + repeated PersonWithRecursiveArray friends = 2; +} + +message PersonWithRecursiveMap { + // A protobuf with recursive field in value + string name = 1; + map groups = 3; +} + + +message OneOfEventWithRecursion { + string key = 1; + oneof payload { + EventRecursiveA recursiveA = 3; + EventRecursiveB recursiveB = 6; + } + string value = 7; +} + +message EventRecursiveA { + OneOfEventWithRecursion recursiveOneOffInA = 1; + string key = 2; +} + +message EventRecursiveB { + string key = 1; + string value = 2; + OneOfEventWithRecursion recursiveOneOffInB = 3; +} + +message EmptyRecursiveProto { + // This is a recursive proto with no fields. Used to test edge. Catalyst schema for this + // should be "nothing" (i.e. completely dropped) irrespective of recursive limit. + EmptyRecursiveProto recursive_field = 1; + repeated EmptyRecursiveProto recursive_array = 2; +} + +message EmptyRecursiveProtoWrapper { + string name = 1; + EmptyRecursiveProto empty_recursive = 2; // This field will be dropped. +} + +message Status { + int32 id = 1; + Timestamp trade_time = 2; + Status status = 3; +} diff --git a/connector/protobuf/src/test/resources/protobuf/nestedenum.proto b/connector/protobuf/src/test/resources/protobuf/nestedenum.proto new file mode 100644 index 0000000000000..20e9005bec0a7 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/nestedenum.proto @@ -0,0 +1,28 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; + +option java_outer_classname = "NestedEnumProto"; + +enum NestedEnum { + NESTED_NOTHING = 0; + NESTED_FIRST = 1; + NESTED_SECOND = 2; +} diff --git a/connector/protobuf/src/test/resources/protobuf/proto2_messages.desc b/connector/protobuf/src/test/resources/protobuf/proto2_messages.desc new file mode 100644 index 0000000000000..a9e4099a7f2b5 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/proto2_messages.desc @@ -0,0 +1,8 @@ + +� +proto2_messages.proto$org.apache.spark.sql.protobuf.protos"@ +FoobarWithRequiredFieldBar +foo ( Rfoo +bar (Rbar"� + NestedFoobarWithRequiredFieldBare + nested_foobar ( 2@.org.apache.spark.sql.protobuf.protos.FoobarWithRequiredFieldBarR nestedFoobarBBProto2Messages \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/proto2_messages.proto b/connector/protobuf/src/test/resources/protobuf/proto2_messages.proto new file mode 100644 index 0000000000000..a5d09df8514e0 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/proto2_messages.proto @@ -0,0 +1,33 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto2"; + +package org.apache.spark.sql.protobuf.protos; +option java_outer_classname = "Proto2Messages"; + + +// Used to test missing required field bar in top level schema. +message FoobarWithRequiredFieldBar { + optional string foo = 1; + required int32 bar = 2; +} + +// Used to test missing required field bar in nested struct. +message NestedFoobarWithRequiredFieldBar { + optional FoobarWithRequiredFieldBar nested_foobar = 1; +} diff --git a/connector/protobuf/src/test/resources/protobuf/pyspark_test.proto b/connector/protobuf/src/test/resources/protobuf/pyspark_test.proto new file mode 100644 index 0000000000000..8750371349a06 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/pyspark_test.proto @@ -0,0 +1,32 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// TODO(SPARK-40777): Instead of saving .desc files in resources, generate during build. +// To compile and create test class: +// protoc --java_out=connector/protobuf/src/test/resources/protobuf/ connector/protobuf/src/test/resources/protobuf/pyspark_test.proto +// protoc --descriptor_set_out=connector/protobuf/src/test/resources/protobuf/pyspark_test.desc --java_out=connector/protobuf/src/test/resources/protobuf/org/apache/spark/sql/protobuf/ connector/protobuf/src/test/resources/protobuf/pyspark_test.proto + +syntax = "proto3"; + +package org.apache.spark.sql.protobuf; +option java_outer_classname = "SimpleMessageProtos"; + + +message SimpleMessage { + int32 age = 1; + string name = 2; + int64 score = 3; +} \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/serde_suite.desc b/connector/protobuf/src/test/resources/protobuf/serde_suite.desc new file mode 100644 index 0000000000000..3d1847eecc5c3 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/serde_suite.desc @@ -0,0 +1,27 @@ + +� +Fconnector/protobuf/src/test/resources/protobuf/proto_serde_suite.protoorg.apache.spark.sql.protobuf"D + BasicMessage4 +foo ( 2".org.apache.spark.sql.protobuf.FooRfoo" +Foo +bar (Rbar"' +MissMatchTypeInRoot +foo (Rfoo"T +FieldMissingInProto= +foo ( 2+.org.apache.spark.sql.protobuf.MissingFieldRfoo"& + MissingField +barFoo (RbarFoo"\ +MissMatchTypeInDeepNested? +top ( 2-.org.apache.spark.sql.protobuf.TypeMissNestedRtop"K +TypeMissNested9 +foo ( 2'.org.apache.spark.sql.protobuf.TypeMissRfoo" +TypeMiss +bar (Rbar"_ +FieldMissingInSQLRoot4 +foo ( 2".org.apache.spark.sql.protobuf.FooRfoo +boo (Rboo"O +FieldMissingInSQLNested4 +foo ( 2".org.apache.spark.sql.protobuf.BazRfoo") +Baz +bar (Rbar +baz (RbazBBSimpleMessageProtosbproto3 \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/serde_suite.proto b/connector/protobuf/src/test/resources/protobuf/serde_suite.proto new file mode 100644 index 0000000000000..87152b035b015 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/serde_suite.proto @@ -0,0 +1,65 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +// To compile and create test class: +// protoc --java_out=connector/protobuf/src/test/resources/protobuf/ connector/protobuf/src/test/resources/protobuf/serde_suite.proto +// protoc --descriptor_set_out=connector/protobuf/src/test/resources/protobuf/serde_suite.desc --java_out=connector/protobuf/src/test/resources/protobuf/org/apache/spark/sql/protobuf/ connector/protobuf/src/test/resources/protobuf/serde_suite.proto + +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; +option java_outer_classname = "SerdeSuiteProtos"; + +/* Clean Message*/ +message SerdeBasicMessage { + Foo foo = 1; +} + +message Foo { + int32 bar = 1; +} + +/* Field Type missMatch in root Message*/ +message MissMatchTypeInRoot { + int64 foo = 1; +} + +/* Field bar missing from protobuf and Available in SQL*/ +message FieldMissingInProto { + MissingField foo = 1; +} + +message MissingField { + int64 barFoo = 1; +} + +/* Deep-nested field bar type missMatch Message*/ +message MissMatchTypeInDeepNested { + TypeMissNested top = 1; +} + +message TypeMissNested { + TypeMiss foo = 1; +} + +message TypeMiss { + int64 bar = 1; +} + +message Baz { + int32 bar = 1; + int32 baz = 2; +} \ No newline at end of file diff --git a/connector/protobuf/src/test/resources/protobuf/timestamp.proto b/connector/protobuf/src/test/resources/protobuf/timestamp.proto new file mode 100644 index 0000000000000..7616cc2ccfc15 --- /dev/null +++ b/connector/protobuf/src/test/resources/protobuf/timestamp.proto @@ -0,0 +1,26 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +syntax = "proto3"; + +package org.apache.spark.sql.protobuf.protos; + +option java_outer_classname = "TimestampProto"; + +message Timestamp { + int64 seconds = 1; + int32 nanos = 2; +} diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufCatalystDataConversionSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufCatalystDataConversionSuite.scala new file mode 100644 index 0000000000000..3e9273835e3d4 --- /dev/null +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufCatalystDataConversionSuite.scala @@ -0,0 +1,253 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.protobuf + +import com.google.protobuf.{ByteString, DynamicMessage, Message} + +import org.apache.spark.SparkFunSuite +import org.apache.spark.sql.{RandomDataGenerator, Row} +import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, NoopFilters, OrderedFilters, StructFilters} +import org.apache.spark.sql.catalyst.expressions.{ExpressionEvalHelper, GenericInternalRow, Literal} +import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData, MapData} +import org.apache.spark.sql.protobuf.protos.CatalystTypes.BytesMsg +import org.apache.spark.sql.protobuf.utils.{ProtobufUtils, SchemaConverters} +import org.apache.spark.sql.sources.{EqualTo, Not} +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ +import org.apache.spark.unsafe.types.UTF8String + +class ProtobufCatalystDataConversionSuite + extends SparkFunSuite + with SharedSparkSession + with ExpressionEvalHelper + with ProtobufTestBase { + + private val testFileDesc = testFile("catalyst_types.desc", "protobuf/catalyst_types.desc") + private val javaClassNamePrefix = "org.apache.spark.sql.protobuf.protos.CatalystTypes$" + + private def checkResultWithEval( + data: Literal, + descFilePath: String, + messageName: String, + expected: Any): Unit = { + + withClue("(Eval check with Java class name)") { + val className = s"$javaClassNamePrefix$messageName" + checkEvaluation( + ProtobufDataToCatalyst( + CatalystDataToProtobuf(data, className), + className, + descFilePath = None), + prepareExpectedResult(expected)) + } + withClue("(Eval check with descriptor file)") { + checkEvaluation( + ProtobufDataToCatalyst( + CatalystDataToProtobuf(data, messageName, Some(descFilePath)), + messageName, + descFilePath = Some(descFilePath)), + prepareExpectedResult(expected)) + } + } + + protected def checkUnsupportedRead( + data: Literal, + descFilePath: String, + actualSchema: String, + badSchema: String): Unit = { + + val binary = CatalystDataToProtobuf(data, actualSchema, Some(descFilePath)) + + intercept[Exception] { + ProtobufDataToCatalyst(binary, badSchema, Some(descFilePath), Map("mode" -> "FAILFAST")) + .eval() + } + + val expected = { + val expectedSchema = ProtobufUtils.buildDescriptor(descFilePath, badSchema) + SchemaConverters.toSqlType(expectedSchema).dataType match { + case st: StructType => + Row.fromSeq((0 until st.length).map { _ => + null + }) + case _ => null + } + } + + checkEvaluation( + ProtobufDataToCatalyst(binary, badSchema, Some(descFilePath), Map("mode" -> "PERMISSIVE")), + expected) + } + + protected def prepareExpectedResult(expected: Any): Any = expected match { + // Spark byte and short both map to Protobuf int + case b: Byte => b.toInt + case s: Short => s.toInt + case row: GenericInternalRow => InternalRow.fromSeq(row.values.map(prepareExpectedResult)) + case array: GenericArrayData => new GenericArrayData(array.array.map(prepareExpectedResult)) + case map: MapData => + val keys = new GenericArrayData( + map.keyArray().asInstanceOf[GenericArrayData].array.map(prepareExpectedResult)) + val values = new 
GenericArrayData( + map.valueArray().asInstanceOf[GenericArrayData].array.map(prepareExpectedResult)) + new ArrayBasedMapData(keys, values) + case other => other + } + + private val testingTypes = Seq( + StructType(StructField("int32_type", IntegerType, nullable = true) :: Nil), + StructType(StructField("double_type", DoubleType, nullable = true) :: Nil), + StructType(StructField("float_type", FloatType, nullable = true) :: Nil), + StructType(StructField("bytes_type", BinaryType, nullable = true) :: Nil), + StructType(StructField("string_type", StringType, nullable = true) :: Nil)) + + private val catalystTypesToProtoMessages: Map[DataType, (String, Any)] = Map( + (IntegerType, ("IntegerMsg", 0)), // Don't use '->', it causes a scala warning. + (DoubleType, ("DoubleMsg", 0.0d)), + (FloatType, ("FloatMsg", 0.0f)), + (BinaryType, ("BytesMsg", ByteString.empty().toByteArray)), + (StringType, ("StringMsg", ""))) + + testingTypes.foreach { dt => + val seed = scala.util.Random.nextInt(RandomDataGenerator.MAX_STR_LEN) + test(s"single $dt with seed $seed") { + + val (messageName, defaultValue) = catalystTypesToProtoMessages(dt.fields(0).dataType) + + val rand = new scala.util.Random(seed) + val generator = RandomDataGenerator.forType(dt, rand = rand).get + var data = generator().asInstanceOf[Row] + // Do not use default values, since from_protobuf() returns null in v3. + while ( + data != null && + (data.get(0) == defaultValue || + (dt == BinaryType && + data.get(0).asInstanceOf[Array[Byte]].isEmpty))) + data = generator().asInstanceOf[Row] + + val converter = CatalystTypeConverters.createToCatalystConverter(dt) + val input = Literal.create(converter(data), dt) + + checkResultWithEval( + input, + testFileDesc, + messageName, + input.eval()) + } + } + + private def checkDeserialization( + descFilePath: String, + messageName: String, + data: Message, + expected: Option[Any], + filters: StructFilters = new NoopFilters): Unit = { + + val descriptor = ProtobufUtils.buildDescriptor(descFilePath, messageName) + val dataType = SchemaConverters.toSqlType(descriptor).dataType + + val deserializer = new ProtobufDeserializer(descriptor, dataType, filters) + + val dynMsg = DynamicMessage.parseFrom(descriptor, data.toByteArray) + val deserialized = deserializer.deserialize(dynMsg) + + // Verify Java class deserializer matches with descriptor based serializer. 
+ val javaDescriptor = ProtobufUtils + .buildDescriptorFromJavaClass(s"$javaClassNamePrefix$messageName") + assert(dataType == SchemaConverters.toSqlType(javaDescriptor).dataType) + val javaDeserialized = new ProtobufDeserializer(javaDescriptor, dataType, filters) + .deserialize(DynamicMessage.parseFrom(javaDescriptor, data.toByteArray)) + assert(deserialized == javaDeserialized) + + expected match { + case None => assert(deserialized.isEmpty) + case Some(d) => + assert(checkResult(d, deserialized.get, dataType, exprNullable = false)) + } + } + + test("Handle unsupported input of message type") { + val actualSchema = StructType( + Seq( + StructField("col_0", StringType, nullable = false), + StructField("col_1", IntegerType, nullable = false), + StructField("col_2", FloatType, nullable = false), + StructField("col_3", BooleanType, nullable = false), + StructField("col_4", DoubleType, nullable = false))) + + val seed = scala.util.Random.nextLong() + withClue(s"create random record with seed $seed") { + val data = RandomDataGenerator.randomRow(new scala.util.Random(seed), actualSchema) + val converter = CatalystTypeConverters.createToCatalystConverter(actualSchema) + val input = Literal.create(converter(data), actualSchema) + checkUnsupportedRead(input, testFileDesc, "Actual", "Bad") + } + } + + test("filter push-down to Protobuf deserializer") { + + val sqlSchema = new StructType() + .add("name", "string") + .add("age", "int") + + val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "Person") + val dynamicMessage = DynamicMessage + .newBuilder(descriptor) + .setField(descriptor.findFieldByName("name"), "Maxim") + .setField(descriptor.findFieldByName("age"), 39) + .build() + + val expectedRow = Some(InternalRow(UTF8String.fromString("Maxim"), 39)) + checkDeserialization(testFileDesc, "Person", dynamicMessage, expectedRow) + checkDeserialization( + testFileDesc, + "Person", + dynamicMessage, + expectedRow, + new OrderedFilters(Seq(EqualTo("age", 39)), sqlSchema)) + + checkDeserialization( + testFileDesc, + "Person", + dynamicMessage, + None, + new OrderedFilters(Seq(Not(EqualTo("name", "Maxim"))), sqlSchema)) + } + + test("ProtobufDeserializer with binary type") { + + val bb = java.nio.ByteBuffer.wrap(Array[Byte](97, 48, 53)) + + val bytesProto = BytesMsg + .newBuilder() + .setBytesType(ByteString.copyFrom(bb)) + .build() + + val expected = InternalRow(Array[Byte](97, 48, 53)) + checkDeserialization(testFileDesc, "BytesMsg", bytesProto, Some(expected)) + } + + test("Full names for message using descriptor file") { + val withShortName = ProtobufUtils.buildDescriptor(testFileDesc, "BytesMsg") + assert(withShortName.findFieldByName("bytes_type") != null) + + val withFullName = ProtobufUtils.buildDescriptor( + testFileDesc, "org.apache.spark.sql.protobuf.BytesMsg") + assert(withFullName.findFieldByName("bytes_type") != null) + } +} diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala new file mode 100644 index 0000000000000..92c3c27bfaed5 --- /dev/null +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufFunctionsSuite.scala @@ -0,0 +1,1119 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. 
+ * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.spark.sql.protobuf + +import java.sql.Timestamp +import java.time.Duration + + import scala.collection.JavaConverters._ + +import com.google.protobuf.{ByteString, DynamicMessage} + +import org.apache.spark.sql.{AnalysisException, Column, DataFrame, QueryTest, Row} +import org.apache.spark.sql.functions.{lit, struct} +import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos._ +import org.apache.spark.sql.protobuf.protos.SimpleMessageProtos.SimpleMessageRepeated.NestedEnum +import org.apache.spark.sql.protobuf.utils.ProtobufUtils +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types._ + +class ProtobufFunctionsSuite extends QueryTest with SharedSparkSession with ProtobufTestBase + with Serializable { + + import testImplicits._ + + val testFileDesc = testFile("functions_suite.desc", "protobuf/functions_suite.desc") + private val javaClassNamePrefix = "org.apache.spark.sql.protobuf.protos.SimpleMessageProtos$" + + private def emptyBinaryDF = Seq(Array[Byte]()).toDF("binary") + + /** + * Runs the given closure twice. Once with descriptor file and second time with Java class name. + */ + private def checkWithFileAndClassName(messageName: String)( + fn: (String, Option[String]) => Unit): Unit = { + withClue("(With descriptor file)") { + fn(messageName, Some(testFileDesc)) + } + withClue("(With Java class name)") { + fn(s"$javaClassNamePrefix$messageName", None) + } + } + + // A wrapper to invoke the right variable of from_protobuf() depending on arguments. + private def from_protobuf_wrapper( + col: Column, + messageName: String, + descFilePathOpt: Option[String], + options: Map[String, String] = Map.empty): Column = { + descFilePathOpt match { + case Some(descFilePath) => functions.from_protobuf( + col, messageName, descFilePath, options.asJava + ) + case None => functions.from_protobuf(col, messageName, options.asJava) + } + } + + // A wrapper to invoke the right variable of to_protobuf() depending on arguments. 
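+  //
+  // Illustrative usage (a sketch; the column name is a placeholder, not taken from the tests):
+  //   to_protobuf_wrapper($"event", "BasicMessage", Some(testFileDesc))
+  //   to_protobuf_wrapper($"event", s"${javaClassNamePrefix}BasicMessage", None)
+  // which dispatch to functions.to_protobuf(col, messageName, descFilePath) and
+  // functions.to_protobuf(col, messageName) respectively.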
+ private def to_protobuf_wrapper( + col: Column, messageName: String, descFilePathOpt: Option[String]): Column = { + descFilePathOpt match { + case Some(descFilePath) => functions.to_protobuf(col, messageName, descFilePath) + case None => functions.to_protobuf(col, messageName) + } + } + + test("roundtrip in to_protobuf and from_protobuf - struct") { + val df = spark + .range(1, 10) + .select(struct( + $"id", + $"id".cast("string").as("string_value"), + $"id".cast("int").as("int32_value"), + $"id".cast("int").as("uint32_value"), + $"id".cast("int").as("sint32_value"), + $"id".cast("int").as("fixed32_value"), + $"id".cast("int").as("sfixed32_value"), + $"id".cast("long").as("int64_value"), + $"id".cast("long").as("uint64_value"), + $"id".cast("long").as("sint64_value"), + $"id".cast("long").as("fixed64_value"), + $"id".cast("long").as("sfixed64_value"), + $"id".cast("double").as("double_value"), + lit(1202.00).cast(org.apache.spark.sql.types.FloatType).as("float_value"), + lit(true).as("bool_value"), + lit("0".getBytes).as("bytes_value")).as("SimpleMessage")) + + checkWithFileAndClassName("SimpleMessage") { + case (name, descFilePathOpt) => + val protoStructDF = df.select( + to_protobuf_wrapper($"SimpleMessage", name, descFilePathOpt).as("proto")) + val actualDf = protoStructDF.select( + from_protobuf_wrapper($"proto", name, descFilePathOpt).as("proto.*")) + checkAnswer(actualDf, df) + } + } + + test("roundtrip in from_protobuf and to_protobuf - Repeated") { + + val protoMessage = SimpleMessageRepeated + .newBuilder() + .setKey("key") + .setValue("value") + .addRboolValue(false) + .addRboolValue(true) + .addRdoubleValue(1092092.654d) + .addRdoubleValue(1092093.654d) + .addRfloatValue(10903.0f) + .addRfloatValue(10902.0f) + .addRnestedEnum(NestedEnum.NESTED_NOTHING) + .addRnestedEnum(NestedEnum.NESTED_FIRST) + .build() + + val df = Seq(protoMessage.toByteArray).toDF("value") + + checkWithFileAndClassName("SimpleMessageRepeated") { + case (name, descFilePathOpt) => + val fromProtoDF = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt).as("value_from")) + val toProtoDF = fromProtoDF.select( + to_protobuf_wrapper($"value_from", name, descFilePathOpt).as("value_to")) + val toFromProtoDF = toProtoDF.select( + from_protobuf_wrapper($"value_to", name, descFilePathOpt).as("value_to_from")) + checkAnswer(fromProtoDF.select($"value_from.*"), toFromProtoDF.select($"value_to_from.*")) + } + } + + test("roundtrip in from_protobuf and to_protobuf - Repeated Message Once") { + val repeatedMessageDesc = ProtobufUtils.buildDescriptor(testFileDesc, "RepeatedMessage") + val basicMessageDesc = ProtobufUtils.buildDescriptor(testFileDesc, "BasicMessage") + + val basicMessage = DynamicMessage + .newBuilder(basicMessageDesc) + .setField(basicMessageDesc.findFieldByName("id"), 1111L) + .setField(basicMessageDesc.findFieldByName("string_value"), "value") + .setField(basicMessageDesc.findFieldByName("int32_value"), 12345) + .setField(basicMessageDesc.findFieldByName("int64_value"), 0x90000000000L) + .setField(basicMessageDesc.findFieldByName("double_value"), 10000000000.0d) + .setField(basicMessageDesc.findFieldByName("float_value"), 10902.0f) + .setField(basicMessageDesc.findFieldByName("bool_value"), true) + .setField( + basicMessageDesc.findFieldByName("bytes_value"), + ByteString.copyFromUtf8("ProtobufDeserializer")) + .build() + + val dynamicMessage = DynamicMessage + .newBuilder(repeatedMessageDesc) + .addRepeatedField(repeatedMessageDesc.findFieldByName("basic_message"), basicMessage) + 
.build() + + val df = Seq(dynamicMessage.toByteArray).toDF("value") + + checkWithFileAndClassName("RepeatedMessage") { + case (name, descFilePathOpt) => + val fromProtoDF = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt).as("value_from")) + val toProtoDF = fromProtoDF.select( + to_protobuf_wrapper($"value_from", name, descFilePathOpt).as("value_to")) + val toFromProtoDF = toProtoDF.select( + from_protobuf_wrapper($"value_to", name, descFilePathOpt).as("value_to_from")) + checkAnswer(fromProtoDF.select($"value_from.*"), toFromProtoDF.select($"value_to_from.*")) + } + } + + test("roundtrip in from_protobuf and to_protobuf - Repeated Message Twice") { + val repeatedMessageDesc = ProtobufUtils.buildDescriptor(testFileDesc, "RepeatedMessage") + val basicMessageDesc = ProtobufUtils.buildDescriptor(testFileDesc, "BasicMessage") + + val basicMessage1 = DynamicMessage + .newBuilder(basicMessageDesc) + .setField(basicMessageDesc.findFieldByName("id"), 1111L) + .setField(basicMessageDesc.findFieldByName("string_value"), "value1") + .setField(basicMessageDesc.findFieldByName("int32_value"), 12345) + .setField(basicMessageDesc.findFieldByName("int64_value"), 0x90000000000L) + .setField(basicMessageDesc.findFieldByName("double_value"), 10000000000.0d) + .setField(basicMessageDesc.findFieldByName("float_value"), 10902.0f) + .setField(basicMessageDesc.findFieldByName("bool_value"), true) + .setField( + basicMessageDesc.findFieldByName("bytes_value"), + ByteString.copyFromUtf8("ProtobufDeserializer1")) + .build() + val basicMessage2 = DynamicMessage + .newBuilder(basicMessageDesc) + .setField(basicMessageDesc.findFieldByName("id"), 1112L) + .setField(basicMessageDesc.findFieldByName("string_value"), "value2") + .setField(basicMessageDesc.findFieldByName("int32_value"), 12346) + .setField(basicMessageDesc.findFieldByName("int64_value"), 0x90000000000L) + .setField(basicMessageDesc.findFieldByName("double_value"), 10000000000.0d) + .setField(basicMessageDesc.findFieldByName("float_value"), 10903.0f) + .setField(basicMessageDesc.findFieldByName("bool_value"), false) + .setField( + basicMessageDesc.findFieldByName("bytes_value"), + ByteString.copyFromUtf8("ProtobufDeserializer2")) + .build() + + val dynamicMessage = DynamicMessage + .newBuilder(repeatedMessageDesc) + .addRepeatedField(repeatedMessageDesc.findFieldByName("basic_message"), basicMessage1) + .addRepeatedField(repeatedMessageDesc.findFieldByName("basic_message"), basicMessage2) + .build() + + val df = Seq(dynamicMessage.toByteArray).toDF("value") + + checkWithFileAndClassName("RepeatedMessage") { + case (name, descFilePathOpt) => + val fromProtoDF = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt).as("value_from")) + val toProtoDF = fromProtoDF.select( + to_protobuf_wrapper($"value_from", name, descFilePathOpt).as("value_to")) + val toFromProtoDF = toProtoDF.select( + from_protobuf_wrapper($"value_to", name, descFilePathOpt).as("value_to_from")) + checkAnswer(fromProtoDF.select($"value_from.*"), toFromProtoDF.select($"value_to_from.*")) + } + } + + test("roundtrip in from_protobuf and to_protobuf - Map") { + val messageMapDesc = ProtobufUtils.buildDescriptor(testFileDesc, "SimpleMessageMap") + + val mapStr1 = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("StringMapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("StringMapdataEntry").findFieldByName("key"), + "string_key") + .setField( + messageMapDesc.findNestedTypeByName("StringMapdataEntry").findFieldByName("value"), + 
"value1") + .build() + val mapStr2 = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("StringMapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("StringMapdataEntry").findFieldByName("key"), + "string_key") + .setField( + messageMapDesc.findNestedTypeByName("StringMapdataEntry").findFieldByName("value"), + "value2") + .build() + val mapInt64 = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("Int64MapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("Int64MapdataEntry").findFieldByName("key"), + 0x90000000000L) + .setField( + messageMapDesc.findNestedTypeByName("Int64MapdataEntry").findFieldByName("value"), + 0x90000000001L) + .build() + val mapInt32 = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("Int32MapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("Int32MapdataEntry").findFieldByName("key"), + 12345) + .setField( + messageMapDesc.findNestedTypeByName("Int32MapdataEntry").findFieldByName("value"), + 54321) + .build() + val mapFloat = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("FloatMapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("FloatMapdataEntry").findFieldByName("key"), + "float_key") + .setField( + messageMapDesc.findNestedTypeByName("FloatMapdataEntry").findFieldByName("value"), + 109202.234f) + .build() + val mapDouble = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("DoubleMapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("DoubleMapdataEntry").findFieldByName("key"), + "double_key") + .setField( + messageMapDesc.findNestedTypeByName("DoubleMapdataEntry").findFieldByName("value"), + 109202.12d) + .build() + val mapBool = DynamicMessage + .newBuilder(messageMapDesc.findNestedTypeByName("BoolMapdataEntry")) + .setField( + messageMapDesc.findNestedTypeByName("BoolMapdataEntry").findFieldByName("key"), + true) + .setField( + messageMapDesc.findNestedTypeByName("BoolMapdataEntry").findFieldByName("value"), + false) + .build() + + val dynamicMessage = DynamicMessage + .newBuilder(messageMapDesc) + .setField(messageMapDesc.findFieldByName("key"), "key") + .setField(messageMapDesc.findFieldByName("value"), "value") + .addRepeatedField(messageMapDesc.findFieldByName("string_mapdata"), mapStr1) + .addRepeatedField(messageMapDesc.findFieldByName("string_mapdata"), mapStr2) + .addRepeatedField(messageMapDesc.findFieldByName("int64_mapdata"), mapInt64) + .addRepeatedField(messageMapDesc.findFieldByName("int32_mapdata"), mapInt32) + .addRepeatedField(messageMapDesc.findFieldByName("float_mapdata"), mapFloat) + .addRepeatedField(messageMapDesc.findFieldByName("double_mapdata"), mapDouble) + .addRepeatedField(messageMapDesc.findFieldByName("bool_mapdata"), mapBool) + .build() + + val df = Seq(dynamicMessage.toByteArray).toDF("value") + + checkWithFileAndClassName("SimpleMessageMap") { + case (name, descFilePathOpt) => + val fromProtoDF = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt).as("value_from")) + val toProtoDF = fromProtoDF.select( + to_protobuf_wrapper($"value_from", name, descFilePathOpt).as("value_to")) + val toFromProtoDF = toProtoDF.select( + from_protobuf_wrapper($"value_to", name, descFilePathOpt).as("value_to_from")) + checkAnswer(fromProtoDF.select($"value_from.*"), toFromProtoDF.select($"value_to_from.*")) + } + } + + test("roundtrip in from_protobuf and to_protobuf - Enum") { + val messageEnumDesc = ProtobufUtils.buildDescriptor(testFileDesc, "SimpleMessageEnum") + val basicEnumDesc = 
ProtobufUtils.buildDescriptor(testFileDesc, "BasicEnumMessage") + + val dynamicMessage = DynamicMessage + .newBuilder(messageEnumDesc) + .setField(messageEnumDesc.findFieldByName("key"), "key") + .setField(messageEnumDesc.findFieldByName("value"), "value") + .setField( + messageEnumDesc.findFieldByName("nested_enum"), + messageEnumDesc.findEnumTypeByName("NestedEnum").findValueByName("NESTED_NOTHING")) + .setField( + messageEnumDesc.findFieldByName("nested_enum"), + messageEnumDesc.findEnumTypeByName("NestedEnum").findValueByName("NESTED_FIRST")) + .setField( + messageEnumDesc.findFieldByName("basic_enum"), + basicEnumDesc.findEnumTypeByName("BasicEnum").findValueByName("FIRST")) + .setField( + messageEnumDesc.findFieldByName("basic_enum"), + basicEnumDesc.findEnumTypeByName("BasicEnum").findValueByName("NOTHING")) + .build() + + val df = Seq(dynamicMessage.toByteArray).toDF("value") + + checkWithFileAndClassName("SimpleMessageEnum") { + case (name, descFilePathOpt) => + val fromProtoDF = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt).as("value_from")) + val toProtoDF = fromProtoDF.select( + to_protobuf_wrapper($"value_from", name, descFilePathOpt).as("value_to")) + val toFromProtoDF = toProtoDF.select( + from_protobuf_wrapper($"value_to", name, descFilePathOpt).as("value_to_from")) + checkAnswer(fromProtoDF.select($"value_from.*"), toFromProtoDF.select($"value_to_from.*")) + } + } + + test("round trip in from_protobuf and to_protobuf - Multiple Message") { + val messageMultiDesc = ProtobufUtils.buildDescriptor(testFileDesc, "MultipleExample") + val messageIncludeDesc = ProtobufUtils.buildDescriptor(testFileDesc, "IncludedExample") + val messageOtherDesc = ProtobufUtils.buildDescriptor(testFileDesc, "OtherExample") + + val otherMessage = DynamicMessage + .newBuilder(messageOtherDesc) + .setField(messageOtherDesc.findFieldByName("other"), "other value") + .build() + + val includeMessage = DynamicMessage + .newBuilder(messageIncludeDesc) + .setField(messageIncludeDesc.findFieldByName("included"), "included value") + .setField(messageIncludeDesc.findFieldByName("other"), otherMessage) + .build() + + val dynamicMessage = DynamicMessage + .newBuilder(messageMultiDesc) + .setField(messageMultiDesc.findFieldByName("included_example"), includeMessage) + .build() + + val df = Seq(dynamicMessage.toByteArray).toDF("value") + + checkWithFileAndClassName("MultipleExample") { + case (name, descFilePathOpt) => + val fromProtoDF = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt).as("value_from")) + val toProtoDF = fromProtoDF.select( + to_protobuf_wrapper($"value_from", name, descFilePathOpt).as("value_to")) + val toFromProtoDF = toProtoDF.select( + from_protobuf_wrapper($"value_to", name, descFilePathOpt).as("value_to_from")) + checkAnswer(fromProtoDF.select($"value_from.*"), toFromProtoDF.select($"value_to_from.*")) + } + + // Simple recursion + checkWithFileAndClassName("recursiveB") { // B -> A -> B + case (name, descFilePathOpt) => + val e = intercept[AnalysisException] { + emptyBinaryDF.select( + from_protobuf_wrapper($"binary", name, descFilePathOpt).as("messageFromProto")) + .show() + } + assert(e.getMessage.contains( + "Found recursive reference in Protobuf schema, which can not be processed by Spark" + )) + } + } + + test("Recursive fields in Protobuf should result in an error, C->D->Array(C)") { + checkWithFileAndClassName("recursiveD") { + case (name, descFilePathOpt) => + val e = intercept[AnalysisException] { + emptyBinaryDF.select( + 
from_protobuf_wrapper($"binary", name, descFilePathOpt).as("messageFromProto")) + .show() + } + assert(e.getMessage.contains( + "Found recursive reference in Protobuf schema, which can not be processed by Spark" + )) + } + } + + test("Setting depth to 0 or -1 should trigger error on recursive fields (B -> A -> B)") { + for (depth <- Seq("0", "-1")) { + val e = intercept[AnalysisException] { + emptyBinaryDF.select( + functions.from_protobuf( + $"binary", "recursiveB", testFileDesc, + Map("recursive.fields.max.depth" -> depth).asJava + ).as("messageFromProto") + ).show() + } + assert(e.getMessage.contains( + "Found recursive reference in Protobuf schema, which can not be processed by Spark" + )) + } + } + + test("Handle extra fields : oldProducer -> newConsumer") { + val testFileDesc = testFile("catalyst_types.desc", "protobuf/catalyst_types.desc") + val oldProducer = ProtobufUtils.buildDescriptor(testFileDesc, "oldProducer") + val newConsumer = ProtobufUtils.buildDescriptor(testFileDesc, "newConsumer") + + val oldProducerMessage = DynamicMessage + .newBuilder(oldProducer) + .setField(oldProducer.findFieldByName("key"), "key") + .build() + + val df = Seq(oldProducerMessage.toByteArray).toDF("oldProducerData") + val fromProtoDf = df.select( + functions + .from_protobuf($"oldProducerData", "newConsumer", testFileDesc) + .as("fromProto")) + + val toProtoDf = fromProtoDf.select( + functions + .to_protobuf($"fromProto", "newConsumer", testFileDesc) + .as("toProto")) + + val toProtoDfToFromProtoDf = toProtoDf.select( + functions + .from_protobuf($"toProto", "newConsumer", testFileDesc) + .as("toProtoToFromProto")) + + val actualFieldNames = + toProtoDfToFromProtoDf.select("toProtoToFromProto.*").schema.fields.toSeq.map(f => f.name) + newConsumer.getFields.asScala.map { f => + { + assert(actualFieldNames.contains(f.getName)) + + } + } + assert( + toProtoDfToFromProtoDf.select("toProtoToFromProto.value").take(1).toSeq(0).get(0) == null) + assert( + toProtoDfToFromProtoDf.select("toProtoToFromProto.actual.*").take(1).toSeq(0).get(0) == null) + } + + test("Handle extra fields : newProducer -> oldConsumer") { + val testFileDesc = testFile("catalyst_types.desc", "protobuf/catalyst_types.desc") + val newProducer = ProtobufUtils.buildDescriptor(testFileDesc, "newProducer") + val oldConsumer = ProtobufUtils.buildDescriptor(testFileDesc, "oldConsumer") + + val newProducerMessage = DynamicMessage + .newBuilder(newProducer) + .setField(newProducer.findFieldByName("key"), "key") + .setField(newProducer.findFieldByName("value"), 1) + .build() + + val df = Seq(newProducerMessage.toByteArray).toDF("newProducerData") + val fromProtoDf = df.select( + functions + .from_protobuf($"newProducerData", "oldConsumer", testFileDesc) + .as("oldConsumerProto")) + + val expectedFieldNames = oldConsumer.getFields.asScala.map(f => f.getName) + fromProtoDf.select("oldConsumerProto.*").schema.fields.toSeq.map { f => + { + assert(expectedFieldNames.contains(f.name)) + } + } + } + + test("roundtrip in to_protobuf and from_protobuf - with nulls") { + val schema = StructType( + StructField("requiredMsg", + StructType( + StructField("key", StringType, nullable = false) :: + StructField("col_1", IntegerType, nullable = true) :: + StructField("col_2", StringType, nullable = false) :: + StructField("col_3", IntegerType, nullable = true) :: Nil + ), + nullable = true + ) :: Nil + ) + val inputDf = spark.createDataFrame( + spark.sparkContext.parallelize(Seq( + Row(Row("key1", null, "value2", null)) + )), + schema + ) + + val toProtobuf = 
inputDf.select( + functions.to_protobuf($"requiredMsg", "requiredMsg", testFileDesc) + .as("to_proto")) + + val binary = toProtobuf.take(1).toSeq(0).get(0).asInstanceOf[Array[Byte]] + + val messageDescriptor = ProtobufUtils.buildDescriptor(testFileDesc, "requiredMsg") + val actualMessage = DynamicMessage.parseFrom(messageDescriptor, binary) + + assert(actualMessage.getField(messageDescriptor.findFieldByName("key")) + == inputDf.select("requiredMsg.key").take(1).toSeq(0).get(0)) + assert(actualMessage.getField(messageDescriptor.findFieldByName("col_2")) + == inputDf.select("requiredMsg.col_2").take(1).toSeq(0).get(0)) + assert(actualMessage.getField(messageDescriptor.findFieldByName("col_1")) == 0) + assert(actualMessage.getField(messageDescriptor.findFieldByName("col_3")) == 0) + + val fromProtoDf = toProtobuf.select( + functions.from_protobuf($"to_proto", "requiredMsg", testFileDesc) as 'from_proto) + + assert(fromProtoDf.select("from_proto.key").take(1).toSeq(0).get(0) + == inputDf.select("requiredMsg.key").take(1).toSeq(0).get(0)) + assert(fromProtoDf.select("from_proto.col_2").take(1).toSeq(0).get(0) + == inputDf.select("requiredMsg.col_2").take(1).toSeq(0).get(0)) + assert(fromProtoDf.select("from_proto.col_1").take(1).toSeq(0).get(0) == null) + assert(fromProtoDf.select("from_proto.col_3").take(1).toSeq(0).get(0) == null) + } + + test("from_protobuf filter to_protobuf") { + val basicMessageDesc = ProtobufUtils.buildDescriptor(testFileDesc, "BasicMessage") + + val basicMessage = DynamicMessage + .newBuilder(basicMessageDesc) + .setField(basicMessageDesc.findFieldByName("id"), 1111L) + .setField(basicMessageDesc.findFieldByName("string_value"), "slam") + .setField(basicMessageDesc.findFieldByName("int32_value"), 12345) + .setField(basicMessageDesc.findFieldByName("int64_value"), 0x90000000000L) + .setField(basicMessageDesc.findFieldByName("double_value"), 10000000000.0d) + .setField(basicMessageDesc.findFieldByName("float_value"), 10902.0f) + .setField(basicMessageDesc.findFieldByName("bool_value"), true) + .setField( + basicMessageDesc.findFieldByName("bytes_value"), + ByteString.copyFromUtf8("ProtobufDeserializer")) + .build() + + val df = Seq(basicMessage.toByteArray).toDF("value") + + val resultFrom = df + .select(from_protobuf_wrapper($"value", "BasicMessage", Some(testFileDesc)) as 'sample) + .where("sample.string_value == \"slam\"") + + val resultToFrom = resultFrom + .select(to_protobuf_wrapper($"sample", "BasicMessage", Some(testFileDesc)) as 'value) + .select(from_protobuf_wrapper($"value", "BasicMessage", Some(testFileDesc)) as 'sample) + .where("sample.string_value == \"slam\"") + + assert(resultFrom.except(resultToFrom).isEmpty) + } + + test("Handle TimestampType between to_protobuf and from_protobuf") { + val schema = StructType( + StructField("timeStampMsg", + StructType( + StructField("key", StringType, nullable = true) :: + StructField("stmp", TimestampType, nullable = true) :: Nil + ), + nullable = true + ) :: Nil + ) + + val inputDf = spark.createDataFrame( + spark.sparkContext.parallelize(Seq( + Row(Row("key1", Timestamp.valueOf("2016-05-09 10:12:43.999"))) + )), + schema + ) + + checkWithFileAndClassName("timeStampMsg") { + case (name, descFilePathOpt) => + val toProtoDf = inputDf + .select(to_protobuf_wrapper($"timeStampMsg", name, descFilePathOpt) as 'to_proto) + + val fromProtoDf = toProtoDf + .select(from_protobuf_wrapper($"to_proto", name, descFilePathOpt) as 'timeStampMsg) + + val actualFields = fromProtoDf.schema.fields.toList + val expectedFields = 
inputDf.schema.fields.toList + + assert(actualFields.size === expectedFields.size) + assert(actualFields === expectedFields) + assert(fromProtoDf.select("timeStampMsg.key").take(1).toSeq(0).get(0) + === inputDf.select("timeStampMsg.key").take(1).toSeq(0).get(0)) + assert(fromProtoDf.select("timeStampMsg.stmp").take(1).toSeq(0).get(0) + === inputDf.select("timeStampMsg.stmp").take(1).toSeq(0).get(0)) + } + } + + test("Handle DayTimeIntervalType between to_protobuf and from_protobuf") { + val schema = StructType( + StructField("durationMsg", + StructType( + StructField("key", StringType, nullable = true) :: + StructField("duration", + DayTimeIntervalType.defaultConcreteType, nullable = true) :: Nil + ), + nullable = true + ) :: Nil + ) + + val inputDf = spark.createDataFrame( + spark.sparkContext.parallelize(Seq( + Row(Row("key1", + Duration.ofDays(1).plusHours(2).plusMinutes(3).plusSeconds(4) + )) + )), + schema + ) + + checkWithFileAndClassName("durationMsg") { + case (name, descFilePathOpt) => + val toProtoDf = inputDf + .select(to_protobuf_wrapper($"durationMsg", name, descFilePathOpt) as 'to_proto) + + val fromProtoDf = toProtoDf + .select(from_protobuf_wrapper($"to_proto", name, descFilePathOpt) as 'durationMsg) + + val actualFields = fromProtoDf.schema.fields.toList + val expectedFields = inputDf.schema.fields.toList + + assert(actualFields.size === expectedFields.size) + assert(actualFields === expectedFields) + assert(fromProtoDf.select("durationMsg.key").take(1).toSeq(0).get(0) + === inputDf.select("durationMsg.key").take(1).toSeq(0).get(0)) + assert(fromProtoDf.select("durationMsg.duration").take(1).toSeq(0).get(0) + === inputDf.select("durationMsg.duration").take(1).toSeq(0).get(0)) + } + } + + test("raise cannot construct protobuf descriptor error") { + val df = Seq(ByteString.empty().toByteArray).toDF("value") + val testFileDescriptor = + testFile("basicmessage_noimports.desc", "protobuf/basicmessage_noimports.desc") + + val e = intercept[AnalysisException] { + df.select(functions.from_protobuf($"value", "BasicMessage", testFileDescriptor) as 'sample) + .where("sample.string_value == \"slam\"").show() + } + checkError( + exception = e, + errorClass = "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR", + parameters = Map("descFilePath" -> testFileDescriptor)) + } + + test("Verify OneOf field between from_protobuf -> to_protobuf and struct -> from_protobuf") { + val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEvent") + val oneOfEvent = OneOfEvent.newBuilder() + .setKey("key") + .setCol1(123) + .setCol3(109202L) + .setCol2("col2value") + .addCol4("col4value").build() + + val df = Seq(oneOfEvent.toByteArray).toDF("value") + + checkWithFileAndClassName("OneOfEvent") { + case (name, descFilePathOpt) => + val fromProtoDf = df.select( + from_protobuf_wrapper($"value", name, descFilePathOpt) as 'sample) + val toDf = fromProtoDf.select( + to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'toProto) + val toFromDf = toDf.select( + from_protobuf_wrapper($"toProto", name, descFilePathOpt) as 'fromToProto) + checkAnswer(fromProtoDf, toFromDf) + val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name) + descriptor.getFields.asScala.map(f => { + assert(actualFieldNames.contains(f.getName)) + }) + + val eventFromSpark = OneOfEvent.parseFrom( + toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) + // OneOf field: the last set value(by order) will overwrite all previous ones. 
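+ // In OneOfEvent, col_2 and col_3 appear to share a oneof, so calling setCol2() after setCol3() above + // leaves only col_2 set and col_3 falls back to its default of 0, which the asserts below check.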
+ assert(eventFromSpark.getCol2.equals("col2value")) + assert(eventFromSpark.getCol3 == 0) + val expectedFields = descriptor.getFields.asScala.map(f => f.getName) + eventFromSpark.getDescriptorForType.getFields.asScala.map(f => { + assert(expectedFields.contains(f.getName)) + }) + + val schema = DataType.fromJson( + """ + | { + | "type":"struct", + | "fields":[ + | {"name":"sample","nullable":true,"type":{ + | "type":"struct", + | "fields":[ + | {"name":"key","type":"string","nullable":true}, + | {"name":"col_1","type":"integer","nullable":true}, + | {"name":"col_2","type":"string","nullable":true}, + | {"name":"col_3","type":"long","nullable":true}, + | {"name":"col_4","nullable":true,"type":{ + | "type":"array","elementType":"string","containsNull":false}} + | ]} + | } + | ] + | } + |""".stripMargin).asInstanceOf[StructType] + assert(fromProtoDf.schema == schema) + + val data = Seq( + Row(Row("key", 123, "col2value", 109202L, Seq("col4value"))), + Row(Row("key2", null, null, null, null)) // Leave the rest null, including "col_4" array. + ) + val dataDf = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + val dataDfToProto = dataDf.select( + to_protobuf_wrapper($"sample", name, descFilePathOpt) as 'toProto) + + val toProtoResults = dataDfToProto.select("toProto").collect() + val eventFromSparkSchema = OneOfEvent.parseFrom(toProtoResults(0).getAs[Array[Byte]](0)) + assert(eventFromSparkSchema.getCol2.isEmpty) + assert(eventFromSparkSchema.getCol3 == 109202L) + eventFromSparkSchema.getDescriptorForType.getFields.asScala.map(f => { + assert(expectedFields.contains(f.getName)) + }) + val secondEventFromSpark = OneOfEvent.parseFrom(toProtoResults(1).getAs[Array[Byte]](0)) + assert(secondEventFromSpark.getKey == "key2") + } + } + + test("Fail for recursion field with complex schema without recursive.fields.max.depth") { + checkWithFileAndClassName("EventWithRecursion") { + case (name, descFilePathOpt) => + val e = intercept[AnalysisException] { + emptyBinaryDF.select( + from_protobuf_wrapper($"binary", name, descFilePathOpt).as("messageFromProto")) + .show() + } + assert(e.getMessage.contains( + "Found recursive reference in Protobuf schema, which can not be processed by Spark" + )) + } + } + + test("Verify recursion field with complex schema with recursive.fields.max.depth") { + val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "Employee") + + val manager = Employee.newBuilder().setFirstName("firstName").setLastName("lastName").build() + val em2 = EM2.newBuilder().setTeamsize(100).setEm2Manager(manager).build() + val em = EM.newBuilder().setTeamsize(100).setEmManager(manager).build() + val ic = IC.newBuilder().addSkills("java").setIcManager(manager).build() + val employee = Employee.newBuilder().setFirstName("firstName") + .setLastName("lastName").setEm2(em2).setEm(em).setIc(ic).build() + + val df = Seq(employee.toByteArray).toDF("protoEvent") + val options = new java.util.HashMap[String, String]() + options.put("recursive.fields.max.depth", "2") + + val fromProtoDf = df.select( + functions.from_protobuf($"protoEvent", "Employee", testFileDesc, options) as 'sample) + + val toDf = fromProtoDf.select( + functions.to_protobuf($"sample", "Employee", testFileDesc) as 'toProto) + val toFromDf = toDf.select( + functions.from_protobuf($"toProto", + "Employee", + testFileDesc, + options) as 'fromToProto) + + checkAnswer(fromProtoDf, toFromDf) + + val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name) + descriptor.getFields.asScala.map(f 
=> { + assert(actualFieldNames.contains(f.getName)) + }) + + val eventFromSpark = Employee.parseFrom( + toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) + + assert(eventFromSpark.getIc.getIcManager.getFirstName.equals("firstName")) + assert(eventFromSpark.getIc.getIcManager.getLastName.equals("lastName")) + assert(eventFromSpark.getEm2.getEm2Manager.getFirstName.isEmpty) + } + + test("Verify OneOf field with recursive fields between from_protobuf -> to_protobuf." + + "and struct -> from_protobuf") { + val descriptor = ProtobufUtils.buildDescriptor(testFileDesc, "OneOfEventWithRecursion") + + val nestedTwo = OneOfEventWithRecursion.newBuilder() + .setKey("keyNested2").setValue("valueNested2").build() + val nestedOne = EventRecursiveA.newBuilder() + .setKey("keyNested1") + .setRecursiveOneOffInA(nestedTwo).build() + val oneOfRecursionEvent = OneOfEventWithRecursion.newBuilder() + .setKey("keyNested0") + .setValue("valueNested0") + .setRecursiveA(nestedOne).build() + val recursiveA = EventRecursiveA.newBuilder().setKey("recursiveAKey") + .setRecursiveOneOffInA(oneOfRecursionEvent).build() + val recursiveB = EventRecursiveB.newBuilder() + .setKey("recursiveBKey") + .setValue("recursiveBvalue").build() + val oneOfEventWithRecursion = OneOfEventWithRecursion.newBuilder() + .setKey("key") + .setValue("value") + .setRecursiveB(recursiveB) + .setRecursiveA(recursiveA).build() + + val df = Seq(oneOfEventWithRecursion.toByteArray).toDF("value") + + val options = new java.util.HashMap[String, String]() + options.put("recursive.fields.max.depth", "2") // Recursive fields appear twice. + + val fromProtoDf = df.select( + functions.from_protobuf($"value", + "OneOfEventWithRecursion", + testFileDesc, options) as 'sample) + val toDf = fromProtoDf.select( + functions.to_protobuf($"sample", "OneOfEventWithRecursion", testFileDesc) as 'toProto) + val toFromDf = toDf.select( + functions.from_protobuf($"toProto", + "OneOfEventWithRecursion", + testFileDesc, + options) as 'fromToProto) + + checkAnswer(fromProtoDf, toFromDf) + + val actualFieldNames = fromProtoDf.select("sample.*").schema.fields.toSeq.map(f => f.name) + descriptor.getFields.asScala.map(f => { + assert(actualFieldNames.contains(f.getName)) + }) + + val eventFromSpark = OneOfEventWithRecursion.parseFrom( + toDf.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) + + var recursiveField = eventFromSpark.getRecursiveA.getRecursiveOneOffInA + assert(recursiveField.getKey.equals("keyNested0")) + assert(recursiveField.getValue.equals("valueNested0")) + assert(recursiveField.getRecursiveA.getKey.equals("keyNested1")) + assert(recursiveField.getRecursiveA.getRecursiveOneOffInA.getKey.isEmpty()) + + val expectedFields = descriptor.getFields.asScala.map(f => f.getName) + eventFromSpark.getDescriptorForType.getFields.asScala.map(f => { + assert(expectedFields.contains(f.getName)) + }) + + val schemaDDL = + """ + | -- OneOfEvenWithRecursion with max depth 2. 
+ | sample STRUCT< -- 1st level for OneOffWithRecursion + | key string, + | recursiveA STRUCT< -- 1st level for RecursiveA + | recursiveOneOffInA STRUCT< -- 2nd level for OneOffWithRecursion + | key string, + | recursiveA STRUCT< -- 2nd level for RecursiveA + | key string + | -- Removed recursiveOneOffInA: 3rd level for OneOffWithRecursion + | >, + | recursiveB STRUCT< + | key string, + | value string + | -- Removed recursiveOneOffInB: 3rd level for OneOffWithRecursion + | >, + | value string + | >, + | key string + | >, + | recursiveB STRUCT< -- 1st level for RecursiveB + | key string, + | value string, + | recursiveOneOffInB STRUCT< -- 2nd level for OneOffWithRecursion + | key string, + | recursiveA STRUCT< -- 1st level for RecursiveA + | key string + | -- Removed recursiveOneOffInA: 3rd level for OneOffWithRecursion + | >, + | recursiveB STRUCT< + | key string, + | value string + | -- Removed recursiveOneOffInB: 3rd level for OneOffWithRecursion + | >, + | value string + | > + | >, + | value string + | > + |""".stripMargin + val schema = structFromDDL(schemaDDL) + assert(fromProtoDf.schema == schema) + val data = Seq( + Row( + Row("key1", + Row( + Row("keyNested0", null, null, "valueNested0"), + "recursiveAKey"), + null, + "value1") + ) + ) + val dataDf = spark.createDataFrame(spark.sparkContext.parallelize(data), schema) + val dataDfToProto = dataDf.select( + functions.to_protobuf($"sample", "OneOfEventWithRecursion", testFileDesc) as 'toProto) + + val eventFromSparkSchema = OneOfEventWithRecursion.parseFrom( + dataDfToProto.select("toProto").take(1).toSeq(0).getAs[Array[Byte]](0)) + recursiveField = eventFromSparkSchema.getRecursiveA.getRecursiveOneOffInA + assert(recursiveField.getKey.equals("keyNested0")) + assert(recursiveField.getValue.equals("valueNested0")) + assert(recursiveField.getRecursiveA.getKey.isEmpty()) + eventFromSparkSchema.getDescriptorForType.getFields.asScala.map(f => { + assert(expectedFields.contains(f.getName)) + }) + } + + test("Verify recursive.fields.max.depth Levels 1,2, and 3 with Simple Schema") { + val eventPerson3 = EventPerson.newBuilder().setName("person3").build() + val eventPerson2 = EventPerson.newBuilder().setName("person2").setBff(eventPerson3).build() + val eventPerson1 = EventPerson.newBuilder().setName("person1").setBff(eventPerson2).build() + val eventPerson0 = EventPerson.newBuilder().setName("person0").setBff(eventPerson1).build() + val df = Seq(eventPerson0.toByteArray).toDF("value") + + val optionsZero = new java.util.HashMap[String, String]() + optionsZero.put("recursive.fields.max.depth", "1") + val schemaOne = structFromDDL( + "sample STRUCT" // 'bff' field is dropped due to the limit of 1. + ) + val expectedDfOne = spark.createDataFrame( + spark.sparkContext.parallelize(Seq(Row(Row("person0", null)))), schemaOne) + testFromProtobufWithOptions(df, expectedDfOne, optionsZero, "EventPerson") + + val optionsTwo = new java.util.HashMap[String, String]() + optionsTwo.put("recursive.fields.max.depth", "2") + val schemaTwo = structFromDDL( + """ + | sample STRUCT< + | name: STRING, + | bff: STRUCT -- Recursion is terminated here.
+ | > + |""".stripMargin) + val expectedDfTwo = spark.createDataFrame( + spark.sparkContext.parallelize(Seq(Row(Row("person0", Row("person1", null))))), schemaTwo) + testFromProtobufWithOptions(df, expectedDfTwo, optionsTwo, "EventPerson") + + val optionsThree = new java.util.HashMap[String, String]() + optionsThree.put("recursive.fields.max.depth", "3") + val schemaThree = structFromDDL( + """ + | sample STRUCT< + | name: STRING, + | bff: STRUCT< + | name: STRING, + | bff: STRUCT + | > + | > + |""".stripMargin) + val expectedDfThree = spark.createDataFrame(spark.sparkContext.parallelize( + Seq(Row(Row("person0", Row("person1", Row("person2", null)))))), schemaThree) + testFromProtobufWithOptions(df, expectedDfThree, optionsThree, "EventPerson") + + // Test recursive level 1 with EventPersonWrapper. In this case the top level struct + // 'EventPersonWrapper' itself does not recurse unlike 'EventPerson'. + // "bff" appears twice: Once allowed recursion and second time as terminated "null" type. + val wrapperSchemaOne = structFromDDL( + """ + | sample STRUCT< + | person: STRUCT< -- 1st level + | name: STRING, + | bff: STRUCT -- 2nd level. Inner 3rd level Person is dropped. + | > + | > + |""".stripMargin).asInstanceOf[StructType] + val expectedWrapperDfTwo = spark.createDataFrame( + spark.sparkContext.parallelize(Seq(Row(Row(Row("person0", Row("person1", null)))))), + wrapperSchemaOne) + testFromProtobufWithOptions( + Seq(EventPersonWrapper.newBuilder().setPerson(eventPerson0).build().toByteArray).toDF(), + expectedWrapperDfTwo, + optionsTwo, + "EventPersonWrapper" + ) + } + + test("Verify exceptions are correctly propagated with errors") { + // This triggers an query compilation error and ensures that original exception is + // also included in in the exception. + + val invalidDescPath = "/non/existent/path.desc" + + val ex = intercept[AnalysisException] { + Seq(Array[Byte]()) + .toDF() + .select( + functions.from_protobuf($"value", "SomeMessage", invalidDescPath) + ).collect() + } + checkError( + ex, + errorClass = "PROTOBUF_DESCRIPTOR_FILE_NOT_FOUND", + parameters = Map("filePath" -> "/non/existent/path.desc") + ) + assert(ex.getCause != null) + assert(ex.getCause.getMessage.matches(".*No such file.*"), ex.getCause.getMessage()) + } + + test("Recursive fields in arrays and maps") { + // Verifies schema for recursive proto in an array field & map field. + val options = Map("recursive.fields.max.depth" -> "3") + + checkWithFileAndClassName("PersonWithRecursiveArray") { + case (name, descFilePathOpt) => + val expectedSchema = StructType( + // DDL: "proto STRUCT>>>>" + // Can not use DataType.fromDDL(), it does not support "containsNull: false" for arrays. 
+ StructField("proto", + StructType( // 1st level + StructField("name", StringType) :: StructField("friends", // 2nd level + ArrayType( + StructType(StructField("name", StringType) :: StructField("friends", // 3rd level + ArrayType( + StructType(StructField("name", StringType) :: Nil), // 4th, array dropped + containsNull = false) + ):: Nil), + containsNull = false) + ) :: Nil + ) + ) :: Nil + ) + + val df = emptyBinaryDF.select( + from_protobuf_wrapper($"binary", name, descFilePathOpt, options).as("proto") + ) + assert(df.schema == expectedSchema) + } + + checkWithFileAndClassName("PersonWithRecursiveMap") { + case (name, descFilePathOpt) => + val expectedSchema = StructType( + // DDL: "proto STRUCT>>>>" + StructField("proto", + StructType( // 1st level + StructField("name", StringType) :: StructField("groups", // 2nd level + MapType( + StringType, + StructType(StructField("name", StringType) :: StructField("groups", // 3rd level + MapType( + StringType, + StructType(StructField("name", StringType) :: Nil), // 4th, array dropped + valueContainsNull = false) + ):: Nil), + valueContainsNull = false) + ) :: Nil + ) + ) :: Nil + ) + + val df = emptyBinaryDF.select( + from_protobuf_wrapper($"binary", name, descFilePathOpt, options).as("proto") + ) + assert(df.schema == expectedSchema) + } + } + + test("Corner case: empty recursive proto fields should be dropped") { + // This verifies that a empty proto like 'message A { A a = 1}' are completely dropped + // irrespective of max depth setting. + + val options = Map("recursive.fields.max.depth" -> "4") + + // EmptyRecursiveProto at the top level. It will be an empty struct. + checkWithFileAndClassName("EmptyRecursiveProto") { + case (name, descFilePathOpt) => + val df = emptyBinaryDF.select( + from_protobuf_wrapper($"binary", name, descFilePathOpt, options).as("empty_proto") + ) + assert(df.schema == structFromDDL("empty_proto struct<>")) + } + + // EmptyRecursiveProto at inner level. + checkWithFileAndClassName("EmptyRecursiveProtoWrapper") { + case (name, descFilePathOpt) => + val df = emptyBinaryDF.select( + from_protobuf_wrapper($"binary", name, descFilePathOpt, options).as("wrapper") + ) + // 'empty_recursive' field is dropped from the schema. Only "name" is present. + assert(df.schema == structFromDDL("wrapper struct")) + } + } + + def testFromProtobufWithOptions( + df: DataFrame, + expectedDf: DataFrame, + options: java.util.HashMap[String, String], + messageName: String): Unit = { + val fromProtoDf = df.select( + functions.from_protobuf($"value", messageName, testFileDesc, options) as 'sample) + assert(expectedDf.schema === fromProtoDf.schema) + checkAnswer(fromProtoDf, expectedDf) + } +} diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala new file mode 100644 index 0000000000000..356cd20eb4e4d --- /dev/null +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufSerdeSuite.scala @@ -0,0 +1,313 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.protobuf + +import com.google.protobuf.Descriptors.Descriptor +import com.google.protobuf.DynamicMessage + +import org.apache.spark.sql.AnalysisException +import org.apache.spark.sql.catalyst.{InternalRow, NoopFilters} +import org.apache.spark.sql.catalyst.expressions.Cast.toSQLType +import org.apache.spark.sql.protobuf.utils.ProtobufUtils +import org.apache.spark.sql.test.SharedSparkSession +import org.apache.spark.sql.types.{IntegerType, StructType} + +/** + * Tests for [[ProtobufSerializer]] and [[ProtobufDeserializer]] with a more specific focus on + * those classes. + */ +class ProtobufSerdeSuite extends SharedSparkSession with ProtobufTestBase { + + import ProtoSerdeSuite._ + import ProtoSerdeSuite.MatchType._ + + val testFileDesc = testFile("serde_suite.desc", "protobuf/serde_suite.desc") + private val javaClassNamePrefix = "org.apache.spark.sql.protobuf.protos.SerdeSuiteProtos$" + + val proto2Desc = testFile("proto2_messages.desc", "protobuf/proto2_messages.desc") + + test("Test basic conversion") { + withFieldMatchType { fieldMatch => + val (top, nest) = fieldMatch match { + case BY_NAME => ("foo", "bar") + } + val protoFile = ProtobufUtils.buildDescriptor(testFileDesc, "BasicMessage") + + val dynamicMessageFoo = DynamicMessage + .newBuilder(protoFile.getFile.findMessageTypeByName("Foo")) + .setField(protoFile.getFile.findMessageTypeByName("Foo").findFieldByName("bar"), 10902) + .build() + + val dynamicMessage = DynamicMessage + .newBuilder(protoFile) + .setField(protoFile.findFieldByName("foo"), dynamicMessageFoo) + .build() + + val serializer = Serializer.create(CATALYST_STRUCT, protoFile, fieldMatch) + val deserializer = Deserializer.create(CATALYST_STRUCT, protoFile, fieldMatch) + + assert( + serializer.serialize(deserializer.deserialize(dynamicMessage).get) === dynamicMessage) + } + } + + test("Optional fields can be dropped from input SQL schema for the serializer") { + // This test verifies that optional fields can be missing from input Catalyst schema + // while serializing rows to protobuf. + + val desc = ProtobufUtils.buildDescriptor(proto2Desc, "FoobarWithRequiredFieldBar") + + // Confirm desc contains optional field 'foo' and required field 'bar'. + assert(desc.getFields.size() == 2) + assert(desc.findFieldByName("foo").isOptional) + + // Use catalyst type without optional "foo". + val sqlType = structFromDDL("struct") + val serializer = new ProtobufSerializer(sqlType, desc, nullable = false) // Should work fine. + + // Should be able to serialize a row. + val protoMessage = serializer.serialize(InternalRow(22)).asInstanceOf[DynamicMessage] + + // Verify the descriptor and the value.
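+ // Only 'bar' was supplied, so the optional proto2 field 'foo' is presumably left unset in the message.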
+ assert(protoMessage.getDescriptorForType == desc) + assert(protoMessage.getField(desc.findFieldByName("bar")).asInstanceOf[Int] == 22) + } + + test("Fail to convert with field type mismatch") { + val protoFile = ProtobufUtils.buildDescriptor(testFileDesc, "MissMatchTypeInRoot") + withFieldMatchType { fieldMatch => + assertFailedConversionMessage( + protoFile, + Deserializer, + fieldMatch, + errorClass = "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE", + params = Map( + "protobufType" -> "MissMatchTypeInRoot", + "toType" -> toSQLType(CATALYST_STRUCT))) + + assertFailedConversionMessage( + protoFile, + Serializer, + fieldMatch, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", + params = Map( + "protobufType" -> "MissMatchTypeInRoot", + "toType" -> toSQLType(CATALYST_STRUCT))) + } + } + + test("Fail to convert with missing nested Protobuf fields for serializer") { + val protoFile = ProtobufUtils.buildDescriptor(testFileDesc, "FieldMissingInProto") + + val nonnullCatalyst = new StructType() + .add("foo", new StructType().add("bar", IntegerType, nullable = false)) + + // serialize fails whether or not 'bar' is nullable + assertFailedConversionMessage( + protoFile, + Serializer, + BY_NAME, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", + params = Map( + "protobufType" -> "FieldMissingInProto", + "toType" -> toSQLType(CATALYST_STRUCT))) + + assertFailedConversionMessage(protoFile, + Serializer, + BY_NAME, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", + params = Map( + "protobufType" -> "FieldMissingInProto", + "toType" -> toSQLType(nonnullCatalyst))) + } + + test("Fail to convert with deeply nested field type mismatch") { + val protoFile = ProtobufUtils.buildDescriptorFromJavaClass( + s"${javaClassNamePrefix}MissMatchTypeInDeepNested" + ) + val catalyst = new StructType().add("top", CATALYST_STRUCT) + + withFieldMatchType { fieldMatch => + assertFailedConversionMessage( + protoFile, + Deserializer, + fieldMatch, + catalyst, + errorClass = "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE", + params = Map( + "protobufType" -> "MissMatchTypeInDeepNested", + "toType" -> toSQLType(catalyst))) + + assertFailedConversionMessage( + protoFile, + Serializer, + fieldMatch, + catalyst, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", + params = Map( + "protobufType" -> "MissMatchTypeInDeepNested", + "toType" -> toSQLType(catalyst))) + } + } + + test("Fail to convert with missing Catalyst fields") { + val protoFile = ProtobufUtils.buildDescriptor(testFileDesc, "FieldMissingInSQLRoot") + + val foobarSQLType = structFromDDL("struct") // "bar" is missing. + + assertFailedConversionMessage( + ProtobufUtils.buildDescriptor(proto2Desc, "FoobarWithRequiredFieldBar"), + Serializer, + BY_NAME, + catalystSchema = foobarSQLType, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", + params = Map( + "protobufType" -> "FoobarWithRequiredFieldBar", + "toType" -> toSQLType(foobarSQLType))) + + /* deserializing should work regardless of whether the extra field is missing + in SQL Schema or not */ + withFieldMatchType(Deserializer.create(CATALYST_STRUCT, protoFile, _)) + withFieldMatchType(Deserializer.create(CATALYST_STRUCT, protoFile, _)) + + val protoNestedFile = ProtobufUtils + .buildDescriptor(proto2Desc, "NestedFoobarWithRequiredFieldBar") + + val nestedFoobarSQLType = structFromDDL( + "struct>" // "bar" field is missing. 
+ ) + // serializing fails if the required field is missing in the inner struct + assertFailedConversionMessage( + ProtobufUtils.buildDescriptor(proto2Desc, "NestedFoobarWithRequiredFieldBar"), + Serializer, + BY_NAME, + catalystSchema = nestedFoobarSQLType, + errorClass = "UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE", + params = Map( + "protobufType" -> "NestedFoobarWithRequiredFieldBar", + "toType" -> toSQLType(nestedFoobarSQLType))) + + /* deserializing should work regardless of whether the extra field is missing + in SQL Schema or not */ + withFieldMatchType(Deserializer.create(nestedFoobarSQLType, protoNestedFile, _)) + } + + test("raise cannot parse and construct protobuf descriptor error") { + // passing serde_suite.proto instead of serde_suite.desc + var testFileDesc = testFile("serde_suite.proto", "protobuf/serde_suite.proto") + val e1 = intercept[AnalysisException] { + ProtobufUtils.buildDescriptor(testFileDesc, "SerdeBasicMessage") + } + + checkError( + exception = e1, + errorClass = "CANNOT_PARSE_PROTOBUF_DESCRIPTOR", + parameters = Map("descFilePath" -> testFileDesc)) + + testFileDesc = testFile("basicmessage_noimports.desc", "protobuf/basicmessage_noimports.desc") + val e2 = intercept[AnalysisException] { + ProtobufUtils.buildDescriptor(testFileDesc, "SerdeBasicMessage") + } + + checkError( + exception = e2, + errorClass = "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR", + parameters = Map("descFilePath" -> testFileDesc)) + } + + /** + * Attempt to convert `catalystSchema` to `protoSchema` (or vice-versa when `serdeFactory` is + * the Deserializer), assert that it fails, and assert that the thrown AnalysisException + * carries the expected `errorClass` and `params`. + */ + private def assertFailedConversionMessage( + protoSchema: Descriptor, + serdeFactory: SerdeFactory[_], + fieldMatchType: MatchType, + catalystSchema: StructType = CATALYST_STRUCT, + errorClass: String, + params: Map[String, String]): Unit = { + + val e = intercept[AnalysisException] { + serdeFactory.create(catalystSchema, protoSchema, fieldMatchType) + } + + val expectMsg = serdeFactory match { + case Deserializer => + s"[CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE] Unable to convert" + + s" ${protoSchema.getName} of Protobuf to SQL type ${toSQLType(catalystSchema)}." + case Serializer => + s"[UNABLE_TO_CONVERT_TO_PROTOBUF_MESSAGE_TYPE] Unable to convert SQL type" + + s" ${toSQLType(catalystSchema)} to Protobuf type ${protoSchema.getName}." + } + + assert(e.getMessage === expectMsg) + checkError( + exception = e, + errorClass = errorClass, + parameters = params) + } + + def withFieldMatchType(f: MatchType => Unit): Unit = { + MatchType.values.foreach { fieldMatchType => + withClue(s"fieldMatchType == $fieldMatchType") { + f(fieldMatchType) + } + } + } +} + +object ProtoSerdeSuite { + + val CATALYST_STRUCT = + new StructType().add("foo", new StructType().add("bar", IntegerType)) + + /** + * Specifier for type of field matching to be used for easy creation of tests that do by-name + * field matching. + */ + object MatchType extends Enumeration { + type MatchType = Value + val BY_NAME = Value + } + + import MatchType._ + + /** + * Specifier for type of serde to be used for easy creation of tests that do both serialization + * and deserialization.
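+ * For example, `Serializer.create(CATALYST_STRUCT, descriptor, BY_NAME)` would build a + * ProtobufSerializer for that SQL/Protobuf schema pair (an illustrative call, mirroring the + * factories defined below).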
+ */ + sealed trait SerdeFactory[T] { + def create(sqlSchema: StructType, descriptor: Descriptor, fieldMatchType: MatchType): T + } + + object Serializer extends SerdeFactory[ProtobufSerializer] { + override def create( + sql: StructType, + descriptor: Descriptor, + matchType: MatchType): ProtobufSerializer = new ProtobufSerializer(sql, descriptor, false) + } + + object Deserializer extends SerdeFactory[ProtobufDeserializer] { + override def create( + sql: StructType, + descriptor: Descriptor, + matchType: MatchType): ProtobufDeserializer = + new ProtobufDeserializer(descriptor, sql, new NoopFilters) + } +} diff --git a/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufTestBase.scala b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufTestBase.scala new file mode 100644 index 0000000000000..2ead89e4545c2 --- /dev/null +++ b/connector/protobuf/src/test/scala/org/apache/spark/sql/protobuf/ProtobufTestBase.scala @@ -0,0 +1,41 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark.sql.protobuf + +import org.apache.spark.sql.test.SQLTestUtils +import org.apache.spark.sql.types.{DataType, StructType} + +trait ProtobufTestBase extends SQLTestUtils { + + /** + * Returns the full path to the given file in the resource folder. + * If looking up `fileName` throws an NPE, the full path of `alternateFileName` is returned instead. + * The resulting path doesn't contain the `file:/` protocol part.
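+ * For example, `testFile("functions_suite.desc", "protobuf/functions_suite.desc")` resolves the + * first name and falls back to the second when the first cannot be found.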
+ */ + protected def testFile(fileName: String, alternateFileName: String): String = { + val ret = try { + testFile(fileName) + } catch { + case _: NullPointerException => testFile(alternateFileName) + } + ret.replace("file:/", "/") + } + + protected def structFromDDL(ddl: String): StructType = + DataType.fromDDL(ddl).asInstanceOf[StructType] +} diff --git a/connector/spark-ganglia-lgpl/pom.xml b/connector/spark-ganglia-lgpl/pom.xml new file mode 100644 index 0000000000000..08fab33634acc --- /dev/null +++ b/connector/spark-ganglia-lgpl/pom.xml @@ -0,0 +1,48 @@ + + + + 4.0.0 + + org.apache.spark + spark-parent_2.12 + 3.4.1 + ../../pom.xml + + + + spark-ganglia-lgpl_2.12 + jar + Spark Ganglia Integration + + + ganglia-lgpl + + + + + org.apache.spark + spark-core_${scala.binary.version} + ${project.version} + + + info.ganglia.gmetric4j + gmetric4j + 1.0.10 + + + diff --git a/external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java b/connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java similarity index 100% rename from external/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java rename to connector/spark-ganglia-lgpl/src/main/java/com/codahale/metrics/ganglia/GangliaReporter.java diff --git a/external/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala b/connector/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala similarity index 100% rename from external/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala rename to connector/spark-ganglia-lgpl/src/main/scala/org/apache/spark/metrics/sink/GangliaSink.scala diff --git a/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt b/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt index b8294fbabbbdb..cad3f67c79e8f 100644 --- a/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-jdk11-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 372 439 69 0.3 3721.8 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 244 307 82 0.4 2439.9 1.5X -Coalesce Num Partitions: 100 Num Hosts: 10 238 285 56 0.4 2376.3 1.6X -Coalesce Num Partitions: 100 Num Hosts: 20 268 299 50 0.4 2683.0 1.4X -Coalesce Num Partitions: 100 Num Hosts: 40 237 304 84 0.4 2367.0 1.6X -Coalesce Num Partitions: 100 Num Hosts: 80 256 263 8 0.4 2562.7 1.5X -Coalesce Num Partitions: 500 Num Hosts: 1 588 647 83 0.2 5882.1 0.6X -Coalesce Num Partitions: 500 Num Hosts: 5 335 340 6 0.3 3347.3 1.1X -Coalesce Num Partitions: 500 Num Hosts: 10 272 350 71 0.4 2716.5 1.4X -Coalesce Num Partitions: 500 Num Hosts: 20 250 297 60 0.4 2501.5 1.5X -Coalesce Num Partitions: 500 Num Hosts: 40 238 330 83 0.4 2376.8 1.6X -Coalesce Num Partitions: 500 Num Hosts: 80 235 330 83 0.4 2349.8 1.6X -Coalesce Num Partitions: 1000 Num Hosts: 1 1010 1061 60 0.1 10102.5 0.4X -Coalesce Num Partitions: 1000 Num Hosts: 5 
411 415 5 0.2 4105.3 0.9X -Coalesce Num Partitions: 1000 Num Hosts: 10 302 348 75 0.3 3022.8 1.2X -Coalesce Num Partitions: 1000 Num Hosts: 20 262 319 93 0.4 2624.4 1.4X -Coalesce Num Partitions: 1000 Num Hosts: 40 291 352 95 0.3 2910.4 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 80 275 324 79 0.4 2746.0 1.4X -Coalesce Num Partitions: 5000 Num Hosts: 1 4077 4116 57 0.0 40771.2 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 5 1401 1449 66 0.1 14012.6 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 10 820 849 43 0.1 8196.3 0.5X -Coalesce Num Partitions: 5000 Num Hosts: 20 556 611 81 0.2 5560.3 0.7X -Coalesce Num Partitions: 5000 Num Hosts: 40 373 431 91 0.3 3732.1 1.0X -Coalesce Num Partitions: 5000 Num Hosts: 80 332 375 64 0.3 3316.1 1.1X -Coalesce Num Partitions: 10000 Num Hosts: 1 7346 7372 32 0.0 73455.9 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 5 2599 2656 85 0.0 25985.4 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 10 1429 1458 43 0.1 14292.3 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 20 822 857 50 0.1 8224.4 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 40 528 592 102 0.2 5278.3 0.7X -Coalesce Num Partitions: 10000 Num Hosts: 80 389 457 101 0.3 3894.3 1.0X +Coalesce Num Partitions: 100 Num Hosts: 1 383 456 89 0.3 3827.2 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 250 305 48 0.4 2495.2 1.5X +Coalesce Num Partitions: 100 Num Hosts: 10 239 280 61 0.4 2386.8 1.6X +Coalesce Num Partitions: 100 Num Hosts: 20 220 268 81 0.5 2198.6 1.7X +Coalesce Num Partitions: 100 Num Hosts: 40 235 297 101 0.4 2352.3 1.6X +Coalesce Num Partitions: 100 Num Hosts: 80 243 341 86 0.4 2428.1 1.6X +Coalesce Num Partitions: 500 Num Hosts: 1 574 647 88 0.2 5740.9 0.7X +Coalesce Num Partitions: 500 Num Hosts: 5 302 375 65 0.3 3018.6 1.3X +Coalesce Num Partitions: 500 Num Hosts: 10 263 305 68 0.4 2630.0 1.5X +Coalesce Num Partitions: 500 Num Hosts: 20 248 340 154 0.4 2483.9 1.5X +Coalesce Num Partitions: 500 Num Hosts: 40 293 333 68 0.3 2930.1 1.3X +Coalesce Num Partitions: 500 Num Hosts: 80 246 290 74 0.4 2461.0 1.6X +Coalesce Num Partitions: 1000 Num Hosts: 1 989 1019 50 0.1 9894.7 0.4X +Coalesce Num Partitions: 1000 Num Hosts: 5 381 426 76 0.3 3809.1 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 10 351 355 7 0.3 3508.8 1.1X +Coalesce Num Partitions: 1000 Num Hosts: 20 275 326 86 0.4 2752.7 1.4X +Coalesce Num Partitions: 1000 Num Hosts: 40 255 307 88 0.4 2552.8 1.5X +Coalesce Num Partitions: 1000 Num Hosts: 80 292 341 82 0.3 2923.2 1.3X +Coalesce Num Partitions: 5000 Num Hosts: 1 3598 3640 67 0.0 35981.4 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 923 954 47 0.1 9230.6 0.4X +Coalesce Num Partitions: 5000 Num Hosts: 10 615 689 124 0.2 6152.9 0.6X +Coalesce Num Partitions: 5000 Num Hosts: 20 428 480 88 0.2 4276.1 0.9X +Coalesce Num Partitions: 5000 Num Hosts: 40 362 414 78 0.3 3618.1 1.1X +Coalesce Num Partitions: 5000 Num Hosts: 80 289 351 105 0.3 2893.4 1.3X +Coalesce Num Partitions: 10000 Num Hosts: 1 7025 7073 73 0.0 70245.1 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 5 1841 1891 63 0.1 18407.3 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 10 1029 1070 55 0.1 10293.9 0.4X +Coalesce Num Partitions: 10000 Num Hosts: 20 650 701 81 0.2 6499.3 0.6X +Coalesce Num Partitions: 10000 Num Hosts: 40 459 512 91 0.2 4586.4 0.8X +Coalesce Num Partitions: 10000 Num Hosts: 80 365 465 87 0.3 3646.8 1.0X diff --git a/core/benchmarks/CoalescedRDDBenchmark-jdk17-results.txt b/core/benchmarks/CoalescedRDDBenchmark-jdk17-results.txt index f29636406de90..9b06c336bb8e9 100644 --- 
a/core/benchmarks/CoalescedRDDBenchmark-jdk17-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-jdk17-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 314 337 28 0.3 3142.0 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 224 226 3 0.4 2235.1 1.4X -Coalesce Num Partitions: 100 Num Hosts: 10 198 201 2 0.5 1982.9 1.6X -Coalesce Num Partitions: 100 Num Hosts: 20 194 206 16 0.5 1936.3 1.6X -Coalesce Num Partitions: 100 Num Hosts: 40 189 192 3 0.5 1893.6 1.7X -Coalesce Num Partitions: 100 Num Hosts: 80 229 232 4 0.4 2291.2 1.4X -Coalesce Num Partitions: 500 Num Hosts: 1 553 559 5 0.2 5533.2 0.6X -Coalesce Num Partitions: 500 Num Hosts: 5 276 281 7 0.4 2762.6 1.1X -Coalesce Num Partitions: 500 Num Hosts: 10 265 269 3 0.4 2648.8 1.2X -Coalesce Num Partitions: 500 Num Hosts: 20 214 231 19 0.5 2141.3 1.5X -Coalesce Num Partitions: 500 Num Hosts: 40 237 245 8 0.4 2365.4 1.3X -Coalesce Num Partitions: 500 Num Hosts: 80 206 220 19 0.5 2057.9 1.5X -Coalesce Num Partitions: 1000 Num Hosts: 1 846 879 55 0.1 8459.7 0.4X -Coalesce Num Partitions: 1000 Num Hosts: 5 310 317 9 0.3 3104.2 1.0X -Coalesce Num Partitions: 1000 Num Hosts: 10 269 282 20 0.4 2686.3 1.2X -Coalesce Num Partitions: 1000 Num Hosts: 20 234 236 1 0.4 2339.7 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 40 243 250 7 0.4 2431.1 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 80 206 209 2 0.5 2060.4 1.5X -Coalesce Num Partitions: 5000 Num Hosts: 1 3579 3582 4 0.0 35785.7 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 5 806 844 46 0.1 8059.4 0.4X -Coalesce Num Partitions: 5000 Num Hosts: 10 502 512 11 0.2 5023.4 0.6X -Coalesce Num Partitions: 5000 Num Hosts: 20 416 420 6 0.2 4160.6 0.8X -Coalesce Num Partitions: 5000 Num Hosts: 40 301 305 4 0.3 3013.1 1.0X -Coalesce Num Partitions: 5000 Num Hosts: 80 254 259 5 0.4 2541.9 1.2X -Coalesce Num Partitions: 10000 Num Hosts: 1 6490 6501 13 0.0 64904.9 0.0X -Coalesce Num Partitions: 10000 Num Hosts: 5 1830 1860 26 0.1 18301.2 0.2X -Coalesce Num Partitions: 10000 Num Hosts: 10 1022 1040 25 0.1 10218.7 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 20 677 679 2 0.1 6767.7 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 40 442 445 3 0.2 4420.7 0.7X -Coalesce Num Partitions: 10000 Num Hosts: 80 330 337 9 0.3 3303.2 1.0X +Coalesce Num Partitions: 100 Num Hosts: 1 298 348 64 0.3 2978.0 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 248 262 21 0.4 2483.7 1.2X +Coalesce Num Partitions: 100 Num Hosts: 10 266 284 22 0.4 2656.0 1.1X +Coalesce Num Partitions: 100 Num Hosts: 20 240 243 3 0.4 2404.1 1.2X +Coalesce Num Partitions: 100 Num Hosts: 40 267 273 6 0.4 2670.8 1.1X +Coalesce Num Partitions: 100 Num Hosts: 80 242 249 12 0.4 2424.5 1.2X +Coalesce Num Partitions: 500 Num Hosts: 1 593 594 2 0.2 5929.2 0.5X +Coalesce Num Partitions: 500 Num Hosts: 5 306 316 14 0.3 3063.3 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 295 302 6 0.3 2948.6 1.0X +Coalesce Num Partitions: 500 Num Hosts: 20 241 261 26 0.4 2406.6 1.2X +Coalesce Num 
Partitions: 500 Num Hosts: 40 258 260 4 0.4 2579.8 1.2X +Coalesce Num Partitions: 500 Num Hosts: 80 246 255 15 0.4 2456.2 1.2X +Coalesce Num Partitions: 1000 Num Hosts: 1 897 921 21 0.1 8966.6 0.3X +Coalesce Num Partitions: 1000 Num Hosts: 5 386 387 1 0.3 3860.9 0.8X +Coalesce Num Partitions: 1000 Num Hosts: 10 305 320 24 0.3 3045.2 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 271 283 17 0.4 2713.3 1.1X +Coalesce Num Partitions: 1000 Num Hosts: 40 293 296 4 0.3 2931.6 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 80 251 254 3 0.4 2509.2 1.2X +Coalesce Num Partitions: 5000 Num Hosts: 1 3287 3304 22 0.0 32871.0 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 879 885 7 0.1 8792.9 0.3X +Coalesce Num Partitions: 5000 Num Hosts: 10 597 599 3 0.2 5968.2 0.5X +Coalesce Num Partitions: 5000 Num Hosts: 20 402 416 16 0.2 4020.3 0.7X +Coalesce Num Partitions: 5000 Num Hosts: 40 315 323 8 0.3 3147.4 0.9X +Coalesce Num Partitions: 5000 Num Hosts: 80 271 281 13 0.4 2706.4 1.1X +Coalesce Num Partitions: 10000 Num Hosts: 1 6884 6902 17 0.0 68842.6 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 1809 1844 59 0.1 18088.5 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 10 1042 1057 20 0.1 10425.0 0.3X +Coalesce Num Partitions: 10000 Num Hosts: 20 622 637 13 0.2 6222.6 0.5X +Coalesce Num Partitions: 10000 Num Hosts: 40 468 473 5 0.2 4678.0 0.6X +Coalesce Num Partitions: 10000 Num Hosts: 80 352 360 7 0.3 3520.6 0.8X diff --git a/core/benchmarks/CoalescedRDDBenchmark-results.txt b/core/benchmarks/CoalescedRDDBenchmark-results.txt index e003b58a6458d..89d358ce3b4c3 100644 --- a/core/benchmarks/CoalescedRDDBenchmark-results.txt +++ b/core/benchmarks/CoalescedRDDBenchmark-results.txt @@ -2,39 +2,39 @@ Coalesced RDD , large scale ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Coalesced RDD: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Coalesce Num Partitions: 100 Num Hosts: 1 409 479 115 0.2 4091.1 1.0X -Coalesce Num Partitions: 100 Num Hosts: 5 295 312 28 0.3 2945.5 1.4X -Coalesce Num Partitions: 100 Num Hosts: 10 293 321 26 0.3 2932.2 1.4X -Coalesce Num Partitions: 100 Num Hosts: 20 284 298 25 0.4 2839.6 1.4X -Coalesce Num Partitions: 100 Num Hosts: 40 275 289 20 0.4 2745.5 1.5X -Coalesce Num Partitions: 100 Num Hosts: 80 257 272 21 0.4 2574.1 1.6X -Coalesce Num Partitions: 500 Num Hosts: 1 897 937 57 0.1 8965.2 0.5X -Coalesce Num Partitions: 500 Num Hosts: 5 386 405 17 0.3 3863.1 1.1X -Coalesce Num Partitions: 500 Num Hosts: 10 329 354 26 0.3 3288.6 1.2X -Coalesce Num Partitions: 500 Num Hosts: 20 323 334 16 0.3 3230.1 1.3X -Coalesce Num Partitions: 500 Num Hosts: 40 312 315 3 0.3 3117.4 1.3X -Coalesce Num Partitions: 500 Num Hosts: 80 306 325 17 0.3 3059.2 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 1 1505 1510 6 0.1 15047.4 0.3X -Coalesce Num Partitions: 1000 Num Hosts: 5 498 512 14 0.2 4984.4 0.8X -Coalesce Num Partitions: 1000 Num Hosts: 10 385 407 28 0.3 3850.7 1.1X -Coalesce Num Partitions: 1000 Num Hosts: 20 333 355 30 0.3 3332.9 1.2X -Coalesce Num Partitions: 1000 Num Hosts: 40 305 326 31 0.3 3052.4 1.3X -Coalesce Num Partitions: 1000 Num Hosts: 80 302 312 16 0.3 3024.8 1.4X 
-Coalesce Num Partitions: 5000 Num Hosts: 1 6444 6484 55 0.0 64443.7 0.1X -Coalesce Num Partitions: 5000 Num Hosts: 5 2014 2112 85 0.0 20141.6 0.2X -Coalesce Num Partitions: 5000 Num Hosts: 10 1181 1198 20 0.1 11805.9 0.3X -Coalesce Num Partitions: 5000 Num Hosts: 20 747 778 29 0.1 7471.8 0.5X -Coalesce Num Partitions: 5000 Num Hosts: 40 506 509 5 0.2 5058.4 0.8X -Coalesce Num Partitions: 5000 Num Hosts: 80 408 415 11 0.2 4082.7 1.0X -Coalesce Num Partitions: 10000 Num Hosts: 1 11825 11980 263 0.0 118254.8 0.0X -Coalesce Num Partitions: 10000 Num Hosts: 5 3905 3979 116 0.0 39050.3 0.1X -Coalesce Num Partitions: 10000 Num Hosts: 10 2116 2159 44 0.0 21159.9 0.2X -Coalesce Num Partitions: 10000 Num Hosts: 20 1199 1221 19 0.1 11992.1 0.3X -Coalesce Num Partitions: 10000 Num Hosts: 40 745 762 15 0.1 7447.8 0.5X -Coalesce Num Partitions: 10000 Num Hosts: 80 522 537 18 0.2 5218.0 0.8X +Coalesce Num Partitions: 100 Num Hosts: 1 235 247 21 0.4 2346.8 1.0X +Coalesce Num Partitions: 100 Num Hosts: 5 167 176 14 0.6 1669.7 1.4X +Coalesce Num Partitions: 100 Num Hosts: 10 163 163 1 0.6 1626.8 1.4X +Coalesce Num Partitions: 100 Num Hosts: 20 157 160 2 0.6 1571.1 1.5X +Coalesce Num Partitions: 100 Num Hosts: 40 155 168 16 0.6 1547.4 1.5X +Coalesce Num Partitions: 100 Num Hosts: 80 151 162 17 0.7 1508.3 1.6X +Coalesce Num Partitions: 500 Num Hosts: 1 594 608 22 0.2 5935.5 0.4X +Coalesce Num Partitions: 500 Num Hosts: 5 245 255 12 0.4 2448.8 1.0X +Coalesce Num Partitions: 500 Num Hosts: 10 198 214 24 0.5 1981.3 1.2X +Coalesce Num Partitions: 500 Num Hosts: 20 178 183 5 0.6 1784.0 1.3X +Coalesce Num Partitions: 500 Num Hosts: 40 161 168 7 0.6 1605.9 1.5X +Coalesce Num Partitions: 500 Num Hosts: 80 168 171 3 0.6 1682.5 1.4X +Coalesce Num Partitions: 1000 Num Hosts: 1 1018 1021 3 0.1 10182.6 0.2X +Coalesce Num Partitions: 1000 Num Hosts: 5 330 336 6 0.3 3296.7 0.7X +Coalesce Num Partitions: 1000 Num Hosts: 10 244 244 1 0.4 2437.6 1.0X +Coalesce Num Partitions: 1000 Num Hosts: 20 199 211 18 0.5 1989.6 1.2X +Coalesce Num Partitions: 1000 Num Hosts: 40 176 186 16 0.6 1758.0 1.3X +Coalesce Num Partitions: 1000 Num Hosts: 80 163 165 2 0.6 1626.6 1.4X +Coalesce Num Partitions: 5000 Num Hosts: 1 4264 4270 5 0.0 42644.7 0.1X +Coalesce Num Partitions: 5000 Num Hosts: 5 1016 1024 9 0.1 10155.1 0.2X +Coalesce Num Partitions: 5000 Num Hosts: 10 594 605 15 0.2 5940.8 0.4X +Coalesce Num Partitions: 5000 Num Hosts: 20 383 387 6 0.3 3827.4 0.6X +Coalesce Num Partitions: 5000 Num Hosts: 40 274 275 1 0.4 2743.5 0.9X +Coalesce Num Partitions: 5000 Num Hosts: 80 216 226 12 0.5 2159.3 1.1X +Coalesce Num Partitions: 10000 Num Hosts: 1 8451 8478 27 0.0 84505.5 0.0X +Coalesce Num Partitions: 10000 Num Hosts: 5 1919 1939 34 0.1 19185.3 0.1X +Coalesce Num Partitions: 10000 Num Hosts: 10 1053 1060 11 0.1 10533.1 0.2X +Coalesce Num Partitions: 10000 Num Hosts: 20 619 635 19 0.2 6187.0 0.4X +Coalesce Num Partitions: 10000 Num Hosts: 40 404 414 10 0.2 4036.4 0.6X +Coalesce Num Partitions: 10000 Num Hosts: 80 279 287 10 0.4 2785.6 0.8X diff --git a/core/benchmarks/KryoBenchmark-jdk11-results.txt b/core/benchmarks/KryoBenchmark-jdk11-results.txt index 1cf0a82b3f898..b2e95106d0dee 100644 --- a/core/benchmarks/KryoBenchmark-jdk11-results.txt +++ b/core/benchmarks/KryoBenchmark-jdk11-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 
8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 269 273 6 3.7 268.7 1.0X -basicTypes: Long with unsafe:true 296 301 7 3.4 296.3 0.9X -basicTypes: Float with unsafe:true 292 294 2 3.4 291.8 0.9X -basicTypes: Double with unsafe:true 296 299 3 3.4 296.0 0.9X -Array: Int with unsafe:true 4 4 0 270.1 3.7 72.6X -Array: Long with unsafe:true 6 6 0 168.4 5.9 45.2X -Array: Float with unsafe:true 4 4 0 275.2 3.6 73.9X -Array: Double with unsafe:true 6 6 0 164.1 6.1 44.1X -Map of string->Double with unsafe:true 41 41 1 24.6 40.6 6.6X -basicTypes: Int with unsafe:false 311 316 6 3.2 310.7 0.9X -basicTypes: Long with unsafe:false 344 347 2 2.9 344.5 0.8X -basicTypes: Float with unsafe:false 305 310 7 3.3 305.4 0.9X -basicTypes: Double with unsafe:false 311 313 1 3.2 311.1 0.9X -Array: Int with unsafe:false 24 24 0 42.4 23.6 11.4X -Array: Long with unsafe:false 34 34 0 29.7 33.7 8.0X -Array: Float with unsafe:false 10 10 0 101.8 9.8 27.3X -Array: Double with unsafe:false 15 15 1 67.6 14.8 18.2X -Map of string->Double with unsafe:false 41 42 1 24.3 41.2 6.5X +basicTypes: Int with unsafe:true 243 250 4 4.1 242.9 1.0X +basicTypes: Long with unsafe:true 281 283 2 3.6 280.9 0.9X +basicTypes: Float with unsafe:true 282 283 2 3.5 282.0 0.9X +basicTypes: Double with unsafe:true 289 290 1 3.5 289.2 0.8X +Array: Int with unsafe:true 3 3 0 343.7 2.9 83.5X +Array: Long with unsafe:true 4 5 0 229.3 4.4 55.7X +Array: Float with unsafe:true 3 3 0 343.5 2.9 83.5X +Array: Double with unsafe:true 4 5 0 229.2 4.4 55.7X +Map of string->Double with unsafe:true 36 37 0 27.7 36.1 6.7X +basicTypes: Int with unsafe:false 306 309 4 3.3 306.0 0.8X +basicTypes: Long with unsafe:false 323 325 1 3.1 323.3 0.8X +basicTypes: Float with unsafe:false 299 300 1 3.3 299.1 0.8X +basicTypes: Double with unsafe:false 313 315 1 3.2 313.4 0.8X +Array: Int with unsafe:false 20 20 0 50.5 19.8 12.3X +Array: Long with unsafe:false 29 30 0 34.1 29.4 8.3X +Array: Float with unsafe:false 8 8 0 130.4 7.7 31.7X +Array: Double with unsafe:false 13 13 0 75.0 13.3 18.2X +Map of string->Double with unsafe:false 39 39 0 25.8 38.8 6.3X diff --git a/core/benchmarks/KryoBenchmark-jdk17-results.txt b/core/benchmarks/KryoBenchmark-jdk17-results.txt index ea5a371228cd9..a8c208d87791a 100644 --- a/core/benchmarks/KryoBenchmark-jdk17-results.txt +++ b/core/benchmarks/KryoBenchmark-jdk17-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 271 277 9 3.7 271.3 1.0X -basicTypes: Long with unsafe:true 271 273 2 3.7 271.1 1.0X -basicTypes: Float with unsafe:true 264 266 1 3.8 264.5 1.0X -basicTypes: Double with unsafe:true 269 272 5 3.7 268.6 1.0X -Array: Int with 
unsafe:true 3 3 0 365.9 2.7 99.2X -Array: Long with unsafe:true 5 5 0 214.8 4.7 58.3X -Array: Float with unsafe:true 3 3 0 375.7 2.7 101.9X -Array: Double with unsafe:true 5 5 0 210.6 4.7 57.1X -Map of string->Double with unsafe:true 37 37 1 27.4 36.5 7.4X -basicTypes: Int with unsafe:false 286 287 1 3.5 285.7 0.9X -basicTypes: Long with unsafe:false 301 315 17 3.3 300.8 0.9X -basicTypes: Float with unsafe:false 279 294 17 3.6 278.6 1.0X -basicTypes: Double with unsafe:false 284 285 1 3.5 283.7 1.0X -Array: Int with unsafe:false 19 19 0 53.6 18.6 14.5X -Array: Long with unsafe:false 29 29 0 34.7 28.8 9.4X -Array: Float with unsafe:false 7 8 0 133.5 7.5 36.2X -Array: Double with unsafe:false 13 13 1 77.2 13.0 20.9X -Map of string->Double with unsafe:false 38 38 0 26.4 37.9 7.2X +basicTypes: Int with unsafe:true 261 265 3 3.8 260.7 1.0X +basicTypes: Long with unsafe:true 295 299 4 3.4 295.1 0.9X +basicTypes: Float with unsafe:true 286 288 3 3.5 285.6 0.9X +basicTypes: Double with unsafe:true 289 292 1 3.5 289.4 0.9X +Array: Int with unsafe:true 3 3 0 323.7 3.1 84.4X +Array: Long with unsafe:true 5 6 0 195.6 5.1 51.0X +Array: Float with unsafe:true 3 3 0 325.0 3.1 84.7X +Array: Double with unsafe:true 5 6 0 192.9 5.2 50.3X +Map of string->Double with unsafe:true 39 39 0 25.9 38.6 6.7X +basicTypes: Int with unsafe:false 303 305 2 3.3 303.1 0.9X +basicTypes: Long with unsafe:false 329 334 7 3.0 328.8 0.8X +basicTypes: Float with unsafe:false 303 307 3 3.3 303.5 0.9X +basicTypes: Double with unsafe:false 307 311 3 3.3 307.0 0.8X +Array: Int with unsafe:false 22 22 0 46.4 21.5 12.1X +Array: Long with unsafe:false 31 32 0 31.8 31.4 8.3X +Array: Float with unsafe:false 8 9 0 119.0 8.4 31.0X +Array: Double with unsafe:false 15 15 0 68.4 14.6 17.8X +Map of string->Double with unsafe:false 40 40 1 25.2 39.6 6.6X diff --git a/core/benchmarks/KryoBenchmark-results.txt b/core/benchmarks/KryoBenchmark-results.txt index 563b9699f4f2d..21161e5943c66 100644 --- a/core/benchmarks/KryoBenchmark-results.txt +++ b/core/benchmarks/KryoBenchmark-results.txt @@ -2,27 +2,27 @@ Benchmark Kryo Unsafe vs safe Serialization ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark Kryo Unsafe vs safe Serialization: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -basicTypes: Int with unsafe:true 276 281 6 3.6 276.0 1.0X -basicTypes: Long with unsafe:true 309 320 8 3.2 309.4 0.9X -basicTypes: Float with unsafe:true 301 305 5 3.3 300.6 0.9X -basicTypes: Double with unsafe:true 304 307 3 3.3 303.7 0.9X -Array: Int with unsafe:true 4 5 1 225.2 4.4 62.2X -Array: Long with unsafe:true 7 7 1 140.7 7.1 38.8X -Array: Float with unsafe:true 4 4 0 235.0 4.3 64.9X -Array: Double with unsafe:true 7 8 1 137.2 7.3 37.9X -Map of string->Double with unsafe:true 52 52 1 19.2 52.1 5.3X -basicTypes: Int with unsafe:false 313 317 4 3.2 312.9 0.9X -basicTypes: Long with unsafe:false 342 348 4 2.9 342.3 0.8X -basicTypes: Float with unsafe:false 306 310 4 3.3 306.1 0.9X -basicTypes: Double with unsafe:false 319 323 6 3.1 318.8 0.9X -Array: Int with unsafe:false 27 27 1 37.7 26.5 10.4X -Array: Long with unsafe:false 41 41 1 24.6 
40.7 6.8X -Array: Float with unsafe:false 12 12 0 83.7 11.9 23.1X -Array: Double with unsafe:false 19 19 1 52.6 19.0 14.5X -Map of string->Double with unsafe:false 54 54 1 18.6 53.8 5.1X +basicTypes: Int with unsafe:true 222 227 9 4.5 222.2 1.0X +basicTypes: Long with unsafe:true 247 252 5 4.1 246.6 0.9X +basicTypes: Float with unsafe:true 242 245 4 4.1 241.7 0.9X +basicTypes: Double with unsafe:true 245 247 2 4.1 244.6 0.9X +Array: Int with unsafe:true 4 5 0 228.9 4.4 50.9X +Array: Long with unsafe:true 8 8 1 128.7 7.8 28.6X +Array: Float with unsafe:true 4 5 0 251.7 4.0 55.9X +Array: Double with unsafe:true 8 8 0 126.9 7.9 28.2X +Map of string->Double with unsafe:true 42 43 1 23.6 42.3 5.2X +basicTypes: Int with unsafe:false 262 263 2 3.8 261.7 0.8X +basicTypes: Long with unsafe:false 283 286 2 3.5 282.7 0.8X +basicTypes: Float with unsafe:false 259 260 2 3.9 259.3 0.9X +basicTypes: Double with unsafe:false 261 264 2 3.8 261.5 0.8X +Array: Int with unsafe:false 25 25 0 40.3 24.8 9.0X +Array: Long with unsafe:false 33 33 0 30.4 32.9 6.8X +Array: Float with unsafe:false 10 11 0 98.5 10.2 21.9X +Array: Double with unsafe:false 17 17 0 60.1 16.6 13.3X +Map of string->Double with unsafe:false 44 44 1 22.9 43.6 5.1X diff --git a/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt b/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt index 39d96dbdbea13..e3782c70a6aaa 100644 --- a/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-jdk11-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 8341 10252 NaN 0.0 16681305.6 1.0X -KryoPool:false 13337 15481 NaN 0.0 26673338.8 0.6X +KryoPool:true 10198 12788 404 0.0 20396051.3 1.0X +KryoPool:false 14108 16412 743 0.0 28215846.4 0.7X diff --git a/core/benchmarks/KryoSerializerBenchmark-jdk17-results.txt b/core/benchmarks/KryoSerializerBenchmark-jdk17-results.txt index 15d873500aec9..83d576b2aed0c 100644 --- a/core/benchmarks/KryoSerializerBenchmark-jdk17-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-jdk17-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 9650 11973 NaN 0.0 19300202.2 1.0X -KryoPool:false 14016 17091 NaN 0.0 28031247.1 0.7X +KryoPool:true 7626 9968 762 0.0 15251792.5 1.0X +KryoPool:false 10017 12864 NaN 0.0 20034768.2 0.8X diff --git 
a/core/benchmarks/KryoSerializerBenchmark-results.txt b/core/benchmarks/KryoSerializerBenchmark-results.txt index 6f75e0e9b79de..09b4faf05131c 100644 --- a/core/benchmarks/KryoSerializerBenchmark-results.txt +++ b/core/benchmarks/KryoSerializerBenchmark-results.txt @@ -2,11 +2,11 @@ Benchmark KryoPool vs old"pool of 1" implementation ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Benchmark KryoPool vs old"pool of 1" implementation: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ----------------------------------------------------------------------------------------------------------------------------------- -KryoPool:true 7305 9327 NaN 0.0 14610208.1 1.0X -KryoPool:false 11487 14654 573 0.0 22973501.4 0.6X +KryoPool:true 7098 8972 NaN 0.0 14196810.5 1.0X +KryoPool:false 10232 11945 744 0.0 20464754.5 0.7X diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt new file mode 100644 index 0000000000000..4d9b00d4078fc --- /dev/null +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk11-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +MapStatuses Convert Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1030-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Num Maps: 50000 Fetch partitions:500 1193 1256 60 0.0 1193082590.0 1.0X +Num Maps: 50000 Fetch partitions:1000 2590 2626 59 0.0 2590497762.0 0.5X +Num Maps: 50000 Fetch partitions:1500 3800 4011 183 0.0 3799891103.0 0.3X + + diff --git a/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt new file mode 100644 index 0000000000000..b52d96dbdc2bd --- /dev/null +++ b/core/benchmarks/MapStatusesConvertBenchmark-jdk17-results.txt @@ -0,0 +1,13 @@ +================================================================================================ +MapStatuses Convert Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1030-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Num Maps: 50000 Fetch partitions:500 1116 1126 9 0.0 1115840641.0 1.0X +Num Maps: 50000 Fetch partitions:1000 2325 2330 7 0.0 2324597259.0 0.5X +Num Maps: 50000 Fetch partitions:1500 3477 3516 37 0.0 3476784419.0 0.3X + + diff --git a/core/benchmarks/MapStatusesConvertBenchmark-results.txt b/core/benchmarks/MapStatusesConvertBenchmark-results.txt new file mode 100644 index 0000000000000..f47b2e33a06f8 --- /dev/null +++ b/core/benchmarks/MapStatusesConvertBenchmark-results.txt @@ -0,0 +1,13 @@ 
+================================================================================================ +MapStatuses Convert Benchmark +================================================================================================ + +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1030-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz +MapStatuses Convert: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative +------------------------------------------------------------------------------------------------------------------------ +Num Maps: 50000 Fetch partitions:500 1046 1051 5 0.0 1045588914.0 1.0X +Num Maps: 50000 Fetch partitions:1000 2038 2072 41 0.0 2038116226.0 0.5X +Num Maps: 50000 Fetch partitions:1500 3208 3440 378 0.0 3207647789.0 0.3X + + diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt index bd3ad0ed6ed5f..23fb75bb4fb14 100644 --- a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk11-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Serialization 169 174 4 1.2 847.1 1.0X -Deserialization 209 274 61 1.0 1043.6 0.8X +Serialization 156 163 5 1.3 778.2 1.0X +Deserialization 208 304 104 1.0 1038.3 0.7X -Compressed Serialized MapStatus sizes: 410 bytes +Compressed Serialized MapStatus sizes: 427 bytes Compressed Serialized Broadcast MapStatus sizes: 2 MB -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 140 147 9 1.4 701.5 1.0X -Deserialization 206 267 87 1.0 1031.0 0.7X +Serialization 127 134 7 1.6 635.9 1.0X +Deserialization 204 271 80 1.0 1019.6 0.6X Compressed Serialized MapStatus sizes: 2 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 280 296 9 0.7 1399.4 1.0X -Deserialization 237 313 86 0.8 1182.6 1.2X +Serialization 252 266 17 0.8 1258.6 1.0X +Deserialization 255 362 153 0.8 1275.8 1.0X -Compressed Serialized MapStatus sizes: 429 bytes +Compressed Serialized MapStatus sizes: 445 bytes Compressed Serialized Broadcast MapStatus sizes: 13 MB -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 241 249 9 0.8 1203.6 1.0X -Deserialization 240 381 128 0.8 1201.1 1.0X +Serialization 231 244 20 0.9 1154.5 1.0X +Deserialization 261 355 130 0.8 1306.5 0.9X Compressed Serialized MapStatus sizes: 13 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 1205 1235 42 0.2 6025.9 1.0X -Deserialization 631 673 36 0.3 3154.0 1.9X +Serialization 1128 1141 18 0.2 5641.9 1.0X +Deserialization 586 636 54 0.3 2932.2 1.9X -Compressed Serialized MapStatus sizes: 554 bytes +Compressed Serialized MapStatus sizes: 571 bytes Compressed Serialized Broadcast MapStatus sizes: 121 MB -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 1037 1043 7 0.2 5187.1 1.0X -Deserialization 621 648 37 0.3 3104.6 1.7X +Serialization 1062 1098 51 0.2 5310.3 1.0X +Deserialization 580 615 26 0.3 2901.3 1.8X Compressed Serialized MapStatus sizes: 121 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk17-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk17-results.txt index 54963df994352..d1a1be3df6dec 100644 --- a/core/benchmarks/MapStatusesSerDeserBenchmark-jdk17-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-jdk17-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------- -Serialization 142 151 7 1.4 709.7 1.0X -Deserialization 236 261 34 0.8 1178.1 0.6X +Serialization 169 180 8 1.2 847.0 1.0X +Deserialization 272 305 52 0.7 1362.0 0.6X -Compressed Serialized MapStatus sizes: 410 bytes +Compressed Serialized MapStatus sizes: 427 bytes Compressed Serialized Broadcast MapStatus sizes: 2 MB -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 135 139 3 1.5 675.1 1.0X -Deserialization 258 260 2 0.8 1288.2 0.5X +Serialization 147 160 6 1.4 735.2 1.0X +Deserialization 264 279 20 0.8 1319.6 0.6X Compressed Serialized MapStatus sizes: 2 MB Compressed 
Serialized Broadcast MapStatus sizes: 0 bytes -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 266 273 13 0.8 1329.7 1.0X -Deserialization 261 281 19 0.8 1306.2 1.0X +Serialization 307 333 33 0.7 1533.6 1.0X +Deserialization 297 322 23 0.7 1484.4 1.0X -Compressed Serialized MapStatus sizes: 429 bytes +Compressed Serialized MapStatus sizes: 445 bytes Compressed Serialized Broadcast MapStatus sizes: 13 MB -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 251 256 4 0.8 1257.5 1.0X -Deserialization 263 278 23 0.8 1317.0 1.0X +Serialization 286 291 5 0.7 1427.5 1.0X +Deserialization 328 350 37 0.6 1637.8 0.9X Compressed Serialized MapStatus sizes: 13 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 1243 1248 7 0.2 6215.7 1.0X -Deserialization 469 534 69 0.4 2344.6 2.7X +Serialization 1296 1339 61 0.2 6479.8 1.0X +Deserialization 535 563 34 0.4 2677.0 2.4X -Compressed Serialized MapStatus sizes: 556 bytes +Compressed Serialized MapStatus sizes: 571 bytes Compressed Serialized Broadcast MapStatus sizes: 121 MB -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 1006 1035 41 0.2 5029.6 1.0X -Deserialization 479 488 14 0.4 2394.8 2.1X +Serialization 1160 1190 42 0.2 5801.2 1.0X +Deserialization 519 532 16 0.4 2596.4 2.2X Compressed Serialized MapStatus sizes: 121 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes diff --git a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt index f800ca0a71b6b..3d67dc886c5de 100644 --- a/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt +++ b/core/benchmarks/MapStatusesSerDeserBenchmark-results.txt @@ -1,64 +1,64 @@ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz 200000 MapOutputs, 10 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) 
Relative ------------------------------------------------------------------------------------------------------------------------- -Serialization 144 151 10 1.4 722.4 1.0X -Deserialization 229 254 33 0.9 1145.8 0.6X +Serialization 161 165 7 1.2 805.0 1.0X +Deserialization 274 346 132 0.7 1367.8 0.6X -Compressed Serialized MapStatus sizes: 410 bytes +Compressed Serialized MapStatus sizes: 427 bytes Compressed Serialized Broadcast MapStatus sizes: 2 MB -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz 200000 MapOutputs, 10 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 137 139 2 1.5 686.3 1.0X -Deserialization 229 243 24 0.9 1143.4 0.6X +Serialization 156 158 2 1.3 780.6 1.0X +Deserialization 260 269 19 0.8 1300.7 0.6X Compressed Serialized MapStatus sizes: 2 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz 200000 MapOutputs, 100 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------- -Serialization 271 312 104 0.7 1354.6 1.0X -Deserialization 254 270 27 0.8 1270.6 1.1X +Serialization 296 350 118 0.7 1482.4 1.0X +Deserialization 277 291 18 0.7 1383.0 1.1X -Compressed Serialized MapStatus sizes: 429 bytes +Compressed Serialized MapStatus sizes: 445 bytes Compressed Serialized Broadcast MapStatus sizes: 13 MB -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz 200000 MapOutputs, 100 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 255 257 3 0.8 1275.2 1.0X -Deserialization 254 269 27 0.8 1268.0 1.0X +Serialization 274 284 6 0.7 1369.0 1.0X +Deserialization 278 294 22 0.7 1390.2 1.0X Compressed Serialized MapStatus sizes: 13 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz 200000 MapOutputs, 1000 blocks w/ broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative --------------------------------------------------------------------------------------------------------------------------- -Serialization 1150 1546 561 0.2 5750.3 1.0X -Deserialization 469 522 60 0.4 2342.8 2.5X +Serialization 1287 1645 506 0.2 6434.5 1.0X +Deserialization 530 573 48 0.4 2650.2 2.4X -Compressed Serialized MapStatus sizes: 556 bytes +Compressed Serialized MapStatus sizes: 571 bytes Compressed Serialized Broadcast MapStatus sizes: 121 MB -OpenJDK 64-Bit Server VM 
1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8171M CPU @ 2.60GHz 200000 MapOutputs, 1000 blocks w/o broadcast: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ---------------------------------------------------------------------------------------------------------------------------- -Serialization 1036 1040 6 0.2 5182.2 1.0X -Deserialization 465 493 26 0.4 2326.3 2.2X +Serialization 1114 1124 15 0.2 5568.6 1.0X +Deserialization 512 552 46 0.4 2557.6 2.2X Compressed Serialized MapStatus sizes: 121 MB Compressed Serialized Broadcast MapStatus sizes: 0 bytes diff --git a/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt b/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt index 3f49be9f9251b..d90aee325ff39 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-jdk11-results.txt @@ -2,39 +2,39 @@ Properties Cloning ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.2 5400.0 1.0X +SerializationUtils.clone 0 0 0 0.2 4900.0 1.0X Utils.cloneProperties 0 0 0 Infinity 0.0 InfinityX -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 216703.0 1.0X -Utils.cloneProperties 0 0 0 0.2 5800.0 37.4X +SerializationUtils.clone 0 0 0 0.0 216802.0 1.0X +Utils.cloneProperties 0 0 0 0.2 5400.0 40.1X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 0 0.0 565907.0 1.0X -Utils.cloneProperties 0 0 0 0.2 5100.0 111.0X +SerializationUtils.clone 1 1 0 0.0 557203.0 1.0X +Utils.cloneProperties 0 0 0 0.3 3800.0 146.6X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 2 3 0 0.0 2114325.0 1.0X -Utils.cloneProperties 0 0 0 0.0 28700.0 73.7X +SerializationUtils.clone 2 2 0 0.0 2227113.0 1.0X 
+Utils.cloneProperties 0 0 0 0.0 22500.0 99.0X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 4 5 1 0.0 4373951.0 1.0X -Utils.cloneProperties 0 0 0 0.0 57800.0 75.7X +SerializationUtils.clone 4 4 0 0.0 4311827.0 1.0X +Utils.cloneProperties 0 0 0 0.0 45500.0 94.8X diff --git a/core/benchmarks/PropertiesCloneBenchmark-jdk17-results.txt b/core/benchmarks/PropertiesCloneBenchmark-jdk17-results.txt index 0420aced19c61..bf25c0d63da0d 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-jdk17-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-jdk17-results.txt @@ -2,39 +2,39 @@ Properties Cloning ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.2 4600.0 1.0X +SerializationUtils.clone 0 0 0 0.2 4700.0 1.0X Utils.cloneProperties 0 0 0 Infinity 0.0 InfinityX -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 1 0.0 191601.0 1.0X -Utils.cloneProperties 0 0 0 0.2 5600.0 34.2X +SerializationUtils.clone 0 0 0 0.0 197602.0 1.0X +Utils.cloneProperties 0 0 0 0.1 7000.0 28.2X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 1 0.0 636805.0 1.0X -Utils.cloneProperties 0 0 0 0.2 5200.0 122.5X +SerializationUtils.clone 1 1 0 0.0 566404.0 1.0X +Utils.cloneProperties 0 0 0 0.3 3400.0 166.6X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 3 3 1 0.0 2534717.0 1.0X -Utils.cloneProperties 0 0 0 0.0 29000.0 87.4X +SerializationUtils.clone 2 2 0 0.0 2302915.0 1.0X +Utils.cloneProperties 0 0 0 0.0 20300.0 113.4X 
-OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 5 6 2 0.0 5125031.0 1.0X -Utils.cloneProperties 0 0 0 0.0 61401.0 83.5X +SerializationUtils.clone 4 5 0 0.0 4454330.0 1.0X +Utils.cloneProperties 0 0 0 0.0 41400.0 107.6X diff --git a/core/benchmarks/PropertiesCloneBenchmark-results.txt b/core/benchmarks/PropertiesCloneBenchmark-results.txt index 13f5abae39b5a..85aecd34792db 100644 --- a/core/benchmarks/PropertiesCloneBenchmark-results.txt +++ b/core/benchmarks/PropertiesCloneBenchmark-results.txt @@ -2,39 +2,39 @@ Properties Cloning ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Empty Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.2 5400.0 1.0X +SerializationUtils.clone 0 0 0 0.2 5599.0 1.0X Utils.cloneProperties 0 0 0 Infinity 0.0 InfinityX -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz System Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 0 0 0 0.0 398906.0 1.0X -Utils.cloneProperties 0 0 0 0.5 2200.0 181.3X +SerializationUtils.clone 0 0 0 0.0 272997.0 1.0X +Utils.cloneProperties 0 0 0 0.9 1099.0 248.4X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Small Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 1 1 0 0.0 594008.0 1.0X -Utils.cloneProperties 0 0 0 0.2 5400.0 110.0X +SerializationUtils.clone 1 1 0 0.0 538293.0 1.0X +Utils.cloneProperties 0 0 0 0.2 4299.0 125.2X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Medium Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 2 2 0 0.0 2353934.0 1.0X -Utils.cloneProperties 0 0 0 0.0 26200.0 89.8X +SerializationUtils.clone 2 2 0 0.0 2328670.0 1.0X +Utils.cloneProperties 0 0 0 0.0 21099.0 110.4X -OpenJDK 64-Bit Server VM 
1.8.0_312-b07 on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Large Properties: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -SerializationUtils.clone 5 5 0 0.0 4513466.0 1.0X -Utils.cloneProperties 0 0 0 0.0 51300.0 88.0X +SerializationUtils.clone 5 5 0 0.0 4522650.0 1.0X +Utils.cloneProperties 0 0 0 0.0 41300.0 109.5X diff --git a/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt b/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt index f35ec9218e80c..2661ff79f20db 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-jdk11-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 1034 1034 0 96.7 10.3 1.0X -XORShiftRandom 177 177 0 564.8 1.8 5.8X +java.util.Random 1366 1384 26 73.2 13.7 1.0X +XORShiftRandom 227 233 5 439.6 2.3 6.0X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 2071 2071 1 48.3 20.7 1.0X -XORShiftRandom 446 447 0 224.1 4.5 4.6X +java.util.Random 2711 2770 52 36.9 27.1 1.0X +XORShiftRandom 630 640 9 158.8 6.3 4.3X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 2067 2068 0 48.4 20.7 1.0X -XORShiftRandom 443 443 0 225.7 4.4 4.7X +java.util.Random 2656 2684 24 37.6 26.6 1.0X +XORShiftRandom 624 631 6 160.2 6.2 4.3X -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 5508 5509 1 18.2 55.1 1.0X -XORShiftRandom 3952 3953 2 25.3 39.5 1.4X +java.util.Random 7008 7241 260 14.3 70.1 1.0X +XORShiftRandom 5546 5621 121 18.0 55.5 1.3X ================================================================================================ hash seed 
================================================================================================ -OpenJDK 64-Bit Server VM 11.0.13+8-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v3 @ 2.40GHz Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 40 56 25 249.3 4.0 1.0X +XORShiftRandom.hashSeed 39 41 2 255.4 3.9 1.0X diff --git a/core/benchmarks/XORShiftRandomBenchmark-jdk17-results.txt b/core/benchmarks/XORShiftRandomBenchmark-jdk17-results.txt index 623416e375092..8d572c0ce9bb8 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-jdk17-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-jdk17-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 1172 1173 0 85.3 11.7 1.0X -XORShiftRandom 202 202 0 495.3 2.0 5.8X +java.util.Random 1361 1376 16 73.5 13.6 1.0X +XORShiftRandom 228 235 11 438.8 2.3 6.0X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 2345 2346 2 42.6 23.4 1.0X -XORShiftRandom 502 502 0 199.2 5.0 4.7X +java.util.Random 2807 2836 26 35.6 28.1 1.0X +XORShiftRandom 667 679 11 149.9 6.7 4.2X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 2344 2344 0 42.7 23.4 1.0X -XORShiftRandom 502 502 0 199.2 5.0 4.7X +java.util.Random 2751 2774 26 36.4 27.5 1.0X +XORShiftRandom 646 658 11 154.7 6.5 4.3X -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 7816 7817 1 12.8 78.2 1.0X -XORShiftRandom 4452 4453 0 22.5 44.5 1.8X +java.util.Random 8671 8676 9 11.5 86.7 1.0X +XORShiftRandom 4778 4851 94 20.9 47.8 1.8X ================================================================================================ hash seed 
================================================================================================ -OpenJDK 64-Bit Server VM 17.0.1+12-LTS on Linux 5.11.0-1022-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 1 1 0 11965.9 0.1 1.0X +XORShiftRandom.hashSeed 1 1 0 10149.1 0.1 1.0X diff --git a/core/benchmarks/XORShiftRandomBenchmark-results.txt b/core/benchmarks/XORShiftRandomBenchmark-results.txt index a582b50292607..06e82f02195bf 100644 --- a/core/benchmarks/XORShiftRandomBenchmark-results.txt +++ b/core/benchmarks/XORShiftRandomBenchmark-results.txt @@ -2,43 +2,43 @@ Pseudo random ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz nextInt: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 1139 1140 1 87.8 11.4 1.0X -XORShiftRandom 201 201 0 498.0 2.0 5.7X +java.util.Random 1138 1142 3 87.9 11.4 1.0X +XORShiftRandom 201 201 0 498.1 2.0 5.7X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz nextLong: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 2343 2343 0 42.7 23.4 1.0X -XORShiftRandom 535 535 0 186.8 5.4 4.4X +java.util.Random 2348 2349 1 42.6 23.5 1.0X +XORShiftRandom 502 503 1 199.3 5.0 4.7X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz nextDouble: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 2343 2343 0 42.7 23.4 1.0X -XORShiftRandom 541 541 0 184.9 5.4 4.3X +java.util.Random 2344 2346 2 42.7 23.4 1.0X +XORShiftRandom 502 502 0 199.2 5.0 4.7X -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz nextGaussian: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -java.util.Random 6264 6264 1 16.0 62.6 1.0X -XORShiftRandom 4609 4611 2 21.7 46.1 1.4X +java.util.Random 6231 6237 5 16.0 62.3 1.0X +XORShiftRandom 4476 4476 0 22.3 44.8 1.4X ================================================================================================ hash seed ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_312-b07 on Linux 5.11.0-1022-azure +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on 
Linux 5.15.0-1023-azure Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Hash seed: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------ -XORShiftRandom.hashSeed 62 63 1 160.3 6.2 1.0X +XORShiftRandom.hashSeed 62 63 2 160.6 6.2 1.0X diff --git a/core/benchmarks/ZStandardBenchmark-jdk11-results.txt b/core/benchmarks/ZStandardBenchmark-jdk11-results.txt index 53c9299e84366..268f64d7d8cbc 100644 --- a/core/benchmarks/ZStandardBenchmark-jdk11-results.txt +++ b/core/benchmarks/ZStandardBenchmark-jdk11-results.txt @@ -2,26 +2,26 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 584 604 15 0.0 58407.5 1.0X -Compression 10000 times at level 2 without buffer pool 654 665 11 0.0 65444.9 0.9X -Compression 10000 times at level 3 without buffer pool 907 916 8 0.0 90677.0 0.6X -Compression 10000 times at level 1 with buffer pool 674 686 11 0.0 67437.9 0.9X -Compression 10000 times at level 2 with buffer pool 759 769 10 0.0 75916.2 0.8X -Compression 10000 times at level 3 with buffer pool 1006 1017 16 0.0 100600.2 0.6X +Compression 10000 times at level 1 without buffer pool 859 872 21 0.0 85890.3 1.0X +Compression 10000 times at level 2 without buffer pool 930 932 2 0.0 92995.6 0.9X +Compression 10000 times at level 3 without buffer pool 1137 1138 2 0.0 113664.6 0.8X +Compression 10000 times at level 1 with buffer pool 662 664 1 0.0 66244.7 1.3X +Compression 10000 times at level 2 with buffer pool 725 726 1 0.0 72541.4 1.2X +Compression 10000 times at level 3 with buffer pool 929 930 2 0.0 92851.4 0.9X -OpenJDK 64-Bit Server VM 11.0.14+9-LTS on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 11.0.17+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 693 698 9 0.0 69257.4 1.0X -Decompression 10000 times from level 2 without buffer pool 699 707 7 0.0 69857.8 1.0X -Decompression 10000 times from level 3 without buffer pool 689 697 7 0.0 68858.9 1.0X -Decompression 10000 times from level 1 with buffer pool 450 476 37 0.0 45005.9 1.5X -Decompression 10000 times from level 2 with buffer pool 527 550 26 0.0 52653.2 1.3X -Decompression 10000 times from level 3 with buffer pool 452 513 43 0.0 45201.4 1.5X +Decompression 10000 times from level 1 without buffer pool 1001 1002 1 0.0 100140.5 1.0X +Decompression 10000 times from level 2 without buffer pool 1003 1004 2 0.0 100270.9 1.0X +Decompression 10000 times from level 3 without buffer pool 1002 1002 1 0.0 100172.1 1.0X +Decompression 10000 times from level 1 with buffer 
pool 895 896 1 0.0 89525.2 1.1X +Decompression 10000 times from level 2 with buffer pool 895 896 1 0.0 89524.7 1.1X +Decompression 10000 times from level 3 with buffer pool 894 895 1 0.0 89423.5 1.1X diff --git a/core/benchmarks/ZStandardBenchmark-jdk17-results.txt b/core/benchmarks/ZStandardBenchmark-jdk17-results.txt index c6d84b79cb29c..341db6b993b1c 100644 --- a/core/benchmarks/ZStandardBenchmark-jdk17-results.txt +++ b/core/benchmarks/ZStandardBenchmark-jdk17-results.txt @@ -2,26 +2,26 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 2380 2426 65 0.0 238014.5 1.0X -Compression 10000 times at level 2 without buffer pool 1532 2271 1045 0.0 153222.7 1.6X -Compression 10000 times at level 3 without buffer pool 1746 1757 15 0.0 174619.0 1.4X -Compression 10000 times at level 1 with buffer pool 1177 1178 2 0.0 117681.3 2.0X -Compression 10000 times at level 2 with buffer pool 1267 1273 8 0.0 126719.0 1.9X -Compression 10000 times at level 3 with buffer pool 1517 1603 122 0.0 151729.8 1.6X +Compression 10000 times at level 1 without buffer pool 2316 2316 0 0.0 231553.5 1.0X +Compression 10000 times at level 2 without buffer pool 2231 2306 107 0.0 223095.1 1.0X +Compression 10000 times at level 3 without buffer pool 2436 2438 4 0.0 243551.0 1.0X +Compression 10000 times at level 1 with buffer pool 2064 2065 1 0.0 206377.9 1.1X +Compression 10000 times at level 2 with buffer pool 2133 2134 3 0.0 213253.8 1.1X +Compression 10000 times at level 3 with buffer pool 2320 2321 2 0.0 231978.8 1.0X -OpenJDK 64-Bit Server VM 17.0.2+8-LTS on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) CPU E5-2673 v4 @ 2.30GHz +OpenJDK 64-Bit Server VM 17.0.5+8 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 2241 2271 42 0.0 224123.2 1.0X -Decompression 10000 times from level 2 without buffer pool 2210 2253 62 0.0 220980.7 1.0X -Decompression 10000 times from level 3 without buffer pool 2220 2228 12 0.0 221964.2 1.0X -Decompression 10000 times from level 1 with buffer pool 1987 1995 12 0.0 198705.4 1.1X -Decompression 10000 times from level 2 with buffer pool 1966 1968 4 0.0 196572.3 1.1X -Decompression 10000 times from level 3 with buffer pool 1983 1991 11 0.0 198277.7 1.1X +Decompression 10000 times from level 1 without buffer pool 2135 2136 3 0.0 213458.1 1.0X +Decompression 10000 times from level 2 without buffer pool 2133 2138 6 0.0 213310.9 1.0X +Decompression 10000 times from level 3 without buffer pool 2127 2131 5 0.0 212738.2 1.0X +Decompression 10000 times from level 1 with buffer pool 1956 1958 2 0.0 195628.1 1.1X +Decompression 10000 times from level 2 with buffer pool 1957 1958 0 0.0 195735.7 
1.1X +Decompression 10000 times from level 3 with buffer pool 1955 1956 2 0.0 195504.1 1.1X diff --git a/core/benchmarks/ZStandardBenchmark-results.txt b/core/benchmarks/ZStandardBenchmark-results.txt index 5de6d182fa6de..01c4b667ad6db 100644 --- a/core/benchmarks/ZStandardBenchmark-results.txt +++ b/core/benchmarks/ZStandardBenchmark-results.txt @@ -2,26 +2,26 @@ Benchmark ZStandardCompressionCodec ================================================================================================ -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative -------------------------------------------------------------------------------------------------------------------------------------- -Compression 10000 times at level 1 without buffer pool 633 774 122 0.0 63315.3 1.0X -Compression 10000 times at level 2 without buffer pool 748 749 2 0.0 74771.7 0.8X -Compression 10000 times at level 3 without buffer pool 945 949 7 0.0 94461.5 0.7X -Compression 10000 times at level 1 with buffer pool 287 289 2 0.0 28703.6 2.2X -Compression 10000 times at level 2 with buffer pool 336 342 3 0.0 33641.3 1.9X -Compression 10000 times at level 3 with buffer pool 517 528 8 0.0 51747.9 1.2X +Compression 10000 times at level 1 without buffer pool 368 370 2 0.0 36754.6 1.0X +Compression 10000 times at level 2 without buffer pool 409 410 1 0.0 40864.4 0.9X +Compression 10000 times at level 3 without buffer pool 547 548 1 0.0 54662.2 0.7X +Compression 10000 times at level 1 with buffer pool 255 257 2 0.0 25517.0 1.4X +Compression 10000 times at level 2 with buffer pool 296 298 1 0.0 29590.6 1.2X +Compression 10000 times at level 3 with buffer pool 426 428 2 0.0 42609.7 0.9X -OpenJDK 64-Bit Server VM 1.8.0_322-b06 on Linux 5.11.0-1027-azure -Intel(R) Xeon(R) Platinum 8272CL CPU @ 2.60GHz +OpenJDK 64-Bit Server VM 1.8.0_352-b08 on Linux 5.15.0-1023-azure +Intel(R) Xeon(R) Platinum 8370C CPU @ 2.80GHz Benchmark ZStandardCompressionCodec: Best Time(ms) Avg Time(ms) Stdev(ms) Rate(M/s) Per Row(ns) Relative ------------------------------------------------------------------------------------------------------------------------------------------ -Decompression 10000 times from level 1 without buffer pool 683 689 9 0.0 68294.8 1.0X -Decompression 10000 times from level 2 without buffer pool 684 685 1 0.0 68441.8 1.0X -Decompression 10000 times from level 3 without buffer pool 684 685 1 0.0 68446.7 1.0X -Decompression 10000 times from level 1 with buffer pool 494 495 2 0.0 49362.5 1.4X -Decompression 10000 times from level 2 with buffer pool 493 495 2 0.0 49330.7 1.4X -Decompression 10000 times from level 3 with buffer pool 494 497 5 0.0 49359.8 1.4X +Decompression 10000 times from level 1 without buffer pool 545 547 1 0.0 54546.0 1.0X +Decompression 10000 times from level 2 without buffer pool 546 548 2 0.0 54610.1 1.0X +Decompression 10000 times from level 3 without buffer pool 549 551 1 0.0 54863.6 1.0X +Decompression 10000 times from level 1 with buffer pool 435 437 1 0.0 43517.5 1.3X +Decompression 10000 times from level 2 with buffer pool 435 437 1 0.0 43524.7 1.3X +Decompression 10000 times from level 3 with buffer pool 436 437 1 0.0 43591.5 1.3X diff --git a/core/pom.xml b/core/pom.xml index 0711ecc3e0744..ccf8b1d4cb11a 100644 --- a/core/pom.xml +++ 
b/core/pom.xml @@ -21,7 +21,7 @@ org.apache.spark spark-parent_2.12 - 3.3.1 + 3.4.1 ../pom.xml @@ -181,6 +181,10 @@ commons-codec commons-codec + + org.apache.commons + commons-compress + org.apache.commons commons-lang3 @@ -224,7 +228,7 @@ org.apache.logging.log4j - log4j-slf4j-impl + log4j-slf4j2-impl org.apache.logging.log4j @@ -304,10 +308,32 @@ jersey-test-framework-provider-simple test + io.netty netty-all + + io.netty + netty-transport-native-epoll + linux-x86_64 + + + io.netty + netty-transport-native-epoll + linux-aarch_64 + + + io.netty + netty-transport-native-kqueue + osx-aarch_64 + + + io.netty + netty-transport-native-kqueue + osx-x86_64 + + com.clearspring.analytics stream @@ -427,12 +453,12 @@ net.razorvine pickle - 1.2 + 1.3 net.sf.py4j py4j - 0.10.9.5 + 0.10.9.7 org.apache.spark @@ -510,7 +536,12 @@ org.apache.commons commons-crypto - + + com.google.protobuf + protobuf-java + ${protobuf.version} + compile + target/scala-${scala.binary.version}/classes @@ -531,12 +562,42 @@ maven-antrun-plugin + choose-shell-and-script + validate + + run + + + true + + + + + + + + + + + + Shell to use for generating spark-version-info.properties file = + ${shell} + + Script to use for generating spark-version-info.properties file = + ${spark-build-info-script} + + + + + + generate-spark-build-info generate-resources - - + + @@ -594,6 +655,52 @@ + + org.apache.maven.plugins + maven-shade-plugin + + false + true + + + org.spark-project.spark:unused + org.eclipse.jetty:jetty-io + org.eclipse.jetty:jetty-http + org.eclipse.jetty:jetty-proxy + org.eclipse.jetty:jetty-client + org.eclipse.jetty:jetty-continuation + org.eclipse.jetty:jetty-servlet + org.eclipse.jetty:jetty-servlets + org.eclipse.jetty:jetty-plus + org.eclipse.jetty:jetty-security + org.eclipse.jetty:jetty-util + org.eclipse.jetty:jetty-server + com.google.guava:guava + com.google.protobuf:* + + + + + org.eclipse.jetty + ${spark.shade.packageName}.jetty + + org.eclipse.jetty.** + + + + com.google.common + ${spark.shade.packageName}.guava + + + com.google.protobuf + ${spark.shade.packageName}.spark_core.protobuf + + com.google.protobuf.** + + + + + @@ -644,6 +751,69 @@ + + default-protoc + + + !skipDefaultProtoc + + + + + + com.github.os72 + protoc-jar-maven-plugin + ${protoc-jar-maven-plugin.version} + + + generate-sources + + run + + + com.google.protobuf:protoc:${protobuf.version} + ${protobuf.version} + + src/main/protobuf + + + + + + + + + + user-defined-protoc + + ${env.SPARK_PROTOC_EXEC_PATH} + + + + + com.github.os72 + protoc-jar-maven-plugin + ${protoc-jar-maven-plugin.version} + + + generate-sources + + run + + + com.google.protobuf:protoc:${protobuf.version} + ${protobuf.version} + ${spark.protoc.executable.path} + + src/main/protobuf + + + + + + + + diff --git a/core/src/main/java/org/apache/spark/QueryContext.java b/core/src/main/java/org/apache/spark/QueryContext.java new file mode 100644 index 0000000000000..de5b29d02951d --- /dev/null +++ b/core/src/main/java/org/apache/spark/QueryContext.java @@ -0,0 +1,48 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package org.apache.spark; + +import org.apache.spark.annotation.Evolving; + +/** + * Query context of a {@link SparkThrowable}. It helps users understand where error occur + * while executing queries. + * + * @since 3.4.0 + */ +@Evolving +public interface QueryContext { + // The object type of the query which throws the exception. + // If the exception is directly from the main query, it should be an empty string. + // Otherwise, it should be the exact object type in upper case. For example, a "VIEW". + String objectType(); + + // The object name of the query which throws the exception. + // If the exception is directly from the main query, it should be an empty string. + // Otherwise, it should be the object name. For example, a view name "V1". + String objectName(); + + // The starting index in the query text which throws the exception. The index starts from 0. + int startIndex(); + + // The stopping index in the query which throws the exception. The index starts from 0. + int stopIndex(); + + // The corresponding fragment of the query which throws the exception. + String fragment(); +} diff --git a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java index 7cb2455affe48..2602acf59fff4 100644 --- a/core/src/main/java/org/apache/spark/SparkFirehoseListener.java +++ b/core/src/main/java/org/apache/spark/SparkFirehoseListener.java @@ -200,11 +200,13 @@ public void onSpeculativeTaskSubmitted(SparkListenerSpeculativeTaskSubmitted spe onEvent(speculativeTask); } + @Override public void onUnschedulableTaskSetAdded( SparkListenerUnschedulableTaskSetAdded unschedulableTaskSetAdded) { onEvent(unschedulableTaskSetAdded); } + @Override public void onUnschedulableTaskSetRemoved( SparkListenerUnschedulableTaskSetRemoved unschedulableTaskSetRemoved) { onEvent(unschedulableTaskSetRemoved); diff --git a/core/src/main/java/org/apache/spark/SparkThrowable.java b/core/src/main/java/org/apache/spark/SparkThrowable.java index 2be0c3c0f9466..e1235b2982ba0 100644 --- a/core/src/main/java/org/apache/spark/SparkThrowable.java +++ b/core/src/main/java/org/apache/spark/SparkThrowable.java @@ -19,6 +19,9 @@ import org.apache.spark.annotation.Evolving; +import java.util.HashMap; +import java.util.Map; + /** * Interface mixed into Throwables thrown from Spark. 
* @@ -46,4 +49,10 @@ default String getSqlState() { default boolean isInternalError() { return SparkThrowableHelper.isInternalError(this.getErrorClass()); } + + default Map getMessageParameters() { + return new HashMap<>(); + } + + default QueryContext[] getQueryContext() { return new QueryContext[0]; } } diff --git a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java index fc8689354274f..ecbb0ccb4d200 100644 --- a/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java +++ b/core/src/main/java/org/apache/spark/memory/MemoryConsumer.java @@ -19,6 +19,7 @@ import java.io.IOException; +import org.apache.spark.errors.SparkCoreErrors; import org.apache.spark.unsafe.array.LongArray; import org.apache.spark.unsafe.memory.MemoryBlock; @@ -153,9 +154,6 @@ private void throwOom(final MemoryBlock page, final long required) { taskMemoryManager.freePage(page, this); } taskMemoryManager.showMemoryUsage(); - // checkstyle.off: RegexpSinglelineJava - throw new SparkOutOfMemoryError("UNABLE_TO_ACQUIRE_MEMORY", - new String[]{Long.toString(required), Long.toString(got)}); - // checkstyle.on: RegexpSinglelineJava + throw SparkCoreErrors.outOfMemoryError(required, got); } } diff --git a/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java b/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java index 7c992c80f4641..8ec5c2221b6e9 100644 --- a/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java +++ b/core/src/main/java/org/apache/spark/memory/SparkOutOfMemoryError.java @@ -20,6 +20,8 @@ import org.apache.spark.SparkThrowableHelper; import org.apache.spark.annotation.Private; +import java.util.Map; + /** * This exception is thrown when a task can not acquire memory from the Memory manager. 
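
Editor's note: to make the relationship between the new `QueryContext` interface and the extended `SparkThrowable` interface concrete, here is a minimal, hypothetical Java sketch of an exception that mixes in `SparkThrowable` and reports one `QueryContext`, following the shapes shown in this diff. The class name, error class string, and parameter key below are invented for illustration; real Spark errors are defined in `error-classes.json` and built through `SparkThrowableHelper`, and `getMessageParameters` is keyed by parameter name with `String` values as in the README example later in this diff.

    import java.util.Collections;
    import java.util.Map;

    import org.apache.spark.QueryContext;
    import org.apache.spark.SparkThrowable;

    // Hypothetical example only: a RuntimeException that mixes in SparkThrowable and
    // carries a single QueryContext describing the offending query fragment.
    public class ExampleDivideByZeroException extends RuntimeException implements SparkThrowable {
      private final QueryContext context;

      public ExampleDivideByZeroException(String fragment, int start, int stop) {
        super("Division by zero in: " + fragment);
        this.context = new QueryContext() {
          @Override public String objectType() { return ""; }  // empty: error comes from the main query
          @Override public String objectName() { return ""; }
          @Override public int startIndex() { return start; }  // 0-based start offset in the query text
          @Override public int stopIndex() { return stop; }    // 0-based stop offset in the query text
          @Override public String fragment() { return fragment; }
        };
      }

      // Made-up error class name, for illustration only.
      @Override public String getErrorClass() { return "EXAMPLE_DIVIDE_BY_ZERO"; }

      @Override public Map<String, String> getMessageParameters() {
        return Collections.singletonMap("fragment", context.fragment());
      }

      @Override public QueryContext[] getQueryContext() { return new QueryContext[] { context }; }
    }

The defaults added in this diff mean existing `SparkThrowable` implementations that do not override `getMessageParameters` or `getQueryContext` keep compiling and simply report an empty map and an empty context array.
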
* Instead of throwing {@link OutOfMemoryError}, which kills the executor, @@ -28,7 +30,7 @@ @Private public final class SparkOutOfMemoryError extends OutOfMemoryError implements SparkThrowable { String errorClass; - String[] messageParameters; + Map messageParameters; public SparkOutOfMemoryError(String s) { super(s); @@ -38,12 +40,18 @@ public SparkOutOfMemoryError(OutOfMemoryError e) { super(e.getMessage()); } - public SparkOutOfMemoryError(String errorClass, String[] messageParameters) { - super(SparkThrowableHelper.getMessage(errorClass, messageParameters, "")); + public SparkOutOfMemoryError(String errorClass, Map messageParameters) { + super(SparkThrowableHelper.getMessage(errorClass, messageParameters)); this.errorClass = errorClass; this.messageParameters = messageParameters; } + @Override + public Map getMessageParameters() { + return messageParameters; + } + + @Override public String getErrorClass() { return errorClass; } diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java index da7a51854cc1f..d067c870acc9e 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/BypassMergeSortShuffleWriter.java @@ -118,7 +118,7 @@ final class BypassMergeSortShuffleWriter ShuffleExecutorComponents shuffleExecutorComponents) throws SparkException { // Use getSizeAsKb (not bytes) to maintain backwards compatibility if no units are provided this.fileBufferSize = (int) (long) conf.get(package$.MODULE$.SHUFFLE_FILE_BUFFER_SIZE()) * 1024; - this.transferToEnabled = conf.getBoolean("spark.file.transferTo", true); + this.transferToEnabled = (boolean) conf.get(package$.MODULE$.SHUFFLE_MERGE_PREFER_NIO()); this.blockManager = blockManager; final ShuffleDependency dep = handle.dependency(); this.mapId = mapId; diff --git a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java index b1779a135b786..9c54184105951 100644 --- a/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java +++ b/core/src/main/java/org/apache/spark/shuffle/sort/UnsafeShuffleWriter.java @@ -135,7 +135,7 @@ public UnsafeShuffleWriter( this.shuffleExecutorComponents = shuffleExecutorComponents; this.taskContext = taskContext; this.sparkConf = sparkConf; - this.transferToEnabled = sparkConf.getBoolean("spark.file.transferTo", true); + this.transferToEnabled = (boolean) sparkConf.get(package$.MODULE$.SHUFFLE_MERGE_PREFER_NIO()); this.initialSortBufferSize = (int) (long) sparkConf.get(package$.MODULE$.SHUFFLE_SORT_INIT_BUFFER_SIZE()); this.inputBufferSizeInBytes = diff --git a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java index f474c30b8b3d8..c64c6ce889fd7 100644 --- a/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java +++ b/core/src/main/java/org/apache/spark/unsafe/map/BytesToBytesMap.java @@ -387,11 +387,6 @@ public synchronized long spill(long numBytes) throws IOException { return released; } - @Override - public void remove() { - throw new UnsupportedOperationException(); - } - private void handleFailedDelete() { if (spillWriters.size() > 0) { // remove the spill file from disk diff --git a/core/src/main/java/org/apache/spark/util/collection/TimSort.java 
b/core/src/main/java/org/apache/spark/util/collection/TimSort.java index 31428665f9634..863e2e213e703 100644 --- a/core/src/main/java/org/apache/spark/util/collection/TimSort.java +++ b/core/src/main/java/org/apache/spark/util/collection/TimSort.java @@ -1,20 +1,3 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - /* * Based on TimSort.java from the Android Open Source Project * diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java index 54abaf93a7461..e7b128397e13d 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/PrefixComparators.java @@ -104,6 +104,7 @@ public static final class UnsignedPrefixComparator extends RadixSortSupport { @Override public boolean sortDescending() { return false; } @Override public boolean sortSigned() { return false; } @Override public boolean nullsFirst() { return true; } + @Override public int compare(long aPrefix, long bPrefix) { return UnsignedLongs.compare(aPrefix, bPrefix); } @@ -113,6 +114,7 @@ public static final class UnsignedPrefixComparatorNullsLast extends RadixSortSup @Override public boolean sortDescending() { return false; } @Override public boolean sortSigned() { return false; } @Override public boolean nullsFirst() { return false; } + @Override public int compare(long aPrefix, long bPrefix) { return UnsignedLongs.compare(aPrefix, bPrefix); } @@ -122,6 +124,7 @@ public static final class UnsignedPrefixComparatorDescNullsFirst extends RadixSo @Override public boolean sortDescending() { return true; } @Override public boolean sortSigned() { return false; } @Override public boolean nullsFirst() { return true; } + @Override public int compare(long bPrefix, long aPrefix) { return UnsignedLongs.compare(aPrefix, bPrefix); } @@ -131,6 +134,7 @@ public static final class UnsignedPrefixComparatorDesc extends RadixSortSupport @Override public boolean sortDescending() { return true; } @Override public boolean sortSigned() { return false; } @Override public boolean nullsFirst() { return false; } + @Override public int compare(long bPrefix, long aPrefix) { return UnsignedLongs.compare(aPrefix, bPrefix); } @@ -140,6 +144,7 @@ public static final class SignedPrefixComparator extends RadixSortSupport { @Override public boolean sortDescending() { return false; } @Override public boolean sortSigned() { return true; } @Override public boolean nullsFirst() { return true; } + @Override public int compare(long a, long b) { return Long.compare(a, b); } @@ -149,6 +154,7 @@ public static final class SignedPrefixComparatorNullsLast extends RadixSortSuppo @Override public 
boolean sortDescending() { return false; } @Override public boolean sortSigned() { return true; } @Override public boolean nullsFirst() { return false; } + @Override public int compare(long a, long b) { return Long.compare(a, b); } @@ -158,6 +164,7 @@ public static final class SignedPrefixComparatorDescNullsFirst extends RadixSort @Override public boolean sortDescending() { return true; } @Override public boolean sortSigned() { return true; } @Override public boolean nullsFirst() { return true; } + @Override public int compare(long b, long a) { return Long.compare(a, b); } @@ -167,6 +174,7 @@ public static final class SignedPrefixComparatorDesc extends RadixSortSupport { @Override public boolean sortDescending() { return true; } @Override public boolean sortSigned() { return true; } @Override public boolean nullsFirst() { return false; } + @Override public int compare(long b, long a) { return Long.compare(a, b); } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java index ac8170c9d97a0..d442b0ef0ef1b 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeExternalSorter.java @@ -18,6 +18,7 @@ package org.apache.spark.util.collection.unsafe.sort; import javax.annotation.Nullable; +import java.io.Closeable; import java.io.File; import java.io.IOException; import java.util.LinkedList; @@ -26,13 +27,14 @@ import java.util.function.Supplier; import com.google.common.annotations.VisibleForTesting; -import org.apache.spark.memory.SparkOutOfMemoryError; +import org.apache.commons.io.IOUtils; import org.slf4j.Logger; import org.slf4j.LoggerFactory; import org.apache.spark.TaskContext; import org.apache.spark.executor.ShuffleWriteMetrics; import org.apache.spark.memory.MemoryConsumer; +import org.apache.spark.memory.SparkOutOfMemoryError; import org.apache.spark.memory.TaskMemoryManager; import org.apache.spark.memory.TooLargePageException; import org.apache.spark.serializer.SerializerManager; @@ -793,7 +795,7 @@ private void moveOver(UnsafeSorterIterator iter, int steps) /** * Chain multiple UnsafeSorterIterator together as single one. 
*/ - static class ChainedIterator extends UnsafeSorterIterator { + static class ChainedIterator extends UnsafeSorterIterator implements Closeable { private final Queue iterators; private UnsafeSorterIterator current; @@ -846,5 +848,23 @@ public void loadNext() throws IOException { @Override public long getKeyPrefix() { return current.getKeyPrefix(); } + + @Override + public void close() throws IOException { + if (iterators != null && !iterators.isEmpty()) { + for (UnsafeSorterIterator iterator : iterators) { + closeIfPossible(iterator); + } + } + if (current != null) { + closeIfPossible(current); + } + } + + private void closeIfPossible(UnsafeSorterIterator iterator) { + if (iterator instanceof Closeable) { + IOUtils.closeQuietly(((Closeable) iterator)); + } + } } } diff --git a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java index 765ee035855d6..4de5440cc156f 100644 --- a/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java +++ b/core/src/main/java/org/apache/spark/util/collection/unsafe/sort/UnsafeInMemorySorter.java @@ -277,6 +277,7 @@ private SortedIterator(int numRecords, int offset) { this.offset = offset; } + @Override public SortedIterator clone() { SortedIterator iter = new SortedIterator(numRecords, offset); iter.position = position; diff --git a/core/src/main/protobuf/org/apache/spark/status/protobuf/store_types.proto b/core/src/main/protobuf/org/apache/spark/status/protobuf/store_types.proto new file mode 100644 index 0000000000000..94ce1b8b58a34 --- /dev/null +++ b/core/src/main/protobuf/org/apache/spark/status/protobuf/store_types.proto @@ -0,0 +1,818 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +syntax = "proto3"; +package org.apache.spark.status.protobuf; + +/** + * Developer guides: + * - Coding style: https://developers.google.com/protocol-buffers/docs/style + * - Use int64 for job/stage IDs, in case of future extension in Spark core. + * - For string fields: + * - use `optional string` for protobuf definition + * - on serialization, check if the string is null to avoid NPE + * - on deserialization, set string fields as null if it is not set. Also, use `weakIntern` on + * string values in create new objects during deserialization. + * - add tests with null string inputs + */ + +enum JobExecutionStatus { + JOB_EXECUTION_STATUS_UNSPECIFIED = 0; + JOB_EXECUTION_STATUS_RUNNING = 1; + JOB_EXECUTION_STATUS_SUCCEEDED = 2; + JOB_EXECUTION_STATUS_FAILED = 3; + JOB_EXECUTION_STATUS_UNKNOWN = 4; +} + +message JobData { + // All IDs are int64 for extendability, even when they are currently int32 in Spark. 
+ int64 job_id = 1; + optional string name = 2; + optional string description = 3; + optional int64 submission_time = 4; + optional int64 completion_time = 5; + repeated int64 stage_ids = 6; + optional string job_group = 7; + JobExecutionStatus status = 8; + int32 num_tasks = 9; + int32 num_active_tasks = 10; + int32 num_completed_tasks = 11; + int32 num_skipped_tasks = 12; + int32 num_failed_tasks = 13; + int32 num_killed_tasks = 14; + int32 num_completed_indices = 15; + int32 num_active_stages = 16; + int32 num_completed_stages = 17; + int32 num_skipped_stages = 18; + int32 num_failed_stages = 19; + map kill_tasks_summary = 20; +} + +message JobDataWrapper { + JobData info = 1; + repeated int32 skipped_stages = 2; + optional int64 sql_execution_id = 3; +} + +message AccumulableInfo { + int64 id = 1; + optional string name = 2; + optional string update = 3; + optional string value = 4; +} + +message TaskDataWrapper { + int64 task_id = 1; + int32 index = 2; + int32 attempt = 3; + int32 partition_id = 4; + int64 launch_time = 5; + int64 result_fetch_start = 6; + int64 duration = 7; + optional string executor_id = 8; + optional string host = 9; + optional string status = 10; + optional string task_locality = 11; + bool speculative = 12; + repeated AccumulableInfo accumulator_updates = 13; + optional string error_message = 14; + bool has_metrics = 15; + int64 executor_deserialize_time = 16; + int64 executor_deserialize_cpu_time = 17; + int64 executor_run_time = 18; + int64 executor_cpu_time = 19; + int64 result_size = 20; + int64 jvm_gc_time = 21; + int64 result_serialization_time = 22; + int64 memory_bytes_spilled = 23; + int64 disk_bytes_spilled = 24; + int64 peak_execution_memory = 25; + int64 input_bytes_read = 26; + int64 input_records_read = 27; + int64 output_bytes_written = 28; + int64 output_records_written = 29; + int64 shuffle_remote_blocks_fetched = 30; + int64 shuffle_local_blocks_fetched = 31; + int64 shuffle_fetch_wait_time = 32; + int64 shuffle_remote_bytes_read = 33; + int64 shuffle_remote_bytes_read_to_disk = 34; + int64 shuffle_local_bytes_read = 35; + int64 shuffle_records_read = 36; + int64 shuffle_bytes_written = 37; + int64 shuffle_write_time = 38; + int64 shuffle_records_written = 39; + int64 stage_id = 40; + int32 stage_attempt_id = 41; + int64 shuffle_corrupt_merged_block_chunks = 42; + int64 shuffle_merged_fetch_fallback_count = 43; + int64 shuffle_merged_remote_blocks_fetched = 44; + int64 shuffle_merged_local_blocks_fetched = 45; + int64 shuffle_merged_remote_chunks_fetched = 46; + int64 shuffle_merged_local_chunks_fetched = 47; + int64 shuffle_merged_remote_bytes_read = 48; + int64 shuffle_merged_local_bytes_read = 49; + int64 shuffle_remote_reqs_duration = 50; + int64 shuffle_merged_remote_req_duration = 51; +} + +message ExecutorMetrics { + map metrics = 1; +} + +message ExecutorStageSummary { + int64 task_time = 1; + int32 failed_tasks = 2; + int32 succeeded_tasks = 3; + int32 killed_tasks = 4; + int64 input_bytes = 5; + int64 input_records = 6; + int64 output_bytes = 7; + int64 output_records = 8; + int64 shuffle_read = 9; + int64 shuffle_read_records = 10; + int64 shuffle_write = 11; + int64 shuffle_write_records = 12; + int64 memory_bytes_spilled = 13; + int64 disk_bytes_spilled = 14; + bool is_blacklisted_for_stage = 15; + optional ExecutorMetrics peak_memory_metrics = 16; + bool is_excluded_for_stage = 17; +} + +message ExecutorStageSummaryWrapper { + int64 stage_id = 1; + int32 stage_attempt_id = 2; + optional string executor_id = 3; + 
ExecutorStageSummary info = 4; +} + +message ExecutorResourceRequest { + optional string resource_name = 1; + int64 amount = 2; + optional string discoveryScript = 3; + optional string vendor = 4; +} + +message TaskResourceRequest { + optional string resource_name = 1; + double amount = 2; +} + +message ResourceProfileInfo { + int32 id = 1; + map executor_resources = 2; + map task_resources = 3; +} + +message RuntimeInfo { + optional string java_version = 1; + optional string java_home = 2; + optional string scala_version = 3; +} + +message PairStrings { + optional string value1 = 1; + optional string value2 = 2; +} + +message ApplicationEnvironmentInfo { + RuntimeInfo runtime = 1; + repeated PairStrings spark_properties = 2; + repeated PairStrings hadoop_properties = 3; + repeated PairStrings system_properties = 4; + repeated PairStrings metrics_properties = 5; + repeated PairStrings classpath_entries = 6; + repeated ResourceProfileInfo resource_profiles = 7; +} + +message ApplicationEnvironmentInfoWrapper { + ApplicationEnvironmentInfo info = 1; +} + +message ApplicationAttemptInfo { + optional string attempt_id = 1; + int64 start_time = 2; + int64 end_time = 3; + int64 last_updated = 4; + int64 duration = 5; + optional string spark_user = 6; + bool completed = 7; + optional string app_spark_version = 8; +} + +message ApplicationInfo { + optional string id = 1; + optional string name = 2; + optional int32 cores_granted = 3; + optional int32 max_cores = 4; + optional int32 cores_per_executor = 5; + optional int32 memory_per_executor_mb = 6; + repeated ApplicationAttemptInfo attempts = 7; +} + +message ApplicationInfoWrapper { + ApplicationInfo info = 1; +} + +message StreamBlockData { + optional string name = 1; + optional string executor_id = 2; + optional string host_port = 3; + optional string storage_level = 4; + bool use_memory = 5; + bool use_disk = 6; + bool deserialized = 7; + int64 mem_size = 8; + int64 disk_size = 9; +} + +message RDDDataDistribution { + optional string address = 1; + int64 memory_used = 2; + int64 memory_remaining = 3; + int64 disk_used = 4; + optional int64 on_heap_memory_used = 5; + optional int64 off_heap_memory_used = 6; + optional int64 on_heap_memory_remaining = 7; + optional int64 off_heap_memory_remaining = 8; +} + +message RDDPartitionInfo { + optional string block_name = 1; + optional string storage_level = 2; + int64 memory_used = 3; + int64 disk_used = 4; + repeated string executors = 5; +} + +message RDDStorageInfo { + int32 id = 1; + optional string name = 2; + int32 num_partitions = 3; + int32 num_cached_partitions = 4; + optional string storage_level = 5; + int64 memory_used = 6; + int64 disk_used = 7; + repeated RDDDataDistribution data_distribution = 8; + repeated RDDPartitionInfo partitions = 9; +} + +message RDDStorageInfoWrapper { + RDDStorageInfo info = 1; +} + +message ResourceProfileWrapper { + ResourceProfileInfo rpInfo = 1; +} + +message CachedQuantile { + int64 stage_id = 1; + int32 stage_attempt_id = 2; + optional string quantile = 3; + int64 task_count = 4; + double duration = 5; + double executor_deserialize_time = 6; + double executor_deserialize_cpu_time = 7; + double executor_run_time = 8; + double executor_cpu_time = 9; + double result_size = 10; + double jvm_gc_time = 11; + double result_serialization_time = 12; + double getting_result_time = 13; + double scheduler_delay = 14; + double peak_execution_memory = 15; + double memory_bytes_spilled = 16; + double disk_bytes_spilled = 17; + double bytes_read = 18; + double 
records_read = 19; + double bytes_written = 20; + double records_written = 21; + double shuffle_read_bytes = 22; + double shuffle_records_read = 23; + double shuffle_remote_blocks_fetched = 24; + double shuffle_local_blocks_fetched = 25; + double shuffle_fetch_wait_time = 26; + double shuffle_remote_bytes_read = 27; + double shuffle_remote_bytes_read_to_disk = 28; + double shuffle_total_blocks_fetched = 29; + double shuffle_write_bytes = 30; + double shuffle_write_records = 31; + double shuffle_write_time = 32; + double shuffle_corrupt_merged_block_chunks = 33; + double shuffle_merged_fetch_fallback_count = 34; + double shuffle_merged_remote_blocks_fetched = 35; + double shuffle_merged_local_blocks_fetched = 36; + double shuffle_merged_remote_chunks_fetched = 37; + double shuffle_merged_local_chunks_fetched = 38; + double shuffle_merged_remote_bytes_read = 39; + double shuffle_merged_local_bytes_read = 40; + double shuffle_remote_reqs_duration = 41; + double shuffle_merged_remote_reqs_duration = 42; +} + +message SpeculationStageSummary { + int32 num_tasks = 1; + int32 num_active_tasks = 2; + int32 num_completed_tasks = 3; + int32 num_failed_tasks = 4; + int32 num_killed_tasks = 5; +} + +message SpeculationStageSummaryWrapper { + int64 stage_id = 1; + int32 stage_attempt_id = 2; + SpeculationStageSummary info = 3; +} + +message ProcessSummary { + optional string id = 1; + optional string host_port = 2; + bool is_active = 3; + int32 total_cores = 4; + int64 add_time = 5; + optional int64 remove_time = 6; + map process_logs = 7; +} + +message ProcessSummaryWrapper { + ProcessSummary info = 1; +} + +message MemoryMetrics { + int64 used_on_heap_storage_memory = 1; + int64 used_off_heap_storage_memory = 2; + int64 total_on_heap_storage_memory = 3; + int64 total_off_heap_storage_memory = 4; +} + +message ResourceInformation { + optional string name = 1; + repeated string addresses = 2; +} + +message ExecutorSummary { + optional string id = 1; + optional string host_port = 2; + bool is_active = 3; + int32 rdd_blocks = 4; + int64 memory_used = 5; + int64 disk_used = 6; + int32 total_cores = 7; + int32 max_tasks = 8; + int32 active_tasks = 9; + int32 failed_tasks = 10; + int32 completed_tasks = 11; + int32 total_tasks = 12; + int64 total_duration = 13; + int64 total_gc_time = 14; + int64 total_input_bytes = 15; + int64 total_shuffle_read = 16; + int64 total_shuffle_write = 17; + bool is_blacklisted = 18; + int64 max_memory = 19; + int64 add_time = 20; + optional int64 remove_time = 21; + optional string remove_reason = 22; + map executor_logs = 23; + optional MemoryMetrics memory_metrics = 24; + repeated int64 blacklisted_in_stages = 25; + optional ExecutorMetrics peak_memory_metrics = 26; + map attributes = 27; + map resources = 28; + int32 resource_profile_id = 29; + bool is_excluded = 30; + repeated int64 excluded_in_stages = 31; +} + +message ExecutorSummaryWrapper { + ExecutorSummary info = 1; +} + +message SQLPlanMetric { + optional string name = 1; + int64 accumulator_id = 2; + optional string metric_type = 3; +} + +message SQLExecutionUIData { + int64 execution_id = 1; + int64 root_execution_id = 2; + optional string description = 3; + optional string details = 4; + optional string physical_plan_description = 5; + map modified_configs = 6; + repeated SQLPlanMetric metrics = 7; + int64 submission_time = 8; + optional int64 completion_time = 9; + optional string error_message = 10; + map jobs = 11; + repeated int64 stages = 12; + bool metric_values_is_null = 13; + map metric_values = 14; +} + 
+message SparkPlanGraphNode { + int64 id = 1; + optional string name = 2; + optional string desc = 3; + repeated SQLPlanMetric metrics = 4; +} + +message SparkPlanGraphClusterWrapper { + int64 id = 1; + optional string name = 2; + optional string desc = 3; + repeated SparkPlanGraphNodeWrapper nodes = 4; + repeated SQLPlanMetric metrics = 5; +} + +message SparkPlanGraphNodeWrapper { + oneof wrapper { + SparkPlanGraphNode node = 1; + SparkPlanGraphClusterWrapper cluster = 2; + } +} + +message SparkPlanGraphEdge { + int64 from_id = 1; + int64 to_id = 2; +} + +message SparkPlanGraphWrapper { + int64 execution_id = 1; + repeated SparkPlanGraphNodeWrapper nodes = 2; + repeated SparkPlanGraphEdge edges = 3; +} + +message RDDOperationEdge { + int32 from_id = 1; + int32 to_id = 2; +} + +enum DeterministicLevel { + DETERMINISTIC_LEVEL_UNSPECIFIED = 0; + DETERMINISTIC_LEVEL_DETERMINATE = 1; + DETERMINISTIC_LEVEL_UNORDERED = 2; + DETERMINISTIC_LEVEL_INDETERMINATE = 3; +} + +message RDDOperationNode { + int32 id = 1; + optional string name = 2; + bool cached = 3; + bool barrier = 4; + optional string callsite = 5; + DeterministicLevel output_deterministic_level = 6; +} + +message RDDOperationClusterWrapper { + optional string id = 1; + optional string name = 2; + repeated RDDOperationNode child_nodes = 3; + repeated RDDOperationClusterWrapper child_clusters = 4; +} + +message RDDOperationGraphWrapper { + int64 stage_id = 1; + repeated RDDOperationEdge edges = 2; + repeated RDDOperationEdge outgoing_edges = 3; + repeated RDDOperationEdge incoming_edges = 4; + RDDOperationClusterWrapper root_cluster = 5; +} + +message StreamingQueryData { + optional string name = 1; + optional string id = 2; + optional string run_id = 3; + bool is_active = 4; + optional string exception = 5; + int64 start_timestamp = 6; + optional int64 end_timestamp = 7; +} + +message StageDataWrapper { + StageData info = 1; + repeated int64 job_ids = 2; + map locality = 3; +} + +message TaskData { + int64 task_id = 1; + int32 index = 2; + int32 attempt = 3; + int32 partition_id = 4; + int64 launch_time = 5; + optional int64 result_fetch_start = 6; + optional int64 duration = 7; + optional string executor_id = 8; + optional string host = 9; + optional string status = 10; + optional string task_locality = 11; + bool speculative = 12; + repeated AccumulableInfo accumulator_updates = 13; + optional string error_message = 14; + optional TaskMetrics task_metrics = 15; + map executor_logs = 16; + int64 scheduler_delay = 17; + int64 getting_result_time = 18; +} + +enum StageStatus { + STAGE_STATUS_UNSPECIFIED = 0; + STAGE_STATUS_ACTIVE = 1; + STAGE_STATUS_COMPLETE = 2; + STAGE_STATUS_FAILED = 3; + STAGE_STATUS_PENDING = 4; + STAGE_STATUS_SKIPPED = 5; +} + +message StageData { + StageStatus status = 1; + int64 stage_id = 2; + int32 attempt_id = 3; + int32 num_tasks = 4; + int32 num_active_tasks = 5; + int32 num_complete_tasks = 6; + int32 num_failed_tasks = 7; + int32 num_killed_tasks = 8; + int32 num_completed_indices = 9; + + optional int64 submission_time = 10; + optional int64 first_task_launched_time = 11; + optional int64 completion_time = 12; + optional string failure_reason = 13; + + int64 executor_deserialize_time = 14; + int64 executor_deserialize_cpu_time = 15; + int64 executor_run_time = 16; + int64 executor_cpu_time = 17; + int64 result_size = 18; + int64 jvm_gc_time = 19; + int64 result_serialization_time = 20; + int64 memory_bytes_spilled = 21; + int64 disk_bytes_spilled = 22; + int64 peak_execution_memory = 23; + int64 
input_bytes = 24; + int64 input_records = 25; + int64 output_bytes = 26; + int64 output_records = 27; + int64 shuffle_remote_blocks_fetched = 28; + int64 shuffle_local_blocks_fetched = 29; + int64 shuffle_fetch_wait_time = 30; + int64 shuffle_remote_bytes_read = 31; + int64 shuffle_remote_bytes_read_to_disk = 32; + int64 shuffle_local_bytes_read = 33; + int64 shuffle_read_bytes = 34; + int64 shuffle_read_records = 35; + int64 shuffle_write_bytes = 36; + int64 shuffle_write_time = 37; + int64 shuffle_write_records = 38; + + optional string name = 39; + optional string description = 40; + optional string details = 41; + optional string scheduling_pool = 42; + + repeated int64 rdd_ids = 43; + repeated AccumulableInfo accumulator_updates = 44; + map tasks = 45; + map executor_summary = 46; + optional SpeculationStageSummary speculation_summary = 47; + map killed_tasks_summary = 48; + int32 resource_profile_id = 49; + optional ExecutorMetrics peak_executor_metrics = 50; + optional TaskMetricDistributions task_metrics_distributions = 51; + optional ExecutorMetricsDistributions executor_metrics_distributions = 52; + + int64 shuffle_corrupt_merged_block_chunks = 53; + int64 shuffle_merged_fetch_fallback_count = 54; + int64 shuffle_merged_remote_blocks_fetched = 55; + int64 shuffle_merged_local_blocks_fetched = 56; + int64 shuffle_merged_remote_chunks_fetched = 57; + int64 shuffle_merged_local_chunks_fetched = 58; + int64 shuffle_merged_remote_bytes_read = 59; + int64 shuffle_merged_local_bytes_read = 60; + int64 shuffle_remote_reqs_duration = 61; + int64 shuffle_merged_remote_reqs_duration = 62; + bool is_shuffle_push_enabled = 63; + int32 shuffle_mergers_count = 64; +} + +message TaskMetrics { + int64 executor_deserialize_time = 1; + int64 executor_deserialize_cpu_time = 2; + int64 executor_run_time = 3; + int64 executor_cpu_time = 4; + int64 result_size = 5; + int64 jvm_gc_time = 6; + int64 result_serialization_time = 7; + int64 memory_bytes_spilled = 8; + int64 disk_bytes_spilled = 9; + int64 peak_execution_memory = 10; + InputMetrics input_metrics = 11; + OutputMetrics output_metrics = 12; + ShuffleReadMetrics shuffle_read_metrics = 13; + ShuffleWriteMetrics shuffle_write_metrics = 14; +} + +message InputMetrics { + int64 bytes_read = 1; + int64 records_read = 2; +} + +message OutputMetrics { + int64 bytes_written = 1; + int64 records_written = 2; +} + +message ShuffleReadMetrics { + int64 remote_blocks_fetched = 1; + int64 local_blocks_fetched = 2; + int64 fetch_wait_time = 3; + int64 remote_bytes_read = 4; + int64 remote_bytes_read_to_disk = 5; + int64 local_bytes_read = 6; + int64 records_read = 7; + int64 remote_reqs_duration = 8; + ShufflePushReadMetrics shuffle_push_read_metrics = 9; +} + +message ShufflePushReadMetrics { + int64 corrupt_merged_block_chunks = 1; + int64 merged_fetch_fallback_count = 2; + int64 remote_merged_blocks_fetched = 3; + int64 local_merged_blocks_fetched = 4; + int64 remote_merged_chunks_fetched = 5; + int64 local_merged_chunks_fetched = 6; + int64 remote_merged_bytes_read = 7; + int64 local_merged_bytes_read = 8; + int64 remote_merged_reqs_duration = 9; +} + +message ShuffleWriteMetrics { + int64 bytes_written = 1; + int64 write_time = 2; + int64 records_written = 3; +} + +message TaskMetricDistributions { + repeated double quantiles = 1; + repeated double duration = 2; + repeated double executor_deserialize_time = 3; + repeated double executor_deserialize_cpu_time = 4; + repeated double executor_run_time = 5; + repeated double executor_cpu_time = 6; + repeated 
double result_size = 7; + repeated double jvm_gc_time = 8; + repeated double result_serialization_time = 9; + repeated double getting_result_time = 10; + repeated double scheduler_delay = 11; + repeated double peak_execution_memory = 12; + repeated double memory_bytes_spilled = 13; + repeated double disk_bytes_spilled = 14; + InputMetricDistributions input_metrics = 15; + OutputMetricDistributions output_metrics = 16; + ShuffleReadMetricDistributions shuffle_read_metrics = 17; + ShuffleWriteMetricDistributions shuffle_write_metrics = 18; +} + +message InputMetricDistributions { + repeated double bytes_read = 1; + repeated double records_read = 2; +} + +message OutputMetricDistributions { + repeated double bytes_written = 1; + repeated double records_written = 2; +} + +message ShuffleReadMetricDistributions { + repeated double read_bytes = 1; + repeated double read_records = 2; + repeated double remote_blocks_fetched = 3; + repeated double local_blocks_fetched = 4; + repeated double fetch_wait_time = 5; + repeated double remote_bytes_read = 6; + repeated double remote_bytes_read_to_disk = 7; + repeated double total_blocks_fetched = 8; + repeated double remote_reqs_duration = 9; + ShufflePushReadMetricDistributions shuffle_push_read_metrics_dist = 10; +} + +message ShufflePushReadMetricDistributions { + repeated double corrupt_merged_block_chunks = 1; + repeated double merged_fetch_fallback_count = 2; + repeated double remote_merged_blocks_fetched = 3; + repeated double local_merged_blocks_fetched = 4; + repeated double remote_merged_chunks_fetched = 5; + repeated double local_merged_chunks_fetched = 6; + repeated double remote_merged_bytes_read = 7; + repeated double local_merged_bytes_read = 8; + repeated double remote_merged_reqs_duration = 9; +} + +message ShuffleWriteMetricDistributions { + repeated double write_bytes = 1; + repeated double write_records = 2; + repeated double write_time = 3; +} + +message ExecutorMetricsDistributions { + repeated double quantiles = 1; + + repeated double task_time = 2; + repeated double failed_tasks = 3; + repeated double succeeded_tasks = 4; + repeated double killed_tasks = 5; + repeated double input_bytes = 6; + repeated double input_records = 7; + repeated double output_bytes = 8; + repeated double output_records = 9; + repeated double shuffle_read = 10; + repeated double shuffle_read_records = 11; + repeated double shuffle_write = 12; + repeated double shuffle_write_records = 13; + repeated double memory_bytes_spilled = 14; + repeated double disk_bytes_spilled = 15; + ExecutorPeakMetricsDistributions peak_memory_metrics = 16; +} + +message ExecutorPeakMetricsDistributions { + repeated double quantiles = 1; + repeated ExecutorMetrics executor_metrics = 2; +} + +message AppSummary { + int32 num_completed_jobs = 1; + int32 num_completed_stages = 2; +} + +message PoolData { + optional string name = 1; + repeated int64 stage_ids = 2; +} + +message StateOperatorProgress { + optional string operator_name = 1; + int64 num_rows_total = 2; + int64 num_rows_updated = 3; + int64 all_updates_time_ms = 4; + int64 num_rows_removed = 5; + int64 all_removals_time_ms = 6; + int64 commit_time_ms = 7; + int64 memory_used_bytes = 8; + int64 num_rows_dropped_by_watermark = 9; + int64 num_shuffle_partitions = 10; + int64 num_state_store_instances = 11; + map custom_metrics = 12; +} + +message SourceProgress { + optional string description = 1; + optional string start_offset = 2; + optional string end_offset = 3; + optional string latest_offset = 4; + int64 num_input_rows 
= 5; + double input_rows_per_second = 6; + double processed_rows_per_second = 7; + map metrics = 8; +} + +message SinkProgress { + optional string description = 1; + int64 num_output_rows = 2; + map metrics = 3; +} + +message StreamingQueryProgress { + optional string id = 1; + optional string run_id = 2; + optional string name = 3; + optional string timestamp = 4; + int64 batch_id = 5; + int64 batch_duration = 6; + map duration_ms = 7; + map event_time = 8; + repeated StateOperatorProgress state_operators = 9; + repeated SourceProgress sources = 10; + SinkProgress sink = 11; + map observed_metrics = 12; +} + +message StreamingQueryProgressWrapper { + StreamingQueryProgress progress = 1; +} diff --git a/core/src/main/resources/META-INF/services/org.apache.spark.status.protobuf.ProtobufSerDe b/core/src/main/resources/META-INF/services/org.apache.spark.status.protobuf.ProtobufSerDe new file mode 100644 index 0000000000000..0319e45f6a32d --- /dev/null +++ b/core/src/main/resources/META-INF/services/org.apache.spark.status.protobuf.ProtobufSerDe @@ -0,0 +1,33 @@ +# +# Licensed to the Apache Software Foundation (ASF) under one or more +# contributor license agreements. See the NOTICE file distributed with +# this work for additional information regarding copyright ownership. +# The ASF licenses this file to You under the Apache License, Version 2.0 +# (the "License"); you may not use this file except in compliance with +# the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +org.apache.spark.status.protobuf.RDDStorageInfoWrapperSerializer +org.apache.spark.status.protobuf.ApplicationInfoWrapperSerializer +org.apache.spark.status.protobuf.ApplicationEnvironmentInfoWrapperSerializer +org.apache.spark.status.protobuf.CachedQuantileSerializer +org.apache.spark.status.protobuf.ExecutorStageSummaryWrapperSerializer +org.apache.spark.status.protobuf.StreamBlockDataSerializer +org.apache.spark.status.protobuf.TaskDataWrapperSerializer +org.apache.spark.status.protobuf.JobDataWrapperSerializer +org.apache.spark.status.protobuf.ResourceProfileWrapperSerializer +org.apache.spark.status.protobuf.SpeculationStageSummaryWrapperSerializer +org.apache.spark.status.protobuf.ExecutorSummaryWrapperSerializer +org.apache.spark.status.protobuf.ProcessSummaryWrapperSerializer +org.apache.spark.status.protobuf.RDDOperationGraphWrapperSerializer +org.apache.spark.status.protobuf.StageDataWrapperSerializer +org.apache.spark.status.protobuf.AppSummarySerializer +org.apache.spark.status.protobuf.PoolDataSerializer diff --git a/core/src/main/resources/error/README.md b/core/src/main/resources/error/README.md index f58eb6d9296ef..623c650ec579b 100644 --- a/core/src/main/resources/error/README.md +++ b/core/src/main/resources/error/README.md @@ -8,13 +8,13 @@ and message parameters rather than an arbitrary error message. 1. Check if the error is an internal error. Internal errors are bugs in the code that we do not expect users to encounter; this does not include unsupported operations. If true, use the error class `INTERNAL_ERROR` and skip to step 4. -2. Check if an appropriate error class already exists in `error-class.json`. +2. 
Check if an appropriate error class already exists in `error-classes.json`. If true, use the error class and skip to step 4. -3. Add a new class to `error-class.json`; keep in mind the invariants below. +3. Add a new class to `error-classes.json`; keep in mind the invariants below. 4. Check if the exception type already extends `SparkThrowable`. If true, skip to step 6. 5. Mix `SparkThrowable` into the exception. -6. Throw the exception with the error class and message parameters. +6. Throw the exception with the error class and message parameters. If the same exception is thrown in several places, create an util function in a central place such as `QueryCompilationErrors.scala` to instantiate the exception. ### Before @@ -24,10 +24,10 @@ Throw with arbitrary error message: ### After -`error-class.json` +`error-classes.json` "PROBLEM_BECAUSE": { - "message": ["Problem %s because %s"], + "message": ["Problem because "], "sqlState": "XXXXX" } @@ -35,16 +35,18 @@ Throw with arbitrary error message: class SparkTestException( errorClass: String, - messageParameters: Seq[String]) + messageParameters: Map[String, String]) extends TestException(SparkThrowableHelper.getMessage(errorClass, messageParameters)) with SparkThrowable { - def getErrorClass: String = errorClass + override def getMessageParameters: java.util.Map[String, String] = messageParameters.asJava + + override def getErrorClass: String = errorClass } Throw with error class and message parameters: - throw new SparkTestException("PROBLEM_BECAUSE", Seq("A", "B")) + throw new SparkTestException("PROBLEM_BECAUSE", Map("problem" -> "A", "cause" -> "B")) ## Access fields @@ -66,6 +68,8 @@ To access error fields, catch exceptions that extend `org.apache.spark.SparkThro Error classes are a succinct, human-readable representation of the error category. +An uncategorized errors can be assigned to a legacy error class with the prefix `_LEGACY_ERROR_TEMP_` and an unused sequential number, for instance `_LEGACY_ERROR_TEMP_0053`. + #### Invariants - Unique @@ -87,9 +91,10 @@ The quality of the error message should match the ### SQLSTATE SQLSTATE is an optional portable error identifier across SQL engines. -For consistency, Spark only sets SQLSTATE as defined in the ANSI/ISO standard. SQLSTATE comprises a 2-character class value followed by a 3-character subclass value. -Spark only uses the standard-defined classes and subclasses, and does not use implementation-defined classes or subclasses. +Spark prefers to re-use existing SQLSTATEs, preferably used by multiple vendors. +For extension Spark claims the 'K**' subclass range. +If a new class is needed it will also claim the 'K0' class. #### Invariants @@ -97,162 +102,1202 @@ Spark only uses the standard-defined classes and subclasses, and does not use im #### ANSI/ISO standard -The following SQLSTATEs are from ISO/IEC CD 9075-2. +The following SQLSTATEs are collated from: +- SQL2016 +- DB2 zOS +- PostgreSQL 15 +- Oracle 12 (last published) +- SQL Server +- Redshift. 
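
Editor's note: as a companion to the "Access fields" guidance earlier in this README diff ("catch exceptions that extend `org.apache.spark.SparkThrowable`"), here is a minimal sketch in Java of reading the structured error fields. The `local[1]` master, the ANSI-mode setting, and the failing query are assumptions made so the example has something to catch; the exact exception type, and whether it arrives wrapped, can vary by query and Spark version.

    import java.util.Map;

    import org.apache.spark.SparkThrowable;
    import org.apache.spark.sql.SparkSession;

    public class ErrorFieldsExample {
      public static void main(String[] args) {
        SparkSession spark = SparkSession.builder()
            .master("local[1]")            // assumption: a local session just for illustration
            .appName("error-fields")
            .getOrCreate();
        try {
          // Assumption: with ANSI mode on, integer division by zero raises a Spark error.
          spark.conf().set("spark.sql.ansi.enabled", "true");
          spark.sql("SELECT 1 / 0").collectAsList();
        } catch (Exception e) {
          if (e instanceof SparkThrowable) {
            SparkThrowable st = (SparkThrowable) e;
            System.out.println("error class : " + st.getErrorClass());
            System.out.println("sql state   : " + st.getSqlState());
            Map<String, String> params = st.getMessageParameters();
            params.forEach((k, v) -> System.out.println("param " + k + " = " + v));
            // getQueryContext(), added in this diff, points back at the offending fragment.
          }
        } finally {
          spark.stop();
        }
      }
    }
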
-|SQLSTATE|Class|Condition |Subclass|Subcondition | -|--------|-----|------------------------------------------------------------|--------|---------------------------------------------------------------| -|07000 |07 |dynamic SQL error |000 |(no subclass) | -|07001 |07 |dynamic SQL error |001 |using clause does not match dynamic parameter specifications | -|07002 |07 |dynamic SQL error |002 |using clause does not match target specifications | -|07003 |07 |dynamic SQL error |003 |cursor specification cannot be executed | -|07004 |07 |dynamic SQL error |004 |using clause required for dynamic parameters | -|07005 |07 |dynamic SQL error |005 |prepared statement not a cursor specification | -|07006 |07 |dynamic SQL error |006 |restricted data type attribute violation | -|07007 |07 |dynamic SQL error |007 |using clause required for result fields | -|07008 |07 |dynamic SQL error |008 |invalid descriptor count | -|07009 |07 |dynamic SQL error |009 |invalid descriptor index | -|0700B |07 |dynamic SQL error |00B |data type transform function violation | -|0700C |07 |dynamic SQL error |00C |undefined DATA value | -|0700D |07 |dynamic SQL error |00D |invalid DATA target | -|0700E |07 |dynamic SQL error |00E |invalid LEVEL value | -|0700F |07 |dynamic SQL error |00F |invalid DATETIME_INTERVAL_CODE | -|08000 |08 |connection exception |000 |(no subclass) | -|08001 |08 |connection exception |001 |SQL-client unable to establish SQL-connection | -|08002 |08 |connection exception |002 |connection name in use | -|08003 |08 |connection exception |003 |connection does not exist | -|08004 |08 |connection exception |004 |SQL-server rejected establishment of SQL-connection | -|08006 |08 |connection exception |006 |connection failure | -|08007 |08 |connection exception |007 |transaction resolution unknown | -|09000 |09 |triggered action exception |000 |(no subclass) | -|0A000 |0A |feature not supported |000 |(no subclass) | -|0A001 |0A |feature not supported |001 |multiple server transactions | -|0D000 |0D |invalid target type specification |000 |(no subclass) | -|0E000 |0E |invalid schema name list specification |000 |(no subclass) | -|0F000 |0F |locator exception |000 |(no subclass) | -|0F001 |0F |locator exception |001 |invalid specification | -|0L000 |0L |invalid grantor |000 |(no subclass) | -|0M000 |0M |invalid SQL-invoked procedure reference |000 |(no subclass) | -|0P000 |0P |invalid role specification |000 |(no subclass) | -|0S000 |0S |invalid transform group name specification |000 |(no subclass) | -|0T000 |0T |target table disagrees with cursor specification |000 |(no subclass) | -|0U000 |0U |attempt to assign to non-updatable column |000 |(no subclass) | -|0V000 |0V |attempt to assign to ordering column |000 |(no subclass) | -|0W000 |0W |prohibited statement encountered during trigger execution |000 |(no subclass) | -|0W001 |0W |prohibited statement encountered during trigger execution |001 |modify table modified by data change delta table | -|0Z000 |0Z |diagnostics exception |000 |(no subclass) | -|0Z001 |0Z |diagnostics exception |001 |maximum number of stacked diagnostics areas exceeded | -|21000 |21 |cardinality violation |000 |(no subclass) | -|22000 |22 |data exception |000 |(no subclass) | -|22001 |22 |data exception |001 |string data, right truncation | -|22002 |22 |data exception |002 |null value, no indicator parameter | -|22003 |22 |data exception |003 |numeric value out of range | -|22004 |22 |data exception |004 |null value not allowed | -|22005 |22 |data exception |005 |error in assignment | 
-|22006 |22 |data exception |006 |invalid interval format | -|22007 |22 |data exception |007 |invalid datetime format | -|22008 |22 |data exception |008 |datetime field overflow | -|22009 |22 |data exception |009 |invalid time zone displacement value | -|2200B |22 |data exception |00B |escape character conflict | -|2200C |22 |data exception |00C |invalid use of escape character | -|2200D |22 |data exception |00D |invalid escape octet | -|2200E |22 |data exception |00E |null value in array target | -|2200F |22 |data exception |00F |zero-length character string | -|2200G |22 |data exception |00G |most specific type mismatch | -|2200H |22 |data exception |00H |sequence generator limit exceeded | -|2200P |22 |data exception |00P |interval value out of range | -|2200Q |22 |data exception |00Q |multiset value overflow | -|22010 |22 |data exception |010 |invalid indicator parameter value | -|22011 |22 |data exception |011 |substring error | -|22012 |22 |data exception |012 |division by zero | -|22013 |22 |data exception |013 |invalid preceding or following size in window function | -|22014 |22 |data exception |014 |invalid argument for NTILE function | -|22015 |22 |data exception |015 |interval field overflow | -|22016 |22 |data exception |016 |invalid argument for NTH_VALUE function | -|22018 |22 |data exception |018 |invalid character value for cast | -|22019 |22 |data exception |019 |invalid escape character | -|2201B |22 |data exception |01B |invalid regular expression | -|2201C |22 |data exception |01C |null row not permitted in table | -|2201E |22 |data exception |01E |invalid argument for natural logarithm | -|2201F |22 |data exception |01F |invalid argument for power function | -|2201G |22 |data exception |01G |invalid argument for width bucket function | -|2201H |22 |data exception |01H |invalid row version | -|2201S |22 |data exception |01S |invalid XQuery regular expression | -|2201T |22 |data exception |01T |invalid XQuery option flag | -|2201U |22 |data exception |01U |attempt to replace a zero-length string | -|2201V |22 |data exception |01V |invalid XQuery replacement string | -|2201W |22 |data exception |01W |invalid row count in fetch first clause | -|2201X |22 |data exception |01X |invalid row count in result offset clause | -|22020 |22 |data exception |020 |invalid period value | -|22021 |22 |data exception |021 |character not in repertoire | -|22022 |22 |data exception |022 |indicator overflow | -|22023 |22 |data exception |023 |invalid parameter value | -|22024 |22 |data exception |024 |unterminated C string | -|22025 |22 |data exception |025 |invalid escape sequence | -|22026 |22 |data exception |026 |string data, length mismatch | -|22027 |22 |data exception |027 |trim error | -|22029 |22 |data exception |029 |noncharacter in UCS string | -|2202D |22 |data exception |02D |null value substituted for mutator subject parameter | -|2202E |22 |data exception |02E |array element error | -|2202F |22 |data exception |02F |array data, right truncation | -|2202G |22 |data exception |02G |invalid repeat argument in a sample clause | -|2202H |22 |data exception |02H |invalid sample size | -|2202J |22 |data exception |02J |invalid argument for row pattern navigation operation | -|2202K |22 |data exception |02K |skip to non-existent row | -|2202L |22 |data exception |02L |skip to first row of match | -|23000 |23 |integrity constraint violation |000 |(no subclass) | -|23001 |23 |integrity constraint violation |001 |restrict violation | -|24000 |24 |invalid cursor state |000 |(no subclass) 
| -|25000 |25 |invalid transaction state |000 |(no subclass) | -|25001 |25 |invalid transaction state |001 |active SQL-transaction | -|25002 |25 |invalid transaction state |002 |branch transaction already active | -|25003 |25 |invalid transaction state |003 |inappropriate access mode for branch transaction | -|25004 |25 |invalid transaction state |004 |inappropriate isolation level for branch transaction | -|25005 |25 |invalid transaction state |005 |no active SQL-transaction for branch transaction | -|25006 |25 |invalid transaction state |006 |read-only SQL-transaction | -|25007 |25 |invalid transaction state |007 |schema and data statement mixing not supported | -|25008 |25 |invalid transaction state |008 |held cursor requires same isolation level | -|26000 |26 |invalid SQL statement name |000 |(no subclass) | -|27000 |27 |triggered data change violation |000 |(no subclass) | -|27001 |27 |triggered data change violation |001 |modify table modified by data change delta table | -|28000 |28 |invalid authorization specification |000 |(no subclass) | -|2B000 |2B |dependent privilege descriptors still exist |000 |(no subclass) | -|2C000 |2C |invalid character set name |000 |(no subclass) | -|2C001 |2C |invalid character set name |001 |cannot drop SQL-session default character set | -|2D000 |2D |invalid transaction termination |000 |(no subclass) | -|2E000 |2E |invalid connection name |000 |(no subclass) | -|2F000 |2F |SQL routine exception |000 |(no subclass) | -|2F002 |2F |SQL routine exception |002 |modifying SQL-data not permitted | -|2F003 |2F |SQL routine exception |003 |prohibited SQL-statement attempted | -|2F004 |2F |SQL routine exception |004 |reading SQL-data not permitted | -|2F005 |2F |SQL routine exception |005 |function executed no return statement | -|2H000 |2H |invalid collation name |000 |(no subclass) | -|30000 |30 |invalid SQL statement identifier |000 |(no subclass) | -|33000 |33 |invalid SQL descriptor name |000 |(no subclass) | -|34000 |34 |invalid cursor name |000 |(no subclass) | -|35000 |35 |invalid condition number |000 |(no subclass) | -|36000 |36 |cursor sensitivity exception |000 |(no subclass) | -|36001 |36 |cursor sensitivity exception |001 |request rejected | -|36002 |36 |cursor sensitivity exception |002 |request failed | -|38000 |38 |external routine exception |000 |(no subclass) | -|38001 |38 |external routine exception |001 |containing SQL not permitted | -|38002 |38 |external routine exception |002 |modifying SQL-data not permitted | -|38003 |38 |external routine exception |003 |prohibited SQL-statement attempted | -|38004 |38 |external routine exception |004 |reading SQL-data not permitted | -|39000 |39 |external routine invocation exception |000 |(no subclass) | -|39004 |39 |external routine invocation exception |004 |null value not allowed | -|3B000 |3B |savepoint exception |000 |(no subclass) | -|3B001 |3B |savepoint exception |001 |invalid specification | -|3B002 |3B |savepoint exception |002 |too many | -|3C000 |3C |ambiguous cursor name |000 |(no subclass) | -|3D000 |3D |invalid catalog name |000 |(no subclass) | -|3F000 |3F |invalid schema name |000 |(no subclass) | -|40000 |40 |transaction rollback |000 |(no subclass) | -|40001 |40 |transaction rollback |001 |serialization failure | -|40002 |40 |transaction rollback |002 |integrity constraint violation | -|40003 |40 |transaction rollback |003 |statement completion unknown | -|40004 |40 |transaction rollback |004 |triggered action exception | -|42000 |42 |syntax error or access rule violation |000 
|(no subclass) | -|44000 |44 |with check option violation |000 |(no subclass) | -|HZ000 |HZ |remote database access |000 |(no subclass) | +|SQLSTATE |Class|Condition |Subclass|Subcondition |Origin |Standard|Used By | +|---------|-----|--------------------------------------------------|--------|------------------------------------------------------------|---------------|--------|----------------------------------------------------------------------------| +|00000 |00 |successful completion |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|01000 |01 |warning |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation SQL/PSM SQL/XML SQL/JRT PostgreSQL Redshift Oracle SQL Server| +|01001 |01 |warning |001 |cursor operation conflict |SQL/Foundation |Y |SQL/Foundation Oracle SQL Server | +|01002 |01 |warning |002 |disconnect error |SQL/Foundation |Y |SQL/Foundation Oracle SQL Server | +|01003 |01 |warning |003 |null value eliminated in set function |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|01004 |01 |warning |004 |string data, right truncation |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Redshift Oracle SQL Server | +|01005 |01 |warning |005 |insufficient item descriptor areas |SQL/Foundation |Y |SQL/Foundation DB2 Oracle | +|01006 |01 |warning |006 |privilege not revoked |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|01007 |01 |warning |007 |privilege not granted |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|01008 |01 |Warning |008 |implicit_zero_bit_padding |PostgreSQL |N |PostgreSQL Redshift Oracle | +|01009 |01 |warning |009 |search condition too long for information schema |SQL/Foundation |Y |SQL/Foundation Oracle | +|0100A |01 |warning |00A |query expression too long for information schema |SQL/Foundation |Y |SQL/Foundation Oracle | +|0100B |01 |warning |00B |default value too long for information schema |SQL/Foundation |Y |SQL/Foundation | +|0100C |01 |warning |00C |result sets returned |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|0100D |01 |warning |00D |additional result sets returned |SQL/Foundation |Y |SQL/Foundation | +|0100E |01 |warning |00E |attempt to return too many result sets |SQL/Foundation |Y |SQL/Foundation DB2 | +|0100F |01 |warning |00F |statement too long for information schema |SQL/Foundation |Y |SQL/Foundation | +|01010 |01 |warning |010 |column cannot be mapped to XML |SQL/XML |Y |SQL/XML | +|01011 |01 |warning |011 |SQL-Java path too long for infor- mation schema |SQL/JRT |Y |SQL/JRT DB2 | +|01012 |01 |warning |012 |invalid number of conditions |SQL/Foundation |Y |SQL/Foundation | +|0102F |01 |warning |02F |array data, right truncation |SQL/Foundation |Y |SQL/Foundation | +|01503 |01 |Warning |503 |The number of result columns is larger than the number of variables provided.|DB2 |N |DB2 | +|01504 |01 |Warning |504 |The UPDATE or DELETE statement does not include a WHERE clause.|DB2 |N |DB2 | +|01505 |01 |Warning |505 |The statement was not executed because it is unacceptable in this environment.|DB2 |N |DB2 | +|01506 |01 |Warning |506 |An adjustment was made to a DATE or TIMESTAMP value to correct an invalid date resulting from an arithmetic operation.|DB2 |N |DB2 | +|01507 |01 |Warning |507 |One or more non-zero digits were eliminated from the fractional part of a number used as the operand of a multiply or divide operation.|DB2 |N |DB2 | +|01514 |01 |Warning |514 |The tablespace has been placed in the 
check-pending state. |DB2 |N |DB2 | +|01515 |01 |Warning |515 |The null value has been assigned to a variable, because the non-null value of the column is not within the range of the variable.|DB2 |N |DB2 | +|01516 |01 |Warning |516 |An inapplicable WITH GRANT OPTION has been ignored. |DB2 |N |DB2 | +|01517 |01 |Warning |517 |A character that could not be converted was replaced with a substitute character.|DB2 |N |DB2 | +|01519 |01 |Warning |519 |The null value has been assigned to a variable, because a numeric value is out of range.|DB2 |N |DB2 | +|01520 |01 |Warning |520 |The null value has been assigned to a variable, because the characters cannot be converted.|DB2 |N |DB2 | +|01521 |01 |Warning |521 |A specified server-name is undefined but is not needed until the statement is executed or the alias is used.|DB2 |N |DB2 | +|01522 |01 |Warning |522 |The local table or view name used in the CREATE ALIAS statement is undefined.|DB2 |N |DB2 | +|01523 |01 |Warning |523 |ALL was interpreted to exclude ALTER, INDEX, REFERENCES, and TRIGGER, because these privileges cannot be granted to a remote user.|DB2 |N |DB2 | +|01524 |01 |Warning |524 |The result of an aggregate function does not include the null values that were caused by evaluating the arithmetic expression implied by the column of the view.|DB2 |N |DB2 | +|01525 |01 |Warning |525 |The number of INSERT values is not the same as the number of columns.|DB2 |N |DB2 | +|01527 |01 |Warning |527 |A SET statement references a special register that does not exist at the AS.|DB2 |N |DB2 | +|01528 |01 |Warning |528 |WHERE NOT NULL is ignored, because the index key cannot contain null values.|DB2 |N |DB2 | +|01530 |01 |Warning |530 |Definition change may require a corresponding change on the read-only systems.|DB2 |N |DB2 | +|01532 |01 |Warning |532 |An undefined object name was detected. |DB2 |N |DB2 | +|01533 |01 |Warning |533 |An undefined column name was detected. |DB2 |N |DB2 | +|01537 |01 |Warning |537 |An SQL statement cannot be EXPLAINed, because it references a remote object.|DB2 |N |DB2 | +|01538 |01 |Warning |538 |The table cannot be subsequently defined as a dependent, because it has the maximum number of columns.|DB2 |N |DB2 | +|01539 |01 |Warning |539 |Connection is successful but only SBCS characters should be used.|DB2 |N |DB2 | +|01540 |01 |Warning |540 |A limit key has been truncated to 40 bytes. |DB2 |N |DB2 | +|01542 |01 |Warning |542 |Authorization ID does not have the privilege to perform the operation as specified.|DB2 |N |DB2 | +|01543 |01 |Warning |543 |A duplicate constraint has been ignored. |DB2 |N |DB2 | +|01545 |01 |Warning |545 |An unqualified column name has been interpreted as a correlated reference.|DB2 |N |DB2 | +|01546 |01 |Warning |546 |A column of the explanation table is improperly defined. |DB2 |N |DB2 | +|01548 |01 |Warning |548 |The authorization ID does not have the privilege to perform the specified operation on the identified object.|DB2 |N |DB2 | +|01551 |01 |Warning |551 |A table in a partitioned tablespace is not available, because its partitioned index has not been created.|DB2 |N |DB2 | +|01552 |01 |Warning |552 |An ambiguous qualified column name was resolved to the first of the duplicate names in the FROM clause.|DB2 |N |DB2 | +|01553 |01 |Warning |553 |Isolation level RR conflicts with a tablespace locksize of page.|DB2 |N |DB2 | +|01554 |01 |Warning |554 |Decimal multiplication may cause overflow. |DB2 |N |DB2 | +|01558 |01 |Warning |558 |A distribution protocol has been violated. 
|DB2 |N |DB2 | +|01560 |01 |Warning |560 |A redundant GRANT has been ignored. |DB2 |N |DB2 | +|01561 |01 |Warning |561 |An update to a data capture table was not signaled to the originating subsystem.|DB2 |N |DB2 | +|01565 |01 |Warning |565 |The null value has been assigned to a variable, because a miscellaneous data exception occurred. For example, the character value for the CAST, DECIMAL, FLOAT, or INTEGER scalar function is invalid; a floating-point NAN (not a number) was detected; invalid data in a packed decimal field was detected; or a mask mapping error was detected.|DB2 |N |DB2 | +|01566 |01 |Warning |566 |The object has been placed in a pending state. |DB2 |N |DB2 | +|01568 |01 |Warning |568 |The dynamic SQL statement ends with a semicolon. |DB2 |N |DB2 | +|01578 |01 |Warning |578 |The bind process detected operands of an operator that are not compatible.|DB2 |N |DB2 | +|01590 |01 |Warning |590 |Type 2 indexes do not have subpages. |DB2 |N |DB2 | +|01591 |01 |Warning |591 |The result of the positioned UPDATE or DELETE may depend on the order of the rows.|DB2 |N |DB2 | +|01594 |01 |Warning |594 |Insufficient number of entries in an SQLDA for ALL information (i.e. not enough descriptors to return the distinct name).|DB2 |N |DB2 | +|01596 |01 |Warning |596 |Comparison functions were not created for a distinct type based on a long string data type.|DB2 |N |DB2 | +|01597 |01 |Warning |597 |Specific and non-specific volume IDs are not allowed in a storage group.|DB2 |N |DB2 | +|01599 |01 |Warning |599 |Bind options were ignored on REBIND. |DB2 |N |DB2 | +|01600 |01 |Warning |600 |SUBPAGES ignored on alter of catalog index. |DB2 |N |DB2 | +|01602 |01 |Warning |602 |Optimization processing encountered a restriction that might have caused it to produce a sub-optimal result.|DB2 |N |DB2 | +|01604 |01 |Warning |604 |The SQL statement was explained and not executed. |DB2 |N |DB2 | +|01605 |01 |Warning |605 |A recursive common table expression may contain an infinite loop.|DB2 |N |DB2 | +|01608 |01 |Warning |608 |An unsupported value has been replaced. |DB2 |N |DB2 | +|01614 |01 |Warning |614 |There are fewer locators than the number of result sets. |DB2 |N |DB2 | +|01615 |01 |Warning |615 |A bind option was ignored. |DB2 |N |DB2 | +|01616 |01 |Warning |616 |The estimated CPU cost exceeds the resource limit. |DB2 |N |DB2 | +|01624 |01 |Warning |624 |The GBPCACHE specification is ignored because the buffer pool does not allow caching.|DB2 |N |DB2 | +|01625 |01 |Warning |625 |The schema name appears more than once in the CURRENT PATH. |DB2 |N |DB2 | +|01628 |01 |Warning |628 |The user-specified access path hints are invalid. The access path hints are ignored.|DB2 |N |DB2 | +|01629 |01 |Warning |629 |User-specified access path hints were used during access path selection.|DB2 |N |DB2 | +|01640 |01 |Warning |640 |ROLLBACK TO SAVEPOINT occurred when there were uncommitted INSERTs or DELETEs that cannot be rolled back.|DB2 |N |DB2 | +|01643 |01 |Warning |643 |Assignment to SQLCODE or SQLSTATE variable does not signal a warning or error.|DB2 |N |DB2 | +|01644 |01 |Warning |644 |DEFINE NO is not applicable for a lob space or data sets using the VCAT option.|DB2 |N |DB2 | +|01656 |01 |Warning |656 |ROLLBACK TO savepoint caused a NOT LOGGED table space to be placed in the LPL.|DB2 |N |DB2 | +|01658 |01 |Warning |658 |Binary data is invalid for DECRYPT_CHAR and DECYRYPT_DB. 
|DB2 |N |DB2 | +|01659 |01 |Warning |659 |A non-atomic statement successfully processed all requested rows with one or more warning conditions.|DB2 |N |DB2 | +|01663 |01 |Warning |663 |NOT PADDED clause is ignored for indexes created on auxiliary tables.|DB2 |N |DB2 | +|01664 |01 |Warning |664 |Option not specified following the ALTER PARTITION CLAUSE. |DB2 |N |DB2 | +|01665 |01 |Warning |665 |A name or label was truncated. |DB2 |N |DB2 | +|01666 |01 |Warning |666 |The last partition's limit key value is set to the highest or lowest possible value.|DB2 |N |DB2 | +|01668 |01 |Warning |668 |A rowset FETCH statement returned one or more rows of data, with one or more bind out processing error conditions. Use GET DIAGNOSTICS for more information.|DB2 |N |DB2 | +|01676 |01 |Warning |676 |Transfer operation ignored since the authorization ID is already the owner of the database object.|DB2 |N |DB2 | +|01679 |01 |Warning |679 |A trusted connection cannot be established for the specified system authorization ID.|DB2 |N |DB2 | +|01680 |01 |Warning |680 |The option is not supported in the context in which it was specified.|DB2 |N |DB2 | +|01681 |01 |Warning |681 |The trusted context is no longer defined to be used by specific attribute value.|DB2 |N |DB2 | +|01682 |01 |Warning |682 |The ability to use the trusted context was removed from some but not all authorization IDs specified in statement.|DB2 |N |DB2 | +|01683 |01 |Warning |683 |A SELECT containing a non-ATOMIC data change statement successfully returned some rows, but one or more warnings or errors occurred.|DB2 |N |DB2 | +|0168B |01 |Warning |68B |An operation was partially successful and partially unsuccessful. Use GET DIAGNOSTICS for more information.|DB2 |N |DB2 | +|0168C |01 |Warning |68C |A decimal float operation produced an inexact result. |DB2 |N |DB2 | +|0168D |01 |Warning |68D |A decimal floating point operation was invalid. |DB2 |N |DB2 | +|0168E |01 |Warning |68E |A decimal float operation produced an overflow or underflow.|DB2 |N |DB2 | +|0168F |01 |Warning |68F |A decimal float operation produced division by zero. |DB2 |N |DB2 | +|0168G |01 |Warning |68G |A decimal float operation produced a subnormal number. |DB2 |N |DB2 | +|0168L |01 |Warning |68L |No routine was found with the specified name and compatible arguments.|DB2 |N |DB2 | +|0168T |01 |Warning |68T |WITH ROW CHANGE COLUMNS ALWAYS DISTINCT was specified, but the database manager is unable to return distinct row change columns.|DB2 |N |DB2 | +|0168X |01 |Warning |68X |The combination of target namespace and schema location hint is not unique in the XML schema repository.|DB2 |N |DB2 | +|0168Z |01 |Warning |68Z |The statement was successfully prepared, but cannot be executed.|DB2 |N |DB2 | +|01694 |01 |Warning |694 |A deprecated feature has been ignored. |DB2 |N |DB2 | +|01695 |01 |Warning |695 |Adjustment made to a value for a period as a result of a data change operation.|DB2 |N |DB2 | +|0169A |01 |Warning |69A |A configuration parameter was overridden. |DB2 |N |DB2 | +|0169B |01 |Warning |69B |The operation was successful on the Db2 server, but may not have been successful on the accelerator server.|DB2 |N |DB2 | +|0169D |01 |Warning |69D |The accelerator does not exist. |DB2 |N |DB2 | +|01H54 |01 |Warning |H54 |The procedure has returned successfully but encountered an error in the format or content of a parameter. 
Information about the error in the parameter value is returned in an output parameter.|DB2 |N |DB2 | +|01H55 |01 |Warning |H55 |The procedure has returned successfully but encountered an internal processing error. Information about the internal error situation is returned in an output parameter.|DB2 |N |DB2 | +|01H56 |01 |Warning |H56 |The procedure has returned successfully but supports a higher version for a parameter than the one that was specified.|DB2 |N |DB2 | +|01H57 |01 |Warning |H57 |The procedure has returned output in an alternate locale instead of the locale specified.|DB2 |N |DB2 | +|01Hxx |01 |Warning |Hxx |Valid warning SQLSTATEs returned by a user-defined function, external procedure CALL, or command invocation.|DB2 |N |DB2 | +|01P01 |01 |Warning |P01 |deprecated_feature |PostgreSQL |N |PostgreSQL Redshift | +|01S00 |01 |Warning |S00 |Invalid connection string attribute |SQL Server |N |SQL Server | +|01S01 |01 |Warning |S01 |Error in row |SQL Server |N |SQL Server | +|01S02 |01 |Warning |S02 |Option value changed |SQL Server |N |SQL Server | +|01S06 |01 |Warning |S06 |Attempt to fetch before the result set returned the first rowset|SQL Server |N |SQL Server | +|01S07 |01 |Warning |S07 |Fractional truncation |SQL Server |N |SQL Server | +|01S08 |01 |Warning |S08 |Error saving File DSN |SQL Server |N |SQL Server | +|01S09 |01 |Warning |S09 |Invalid keyword |SQL Server |N |SQL Server | +|02000 |02 |no data |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle | +|02001 |02 |no data |001 |no additional result sets returned |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|02502 |02 |No Data |502 |Delete or update hole detected. |DB2 |N |DB2 | +|02504 |02 |No Data |504 |FETCH PRIOR ROWSET returned a partial rowset. 
|DB2 |N |DB2 | +|03000 |03 |SQL Statement Not Yet Complete |000 |sql_statement_not_yet_complete |PostgreSQL |N |PostgreSQL Redshift | +|07000 |07 |dynamic SQL error |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation Oracle | +|07001 |07 |dynamic SQL error |001 |using clause does not match dynamic parameter specifications|SQL/Foundation |Y |SQL/Foundation DB2 Oracle SQL Server | +|07002 |07 |dynamic SQL error |002 |using clause does not match target specifications |SQL/Foundation |Y |SQL/Foundation DB2 Oracle SQL Server | +|07003 |07 |dynamic SQL error |003 |cursor specification cannot be executed |SQL/Foundation |Y |SQL/Foundation DB2 Oracle | +|07004 |07 |dynamic SQL error |004 |using clause required for dynamic parameters |SQL/Foundation |Y |SQL/Foundation Oracle | +|07005 |07 |dynamic SQL error |005 |prepared statement not a cursor specification |SQL/Foundation |Y |SQL/Foundation DB2 Oracle SQL Server | +|07006 |07 |dynamic SQL error |006 |restricted data type attribute violation |SQL/Foundation |Y |SQL/Foundation Oracle SQL Server | +|07007 |07 |dynamic SQL error |007 |using clause required for result fields |SQL/Foundation |Y |SQL/Foundation Oracle | +|07008 |07 |dynamic SQL error |008 |invalid descriptor count |SQL/Foundation |Y |SQL/Foundation Oracle | +|07009 |07 |dynamic SQL error |009 |invalid descriptor index |SQL/Foundation |Y |SQL/Foundation Oracle SQL Server | +|0700B |07 |dynamic SQL error |00B |data type transform function violation |SQL/Foundation |Y |SQL/Foundation | +|0700C |07 |dynamic SQL error |00C |undefined DATA value |SQL/Foundation |Y |SQL/Foundation | +|0700D |07 |dynamic SQL error |00D |invalid DATA target |SQL/Foundation |Y |SQL/Foundation | +|0700E |07 |dynamic SQL error |00E |invalid LEVEL value |SQL/Foundation |Y |SQL/Foundation | +|0700F |07 |dynamic SQL error |00F |invalid DATETIME_INTERVAL_CODE |SQL/Foundation |Y |SQL/Foundation | +|0700G |07 |dynamic SQL error |00G |invalid pass-through surrogate value |SQL/Foundation |Y |SQL/Foundation | +|0700H |07 |dynamic SQL error |00H |PIPE ROW not during PTF execution |SQL/Foundation |Y |SQL/Foundation | +|07501 |07 |Dynamic SQL Error |501 |The option specified on PREPARE or EXECUTE is not valid. 
|DB2 |N |DB2 | +|07S01 |07 |dynamic SQL error |S01 |Invalid use of default parameter |SQL Server |N |SQL Server | +|08000 |08 |connection exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|08001 |08 |connection exception |001 |SQL-client unable to establish SQL-connection |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|08002 |08 |connection exception |002 |connection name in use |SQL/Foundation |Y |SQL/Foundation DB2 Oracle SQL Server | +|08003 |08 |connection exception |003 |connection does not exist |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|08004 |08 |connection exception |004 |SQL-server rejected establishment of SQL-connection |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|08006 |08 |connection exception |006 |connection failure |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|08007 |08 |connection exception |007 |transaction resolution unknown |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|08P01 |08 |Connection Exception |P01 |protocol_violation |PostgreSQL |N |PostgreSQL Redshift | +|08S01 |08 |connection exception |S01 |Communication link failure |SQL Server |N |SQL Server | +|09000 |09 |triggered action exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|0A000 |0A |feature not supported |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Redshift Oracle | +|0A001 |0A |feature not supported |001 |multiple server transactions |SQL/Foundation |Y |SQL/Foundation DB2 Oracle | +|0B000 |0B |Invalid Transaction Initiation |000 |invalid_transaction_initiation |PostgreSQL |N |PostgreSQL Redshift | +|0D000 |0D |invalid target type specification |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0E000 |0E |invalid schema name list specification |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0F000 |0F |locator exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|0F001 |0F |locator exception |001 |invalid specification |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|0K000 |0K |resignal when handler not active |000 |(no subclass) |SQL/PSM |Y |SQL/PSM DB2 | +|0L000 |0L |invalid grantor |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|0LP01 |0L |Invalid Grantor |P01 |invalid_grant_operation |PostgreSQL |N |PostgreSQL Redshift | +|0M000 |0M |invalid SQL-invoked procedure reference |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0N000 |0N |SQL/XML mapping error |000 |(no subclass) |SQL/XML |Y |SQL/XML | +|0N001 |0N |SQL/XML mapping error |001 |unmappable XML Name |SQL/XML |Y |SQL/XML | +|0N002 |0N |SQL/XML mapping error |002 |invalid XML character |SQL/XML |Y |SQL/XML DB2 | +|0P000 |0P |invalid role specification |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|0S000 |0S |invalid transform group name specification |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0T000 |0T |target table disagrees with cursor specification |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0U000 |0U |attempt to assign to non-updatable column |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0V000 |0V |attempt to assign to ordering column |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|0W000 |0W |prohibited statement encountered during trigger execution|000 |(no subclass) 
|SQL/Foundation |Y |SQL/Foundation | +|0W001 |0W |prohibited statement encountered during trigger execution|001 |modify table modified by data change delta table |SQL/Foundation |Y |SQL/Foundation | +|0X000 |0X |invalid foreign server specification |000 |(no subclass) |SQL/MED |Y |SQL/MED | +|0Y000 |0Y |pass-through specific condition |000 |(no subclass) |SQL/MED |Y |SQL/MED | +|0Y001 |0Y |pass-through specific condition |001 |invalid cursor option |SQL/MED |Y |SQL/MED | +|0Y002 |0Y |pass-through specific condition |002 |invalid cursor allocation |SQL/MED |Y |SQL/MED | +|0Z000 |0Z |diagnostics exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|0Z001 |0Z |diagnostics exception |001 |maximum number of stacked diagnostics areas exceeded |SQL/Foundation |Y |SQL/Foundation | +|0Z002 |0Z |diagnostics exception |002 |stacked diagnostics accessed without active handler |SQL/PSM |Y |SQL/PSM PostgreSQL Redshift | +|10000 |10 |XQuery error |000 |(no subclass) |SQL/XML |Y |SQL/XML | +|10501 |10 |XQuery Error |501 |An XQuery expression is missing the assignment of a static or dynamic context component.|DB2 |N |DB2 | +|10502 |10 |XQuery Error |502 |An error was encountered in the prolog of an XQuery expression.|DB2 |N |DB2 | +|10503 |10 |XQuery Error |503 |A duplicate name was defined in an XQuery or XPath expression.|DB2 |N |DB2 | +|10504 |10 |XQuery Error |504 |An XQuery namespace declaration specified an invalid URI. |DB2 |N |DB2 | +|10505 |10 |XQuery Error |505 |A character, token or clause is missing or invalid in an XQuery expression.|DB2 |N |DB2 | +|10506 |10 |XQuery Error |506 |An XQuery expression references a name that is not defined. |DB2 |N |DB2 | +|10507 |10 |XQuery Error |507 |A type error was encountered processing an XPath or XQuery expression.|DB2 |N |DB2 | +|10509 |10 |XQuery Error |509 |An unsupported XQuery language feature is specified. |DB2 |N |DB2 | +|10601 |10 |XQuery Error |601 |An arithmetic error was encountered processing an XQuery function or operator.|DB2 |N |DB2 | +|10602 |10 |XQuery Error |602 |A casting error was encountered processing an XQuery function or operator.|DB2 |N |DB2 | +|10606 |10 |XQuery Error |606 |There is no context item for processing an XQuery function or operator.|DB2 |N |DB2 | +|10608 |10 |XQuery Error |608 |An error was encountered in the argument of an XQuery function or operator.|DB2 |N |DB2 | +|10609 |10 |XQuery Error |609 |A regular expression error was encountered processing an XQuery function or operator.|DB2 |N |DB2 | +|10703 |10 |XQuery Error |703 |The target node of an XQuery basic updating expression is not valid.|DB2 |N |DB2 | +|11000 |11 |prohibited column reference encountered during trigger execution|000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|20000 |20 |case not found for case statement |000 |(no subclass) |SQL/PSM |Y |SQL/PSM PostgreSQL DB2 Redshift | +|21000 |21 |cardinality violation |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle | +|21501 |21 |Cardinality Violation |501 |A multiple-row INSERT into a self-referencing table is invalid.|DB2 |N |DB2 | +|21502 |21 |Cardinality Violation |502 |A multiple-row UPDATE of a primary key is invalid. 
|DB2 |N |DB2 | +|21S01 |21 |cardinality violation |S01 |Insert value list does not match column list |SQL Server |N |SQL Server | +|21S02 |21 |cardinality violation |S02 |Degree of derived table does not match column list |SQL Server |N |SQL Server | +|22000 |22 |data exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|22001 |22 |data exception |001 |string data, right truncation |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22002 |22 |data exception |002 |null value, no indicator parameter |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22003 |22 |data exception |003 |numeric value out of range |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22004 |22 |data exception |004 |null value not allowed |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|22005 |22 |data exception |005 |error in assignment |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|22006 |22 |data exception |006 |invalid interval format |SQL/Foundation |Y |SQL/Foundation | +|22007 |22 |data exception |007 |invalid datetime format |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22008 |22 |data exception |008 |datetime field overflow |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22009 |22 |data exception |009 |invalid time zone displacement value |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|2200B |22 |data exception |00B |escape character conflict |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2200C |22 |data exception |00C |invalid use of escape character |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2200D |22 |data exception |00D |invalid escape octet |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2200E |22 |data exception |00E |null value in array target |SQL/Foundation |Y |SQL/Foundation | +|2200F |22 |data exception |00F |zero-length character string |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2200G |22 |data exception |00G |most specific type mismatch |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2200H |22 |data exception |00H |sequence generator limit exceeded |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2200J |22 |data exception |00J |nonidentical notations with the same name |SQL/XML |Y |SQL/XML | +|2200K |22 |data exception |00K |nonidentical unparsed entities with the same name |SQL/XML |Y |SQL/XML | +|2200L |22 |data exception |00L |not an XML document |SQL/XML |Y |SQL/XML PostgreSQL DB2 | +|2200M |22 |data exception |00M |invalid XML document |SQL/XML |Y |SQL/XML PostgreSQL DB2 | +|2200N |22 |data exception |00N |invalid XML content |SQL/XML |Y |SQL/XML PostgreSQL | +|2200P |22 |data exception |00P |interval value out of range |SQL/Foundation |Y |SQL/Foundation | +|2200Q |22 |data exception |00Q |multiset value overflow |SQL/Foundation |Y |SQL/Foundation | +|2200R |22 |data exception |00R |XML value overflow |SQL/XML |Y |SQL/XML | +|2200S |22 |data exception |00S |invalid comment |SQL/XML |Y |SQL/XML PostgreSQL DB2 | +|2200T |22 |data exception |00T |invalid processing instruction |SQL/XML |Y |SQL/XML PostgreSQL DB2 | +|2200U |22 |data exception |00U |not an XQuery document node |SQL/XML |Y |SQL/XML | +|2200V |22 |data exception |00V |invalid XQuery context item |SQL/XML |Y |SQL/XML DB2 | +|2200W |22 |data exception |00W |XQuery serialization error |SQL/XML |Y |SQL/XML DB2 | +|22010 |22 
|data exception |010 |invalid indicator parameter value |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|22011 |22 |data exception |011 |substring error |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle | +|22012 |22 |data exception |012 |division by zero |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22013 |22 |data exception |013 |invalid preceding or following size in window function |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22014 |22 |data exception |014 |invalid argument for NTILE function |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 | +|22015 |22 |data exception |015 |interval field overflow |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|22016 |22 |data exception |016 |invalid argument for NTH_VALUE function |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 | +|22017 |22 |data exception |017 |invalid data specified for datalink |SQL/MED |Y |SQL/MED | +|22018 |22 |data exception |018 |invalid character value for cast |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22019 |22 |data exception |019 |invalid escape character |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|2201A |22 |data exception |01A |null argument passed to datalink constructor |SQL/MED |Y |SQL/MED | +|2201B |22 |data exception |01B |invalid regular expression |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2201C |22 |data exception |01C |null row not permitted in table |SQL/Foundation |Y |SQL/Foundation | +|2201D |22 |data exception |01D |datalink value exceeds maximum length |SQL/MED |Y |SQL/MED | +|2201E |22 |data exception |01E |invalid argument for natural logarithm |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2201F |22 |data exception |01F |invalid argument for power function |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2201G |22 |data exception |01G |invalid argument for width bucket function |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2201H |22 |data exception |01H |invalid row version |SQL/Foundation |Y |SQL/Foundation | +|2201J |22 |data exception |01J |XQuery sequence cannot be validated |SQL/XML |Y |SQL/XML | +|2201K |22 |data exception |01K |XQuery document node cannot be validated |SQL/XML |Y |SQL/XML | +|2201L |22 |data exception |01L |no XML schema found |SQL/XML |Y |SQL/XML | +|2201M |22 |data exception |01M |element namespace not declared |SQL/XML |Y |SQL/XML | +|2201N |22 |data exception |01N |global element not declared |SQL/XML |Y |SQL/XML | +|2201P |22 |data exception |01P |no XML element with the specified QName |SQL/XML |Y |SQL/XML | +|2201Q |22 |data exception |01Q |no XML element with the specified namespace |SQL/XML |Y |SQL/XML | +|2201R |22 |data exception |01R |validation failure |SQL/XML |Y |SQL/XML DB2 | +|2201S |22 |data exception |01S |invalid XQuery regular expression |SQL/Foundation |Y |SQL/Foundation | +|2201T |22 |data exception |01T |invalid XQuery option flag |SQL/Foundation |Y |SQL/Foundation | +|2201U |22 |data exception |01U |attempt to replace a zero-length string |SQL/Foundation |Y |SQL/Foundation | +|2201V |22 |data exception |01V |invalid XQuery replacement string |SQL/Foundation |Y |SQL/Foundation | +|2201W |22 |data exception |01W |invalid row count in fetch first clause |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2201X |22 |data exception |01X |invalid row count in result offset clause |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2201Y
|22 |data exception |01Y |zero-length binary string |SQL/Foundation |Y |SQL/Foundation | +|22020 |22 |data exception |020 |invalid period value |SQL/Foundation |Y |SQL/Foundation | +|22021 |22 |data exception |021 |character not in repertoire |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle | +|22022 |22 |data exception |022 |indicator overflow |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|22023 |22 |data exception |023 |invalid parameter value |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle | +|22024 |22 |data exception |024 |unterminated C string |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle | +|22025 |22 |data exception |025 |invalid escape sequence |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|22026 |22 |data exception |026 |string data, length mismatch |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|22027 |22 |data exception |027 |trim error |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|22029 |22 |data exception |029 |noncharacter in UCS string |SQL/Foundation |Y |SQL/Foundation | +|2202A |22 |data exception |02A |null value in field reference |SQL/PSM |Y |SQL/PSM | +|2202D |22 |data exception |02D |null value substituted for mutator subject parameter |SQL/Foundation |Y |SQL/Foundation | +|2202E |22 |data exception |02E |array element error |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2202F |22 |data exception |02F |array data, right truncation |SQL/Foundation |Y |SQL/Foundation | +|2202G |22 |data exception |02G |invalid repeat argument in a sample clause |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2202H |22 |data exception |02H |invalid sample size |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2202J |22 |data exception |02J |invalid argument for row pattern navigation operation |SQL/Foundation |Y |SQL/Foundation | +|2202K |22 |data exception |02K |skip to non-existent row |SQL/Foundation |Y |SQL/Foundation | +|2202L |22 |data exception |02L |skip to first row of match |SQL/Foundation |Y |SQL/Foundation | +|22030 |22 |data exception |030 |duplicate JSON object key value |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22031 |22 |data exception |031 |invalid argument for SQL/JSON datetime function |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22032 |22 |data exception |032 |invalid JSON text |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22033 |22 |data exception |033 |invalid SQL/JSON subscript |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22034 |22 |data exception |034 |more than one SQL/JSON item |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22035 |22 |data exception |035 |no SQL/JSON item |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22036 |22 |data exception |036 |non-numeric SQL/JSON item |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22037 |22 |data exception |037 |non-unique keys in a JSON object |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22038 |22 |data exception |038 |singleton SQL/JSON item required |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|22039 |22 |data exception |039 |SQL/JSON array not found |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2203A |22 |data exception |03A |SQL/JSON member not found |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2203B |22 |data exception |03B |SQL/JSON number not found |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2203C |22 |data exception |03C |SQL/JSON object not found |SQL/Foundation |Y |SQL/Foundation 
PostgreSQL | +|2203D |22 |data exception |03D |too many JSON array elements |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2203E |22 |data exception |03E |too many JSON object members |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2203F |22 |data exception |03F |SQL/JSON scalar required |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|2203G |22 |Data Exception |03G |sql_json_item_cannot_be_cast_to_target_type |PostgreSQL |N |PostgreSQL | +|2203J |22 |data exception |03J |MD-array null limit in subset |SQL/MD |Y |SQL/MD | +|2203K |22 |data exception |03K |MD-array null limit in MD-extent |SQL/MD |Y |SQL/MD | +|2203L |22 |data exception |03L |MD-array subset not within MD-extent |SQL/MD |Y |SQL/MD | +|2203M |22 |data exception |03M |MD-array duplicate coordinate in query constructor |SQL/MD |Y |SQL/MD | +|2203N |22 |data exception |03N |MD-array null coordinate in query constructor |SQL/MD |Y |SQL/MD | +|2203P |22 |data exception |03P |MD-array coordinate not within specified MD-extent |SQL/MD |Y |SQL/MD | +|2203Q |22 |data exception |03Q |MD-array source MD-extent not strictly within maximum target MD-extent|SQL/MD |Y |SQL/MD | +|2203R |22 |data exception |03R |MD-array operands with non-matching MD-extents |SQL/MD |Y |SQL/MD | +|2203T |22 |data exception |03T |MD-array invalid MD-axis |SQL/MD |Y |SQL/MD | +|2203U |22 |data exception |03U |MD-array lower limit greater than upper limit |SQL/MD |Y |SQL/MD | +|2203V |22 |data exception |03V |MD-array axis name not unique in MD-extent |SQL/MD |Y |SQL/MD | +|2203X |22 |data exception |03X |MD-array element error |SQL/MD |Y |SQL/MD | +|2203Y |22 |data exception |03Y |MD-array decoding error |SQL/MD |Y |SQL/MD | +|2203Z |22 |data exception |03Z |MD-array encoding error |SQL/MD |Y |SQL/MD | +|22040 |22 |data exception |040 |MD-array element reference not within MD-extent |SQL/MD |Y |SQL/MD | +|22041 |22 |data exception |041 |MD-array null value in MD-array target |SQL/MD |Y |SQL/MD | +|22042 |22 |data exception |042 |MD-array source MD-extent not strictly within target MD-extent|SQL/MD |Y |SQL/MD | +|22043 |22 |data exception |043 |MD-array target MD-extent not strictly within maximum MD-extent|SQL/MD |Y |SQL/MD | +|22044 |22 |data exception |044 |MD-array limit in MD-extent out of bounds |SQL/MD |Y |SQL/MD | +|22501 |22 |Data Exception |501 |The length control field of a variable length string is negative or greater than the maximum.|DB2 |N |DB2 | +|22502 |22 |Data Exception |502 |Signalling NaN was encountered. |DB2 |N |DB2 | +|22503 |22 |Data Exception |503 |The string representation of a name is invalid. |DB2 |N |DB2 | +|22504 |22 |Data Exception |504 |A mixed data value is invalid. |DB2 |N |DB2 | +|22505 |22 |Data Exception |505 |The local date or time length has been increased, but the executing program relies on the old length.|DB2 |N |DB2 | +|22506 |22 |Data Exception |506 |A reference to a datetime special register is invalid, because the clock is malfunctioning or the operating system time zone parameter is out of range.|DB2 |N |DB2 | +|22508 |22 |Data Exception |508 |CURRENT PACKAGESET is blank. |DB2 |N |DB2 | +|22511 |22 |Data Exception |511 |The value for a ROWID or reference column is not valid.
|DB2 |N |DB2 | +|22512 |22 |Data Exception |512 |A variable in a predicate is invalid, because its indicator variable is negative.|DB2 |N |DB2 | +|22522 |22 |Data Exception |522 |A CCSID value is not valid at all, not valid for the data type or subtype, or not valid for the encoding scheme.|DB2 |N |DB2 | +|22525 |22 |Data Exception |525 |Partitioning key value is not valid. |DB2 |N |DB2 | +|22527 |22 |Data Exception |527 |Invalid input data detected for a multiple-row insert. |DB2 |N |DB2 | +|22528 |22 |Data Exception |528 |Binary data is invalid for DECRYPT_CHAR and DECRYPT_DB. |DB2 |N |DB2 | +|22529 |22 |Data Exception |529 |A non-atomic statement successfully completed for at least one row, but one or more errors occurred.|DB2 |N |DB2 | +|22530 |22 |Data Exception |530 |A non-atomic statement attempted to process multiple rows of data, but no row was inserted and one or more errors occurred.|DB2 |N |DB2 | +|22531 |22 |Data Exception |531 |The argument of a built-in or system provided routine resulted in an error.|DB2 |N |DB2 | +|22532 |22 |Data Exception |532 |An XSROBJECT is not found in the XML schema repository. |DB2 |N |DB2 | +|22533 |22 |Data Exception |533 |A unique XSROBJECT could not be found in the XML schema repository.|DB2 |N |DB2 | +|22534 |22 |Data Exception |534 |An XML schema document is not connected to the other XML schema documents using an include or redefine.|DB2 |N |DB2 | +|22537 |22 |Data Exception |537 |A rowset FETCH statement returned one or more rows of data, with one or more non-terminating error conditions. Use GET DIAGNOSTICS for more information.|DB2 |N |DB2 | +|22539 |22 |Data Exception |539 |Invalid use of extended indicator parameter value. |DB2 |N |DB2 | +|22541 |22 |Data Exception |541 |The binary XML value contains unrecognized data. |DB2 |N |DB2 | +|22542 |22 |Data Exception |542 |The INSERT or UPDATE is not allowed because a resulting row does not satisfy row permissions.|DB2 |N |DB2 | +|22544 |22 |Data Exception |544 |The binary XML value contains a version that is not supported.|DB2 |N |DB2 | +|22546 |22 |Data Exception |546 |The value for a routine argument is not valid. |DB2 |N |DB2 | +|22547 |22 |Data Exception |547 |Multiple result values cannot be returned from the scalar function.|DB2 |N |DB2 | +|225DE |22 |Data Exception |5DE |An XML schema cannot be enabled for decomposition. |DB2 |N |DB2 | +|22P01 |22 |Data Exception |P01 |floating_point_exception |PostgreSQL |N |PostgreSQL Redshift | +|22P02 |22 |Data Exception |P02 |invalid_text_representation |PostgreSQL |N |PostgreSQL Redshift | +|22P03 |22 |Data Exception |P03 |invalid_binary_representation |PostgreSQL |N |PostgreSQL Redshift | +|22P04 |22 |Data Exception |P04 |bad_copy_file_format |PostgreSQL |N |PostgreSQL Redshift | +|22P05 |22 |Data Exception |P05 |untranslatable_character |PostgreSQL |N |PostgreSQL Redshift | +|22P06 |22 |Data Exception |P06 |nonstandard_use_of_escape_character |PostgreSQL |N |PostgreSQL Redshift | +|23000 |23 |integrity constraint violation |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|23001 |23 |integrity constraint violation |001 |restrict violation |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|23502 |23 |Constraint Violation |502 |An insert or update value is null, but the column cannot contain null values.|DB2 |N |PostgreSQL DB2 Redshift | +|23503 |23 |Constraint Violation |503 |The insert or update value of a foreign key is invalid.
|DB2 |N |PostgreSQL DB2 Redshift | +|23504 |23 |Constraint Violation |504 |The update or delete of a parent key is prevented by a NO ACTION update or delete rule.|DB2 |N |DB2 | +|23505 |23 |Constraint Violation |505 |A violation of the constraint imposed by a unique index or a unique constraint occurred.|DB2 |N |PostgreSQL DB2 Redshift | +|23506 |23 |Constraint Violation |506 |A violation of a constraint imposed by an edit or validation procedure occurred.|DB2 |N |DB2 | +|23507 |23 |Constraint Violation |507 |A violation of a constraint imposed by a field procedure occurred.|DB2 |N |DB2 | +|23508 |23 |Constraint Violation |508 |A violation of a constraint imposed by the DDL Registration Facility occurred.|DB2 |N |DB2 | +|23509 |23 |Constraint Violation |509 |The owner of the package has constrained its use to environments which do not include that of the application process.|DB2 |N |DB2 | +|23510 |23 |Constraint Violation |510 |A violation of a constraint on the use of the command imposed by the RLST table occurred.|DB2 |N |DB2 | +|23511 |23 |Constraint Violation |511 |A parent row cannot be deleted, because the check constraint restricts the deletion.|DB2 |N |DB2 | +|23512 |23 |Constraint Violation |512 |The check constraint cannot be added, because the table contains rows that do not satisfy the constraint definition.|DB2 |N |DB2 | +|23513 |23 |Constraint Violation |513 |The resulting row of the INSERT or UPDATE does not conform to the check constraint definition.|DB2 |N |DB2 | +|23514 |23 |Integrity Constraint Violation |514 |check_violation |PostgreSQL |N |PostgreSQL Redshift | +|23515 |23 |Constraint Violation |515 |The unique index could not be created or unique constraint added, because the table contains duplicate values of the specified key.|DB2 |N |DB2 | +|23522 |23 |Constraint Violation |522 |The range of values for the identity column or sequence is exhausted.|DB2 |N |DB2 | +|23523 |23 |Constraint Violation |523 |An invalid value has been provided for the SECURITY LABEL column.|DB2 |N |DB2 | +|23525 |23 |Constraint Violation |525 |A violation of a constraint imposed by an XML values index occurred.|DB2 |N |DB2 | +|23526 |23 |Constraint Violation |526 |An XML values index could not be created because the table data contains values that violate a constraint imposed by the index.|DB2 |N |DB2 | +|23P01 |23 |Integrity Constraint Violation |P01 |exclusion_violation |PostgreSQL |N |PostgreSQL | +|24000 |24 |invalid cursor state |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|24501 |24 |Invalid Cursor State |501 |The identified cursor is not open. |DB2 |N |DB2 | +|24502 |24 |Invalid Cursor State |502 |The cursor identified in an OPEN statement is already open. |DB2 |N |DB2 | +|24504 |24 |Invalid Cursor State |504 |The cursor identified in the UPDATE, DELETE, SET, or GET statement is not positioned on a row.|DB2 |N |DB2 | +|24506 |24 |Invalid Cursor State |506 |The statement identified in the PREPARE is the statement of an open cursor.|DB2 |N |DB2 | +|24510 |24 |Invalid Cursor State |510 |An UPDATE or DELETE operation was attempted against a delete or update hole|DB2 |N |DB2 | +|24512 |24 |Invalid Cursor State |512 |The result table does not agree with the base table. |DB2 |N |DB2 | +|24513 |24 |Invalid Cursor State |513 |FETCH NEXT, PRIOR, CURRENT, or RELATIVE is not allowed, because the cursor position is not known.|DB2 |N |DB2 | +|24516 |24 |Invalid Cursor State |516 |A cursor has already been assigned to a result set. 
|DB2 |N |DB2 | +|24517 |24 |Invalid Cursor State |517 |A cursor was left open by a function or method. |DB2 |N |DB2 | +|24518 |24 |Invalid Cursor State |518 |A cursor is not defined to handle row sets, but a rowset was requested.|DB2 |N |DB2 | +|24519 |24 |Invalid Cursor State |519 |A hole was detected on a multiple-row FETCH statement, but indicator variables were not provided.|DB2 |N |DB2 | +|24520 |24 |Invalid Cursor State |520 |The cursor identified in the UPDATE or DELETE statement is not positioned on a rowset.|DB2 |N |DB2 | +|24521 |24 |Invalid Cursor State |521 |A positioned DELETE or UPDATE statement specified a row of a rowset, but the row is not contained within the current rowset.|DB2 |N |DB2 | +|24522 |24 |Invalid Cursor State |522 |The fetch orientation is inconsistent with the definition of the cursor and whether rowsets are supported for the cursor.|DB2 |N |DB2 | +|24524 |24 |Invalid Cursor State |524 |A FETCH CURRENT CONTINUE was requested, but there is no truncated LOB or XML data to return.|DB2 |N |DB2 | +|25000 |25 |invalid transaction state |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|25001 |25 |invalid transaction state |001 |active SQL-transaction |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25002 |25 |invalid transaction state |002 |branch transaction already active |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25003 |25 |invalid transaction state |003 |inappropriate access mode for branch transaction |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25004 |25 |invalid transaction state |004 |inappropriate isolation level for branch transaction |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25005 |25 |invalid transaction state |005 |no active SQL-transaction for branch transaction |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25006 |25 |invalid transaction state |006 |read-only SQL-transaction |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25007 |25 |invalid transaction state |007 |schema and data statement mixing not supported |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25008 |25 |invalid transaction state |008 |held cursor requires same isolation level |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|25P01 |25 |Invalid Transaction State |P01 |no_active_sql_transaction |PostgreSQL |N |PostgreSQL Redshift | +|25P02 |25 |Invalid Transaction State |P02 |in_failed_sql_transaction |PostgreSQL |N |PostgreSQL Redshift | +|25P03 |25 |Invalid Transaction State |P03 |idle_in_transaction_session_timeout |PostgreSQL |N |PostgreSQL | +|25S01 |25 |invalid transaction state |S01 |Transaction state |SQL Server |N |SQL Server | +|25S02 |25 |invalid transaction state |S02 |Transaction is still active |SQL Server |N |SQL Server | +|25S03 |25 |invalid transaction state |S03 |Transaction is rolled back |SQL Server |N |SQL Server | +|26000 |26 |invalid SQL statement name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|26501 |26 |Invalid SQL Statement Identifier |501 |The statement identified does not exist. 
|DB2 |N |DB2 | +|27000 |27 |triggered data change violation |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Oracle | +|27001 |27 |triggered data change violation |001 |modify table modified by data change delta table |SQL/Foundation |Y |SQL/Foundation | +|28000 |28 |invalid authorization specification |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|28P01 |28 |Invalid Authorization Specification |P01 |invalid_password |PostgreSQL |N |PostgreSQL | +|2A000 |2A |direct SQL syntax error or access rule violation |000 |direct SQL syntax error or access rule violation |Oracle |N |Oracle | +|2B000 |2B |dependent privilege descriptors still exist |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|2BP01 |2B |Dependent Privilege Descriptors Still Exist |P01 |dependent_objects_still_exist |PostgreSQL |N |PostgreSQL Redshift | +|2C000 |2C |invalid character set name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation Oracle | +|2C001 |2C |invalid character set name |001 |cannot drop SQL-session default character set |SQL/Foundation |Y |SQL/Foundation | +|2D000 |2D |invalid transaction termination |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle | +|2D521 |2D |Invalid Transaction Termination |521 |SQL COMMIT or ROLLBACK are invalid in the current operating environment.|DB2 |N |DB2 | +|2D528 |2D |Invalid Transaction Termination |528 |Dynamic COMMIT or COMMIT ON RETURN procedure is invalid for the application execution environment|DB2 |N |DB2 | +|2D529 |2D |Invalid Transaction Termination |529 |Dynamic ROLLBACK is invalid for the application execution environment.|DB2 |N |DB2 | +|2E000 |2E |invalid connection name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation Oracle | +|2F000 |2F |SQL routine exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2F002 |2F |SQL routine exception |002 |modifying SQL-data not permitted |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2F003 |2F |SQL routine exception |003 |prohibited SQL-statement attempted |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2F004 |2F |SQL routine exception |004 |reading SQL-data not permitted |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2F005 |2F |SQL routine exception |005 |function executed no return statement |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|2H000 |2H |invalid collation name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|30000 |30 |invalid SQL statement identifier |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|33000 |33 |invalid SQL descriptor name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation Oracle | +|34000 |34 |invalid cursor name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|35000 |35 |invalid condition number |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation DB2 Oracle | +|36000 |36 |cursor sensitivity exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation | +|36001 |36 |cursor sensitivity exception |001 |request rejected |SQL/Foundation |Y |SQL/Foundation DB2 | +|36002 |36 |cursor sensitivity exception |002 |request failed |SQL/Foundation |Y |SQL/Foundation | +|37000 |37 |dynamic SQL syntax error or access rule violation |000 |dynamic SQL syntax error or access rule violation |Oracle |N |Oracle | +|38000 |38 |external routine exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation 
PostgreSQL DB2 Redshift | +|38001 |38 |external routine exception |001 |containing SQL not permitted |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|38002 |38 |external routine exception |002 |modifying SQL-data not permitted |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|38003 |38 |external routine exception |003 |prohibited SQL-statement attempted |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|38004 |38 |external routine exception |004 |reading SQL-data not permitted |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|38503 |38 |External Function Exception |503 |A user-defined function or procedure has abnormally terminated (abend).|DB2 |N |DB2 | +|38504 |38 |External Function Exception |504 |A routine, trigger, or anonymous block has been interrupted by the user.|DB2 |N |DB2 | +|38505 |38 |External Function Exception |505 |An SQL statement is not allowed in a routine on a FINAL CALL.|DB2 |N |DB2 | +|38H01 |38 |External Function Exception |H01 |An IBM® MQ function failed to initialize. |DB2 |N |DB2 | +|38H02 |38 |External Function Exception |H02 |IBM MQ Application Messaging Interface failed to terminate the session.|DB2 |N |DB2 | +|38H03 |38 |External Function Exception |H03 |IBM MQ Application Messaging Interface failed to properly process a message.|DB2 |N |DB2 | +|38H04 |38 |External Function Exception |H04 |IBM MQ Application Messaging Interface failed in sending a message.|DB2 |N |DB2 | +|38H05 |38 |External Function Exception |H05 |IBM MQ Application Messaging Interface failed to read/receive a message.|DB2 |N |DB2 | +|38H06 |38 |External Function Exception |H06 |An IBM MQ Application Messaging Interface message was truncated.|DB2 |N |DB2 | +|38H10 |38 |External Function Exception |H10 |Error occurred during text search processing. |DB2 |N |DB2 | +|38H11 |38 |External Function Exception |H11 |Text search support is not available. |DB2 |N |DB2 | +|38H12 |38 |External Function Exception |H12 |Text search is not allowed on a column because a text search index does not exist on the column.|DB2 |N |DB2 | +|38H13 |38 |External Function Exception |H13 |A conflicting search administration procedure or command is running on the same text search index.|DB2 |N |DB2 | +|38H14 |38 |External Function Exception |H14 |Text search administration procedure error. 
|DB2 |N |DB2 | +|39000 |39 |external routine invocation exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift | +|39001 |39 |External Routine Invocation Exception |001 |invalid_sqlstate_returned |PostgreSQL |N |PostgreSQL Redshift | +|39004 |39 |external routine invocation exception |004 |null value not allowed |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift | +|39501 |39 |External Function Call Exception |501 |An output argument value returned from a function or a procedure was too long.|DB2 |N |DB2 | +|39P01 |39 |External Routine Invocation Exception |P01 |trigger_protocol_violated |PostgreSQL |N |PostgreSQL Redshift | +|39P02 |39 |External Routine Invocation Exception |P02 |srf_protocol_violated |PostgreSQL |N |PostgreSQL Redshift | +|39P03 |39 |External Routine Invocation Exception |P03 |event_trigger_protocol_violated |PostgreSQL |N |PostgreSQL | +|3B000 |3B |savepoint exception |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL | +|3B001 |3B |savepoint exception |001 |invalid specification |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 | +|3B002 |3B |savepoint exception |002 |too many |SQL/Foundation |Y |SQL/Foundation | +|3B501 |3B |Savepoint Exception |501 |A duplicate savepoint name was detected. |DB2 |N |DB2 | +|3B502 |3B |Savepoint Exception |502 |A RELEASE or ROLLBACK TO SAVEPOINT was specified, but a savepoint does not exist.|DB2 |N |DB2 | +|3B503 |3B |Savepoint Exception |503 |A SAVEPOINT, RELEASE SAVEPOINT, or ROLLBACK TO SAVEPOINT is not allowed in a trigger, function, or global transaction.|DB2 |N |DB2 | +|3C000 |3C |ambiguous cursor name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation DB2 Oracle SQL Server | +|3D000 |3D |invalid catalog name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|3F000 |3F |invalid schema name |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|40000 |40 |transaction rollback |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Oracle | +|40001 |40 |transaction rollback |001 |serialization failure |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Oracle SQL Server | +|40002 |40 |transaction rollback |002 |integrity constraint violation |SQL/Foundation |Y |SQL/Foundation PostgreSQL Oracle SQL Server | +|40003 |40 |transaction rollback |003 |statement completion unknown |SQL/Foundation |Y |SQL/Foundation PostgreSQL Oracle SQL Server | +|40004 |40 |transaction rollback |004 |triggered action exception |SQL/Foundation |Y |SQL/Foundation | +|40P01 |40 |Transaction Rollback |P01 |deadlock_detected |PostgreSQL |N |PostgreSQL | +|42000 |42 |syntax error or access rule violation |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL Redshift Oracle SQL Server | +|42501 |42 |Syntax Error or Access Rule Violation |501 |The authorization ID does not have the privilege to perform the specified operation on the identified object.|DB2 |N |PostgreSQL DB2 Redshift | +|42502 |42 |Syntax Error or Access Rule Violation |502 |The authorization ID does not have the privilege to perform the operation as specified.|DB2 |N |DB2 | +|42503 |42 |Syntax Error or Access Rule Violation |503 |The specified authorization ID or one of the authorization IDs of the application process is not allowed.|DB2 |N |DB2 | +|42504 |42 |Syntax Error or Access Rule Violation |504 |A specified privilege, security label, or exemption cannot be revoked from a specified authorization-name.|DB2 |N |DB2 | +|42505 |42 
|Syntax Error or Access Rule Violation |505 |Connection authorization failure occurred. |DB2 |N |DB2 | +|42506 |42 |Syntax Error or Access Rule Violation |506 |Owner authorization failure occurred. |DB2 |N |DB2 | +|42509 |42 |Syntax Error or Access Rule Violation |509 |SQL statement is not authorized, because of the DYNAMICRULES option.|DB2 |N |DB2 | +|42510 |42 |Syntax Error or Access Rule Violation |510 |The authorization ID does not have the privilege to create functions or procedures in the WLM environment.|DB2 |N |DB2 | +|42512 |42 |Syntax Error or Access Rule Violation |512 |The authorization ID does not have security to the protected column.|DB2 |N |DB2 | +|42513 |42 |Syntax Error or Access Rule Violation |513 |The authorization ID does not have the MLS WRITE-DOWN privilege.|DB2 |N |DB2 | +|42517 |42 |Syntax Error or Access Rule Violation |517 |The specified authorization ID is not allowed to use the trusted context.|DB2 |N |DB2 | +|42601 |42 |Syntax Error or Access Rule Violation |601 |A character, token, or clause is invalid or missing. |DB2 |N |PostgreSQL DB2 Redshift | +|42602 |42 |Syntax Error or Access Rule Violation |602 |A character that is invalid in a name has been detected. |DB2 |N |PostgreSQL DB2 Redshift | +|42603 |42 |Syntax Error or Access Rule Violation |603 |An unterminated string constant has been detected. |DB2 |N |DB2 | +|42604 |42 |Syntax Error or Access Rule Violation |604 |An invalid numeric or string constant has been detected. |DB2 |N |DB2 | +|42605 |42 |Syntax Error or Access Rule Violation |605 |The number of arguments specified for a scalar function is invalid.|DB2 |N |DB2 | +|42606 |42 |Syntax Error or Access Rule Violation |606 |An invalid hexadecimal constant has been detected. |DB2 |N |DB2 | +|42607 |42 |Syntax Error or Access Rule Violation |607 |An operand of an aggregate function or CONCAT operator is invalid.|DB2 |N |DB2 | +|42608 |42 |Syntax Error or Access Rule Violation |608 |The use of NULL or DEFAULT in VALUES or an assignment statement is invalid.|DB2 |N |DB2 | +|42609 |42 |Syntax Error or Access Rule Violation |609 |All operands of an operator or predicate are parameter markers.|DB2 |N |DB2 | +|42610 |42 |Syntax Error or Access Rule Violation |610 |A parameter marker or the null value is not allowed. |DB2 |N |DB2 | +|42611 |42 |Syntax Error or Access Rule Violation |611 |The column, argument, parameter, or global variable definition is invalid.|DB2 |N |PostgreSQL DB2 Redshift | +|42612 |42 |Syntax Error or Access Rule Violation |612 |The statement string is an SQL statement that is not acceptable in the context in which it is presented.|DB2 |N |DB2 | +|42613 |42 |Syntax Error or Access Rule Violation |613 |Clauses are mutually exclusive. |DB2 |N |DB2 | +|42614 |42 |Syntax Error or Access Rule Violation |614 |A duplicate keyword or clause is invalid. |DB2 |N |DB2 | +|42615 |42 |Syntax Error or Access Rule Violation |615 |An invalid alternative was detected. |DB2 |N |DB2 | +|42617 |42 |Syntax Error or Access Rule Violation |617 |The statement string is blank or empty. |DB2 |N |DB2 | +|42618 |42 |Syntax Error or Access Rule Violation |618 |A variable is not allowed. |DB2 |N |DB2 | +|42620 |42 |Syntax Error or Access Rule Violation |620 |Read-only SCROLL was specified with the UPDATE clause. |DB2 |N |DB2 | +|42621 |42 |Syntax Error or Access Rule Violation |621 |The check constraint or generated column expression is invalid.|DB2 |N |DB2 | +|42622 |42 |Syntax Error or Access Rule Violation |622 |A name or label is too long. 
|DB2 |N |PostgreSQL DB2 Redshift | +|42623 |42 |Syntax Error or Access Rule Violation |623 |A DEFAULT clause cannot be specified. |DB2 |N |DB2 | +|42625 |42 |Syntax Error or Access Rule Violation |625 |A CASE expression is invalid. |DB2 |N |DB2 | +|42626 |42 |Syntax Error or Access Rule Violation |626 |A column specification is not allowed for a CREATE INDEX that is built on an auxiliary table.|DB2 |N |DB2 | +|42629 |42 |Syntax Error or Access Rule Violation |629 |Parameter names must be specified for SQL routines. |DB2 |N |DB2 | +|42630 |42 |Syntax Error or Access Rule Violation |630 |An SQLSTATE or SQLCODE variable is not valid in this context.|DB2 |N |DB2 | +|42631 |42 |Syntax Error or Access Rule Violation |631 |An expression must be specified on a RETURN statement in an SQL function.|DB2 |N |DB2 | +|42633 |42 |Syntax Error or Access Rule Violation |633 |An AS clause is required for an argument of XMLATTRIBUTES or XMLFOREST.|DB2 |N |DB2 | +|42634 |42 |Syntax Error or Access Rule Violation |634 |The XML name is not valid. |DB2 |N |DB2 | +|42701 |42 |Syntax Error or Access Rule Violation |701 |The same target is specified more than once for assignment in the same SQL statement.|DB2 |N |PostgreSQL DB2 Redshift | +|42702 |42 |Syntax Error or Access Rule Violation |702 |A column reference is ambiguous, because of duplicate names.|DB2 |N |PostgreSQL DB2 Redshift | +|42703 |42 |Syntax Error or Access Rule Violation |703 |An undefined column or parameter name was detected. |DB2 |N |PostgreSQL DB2 Redshift | +|42704 |42 |Syntax Error or Access Rule Violation |704 |An undefined object or constraint name was detected. |DB2 |N |PostgreSQL DB2 Redshift | +|42705 |42 |Syntax Error or Access Rule Violation |705 |An undefined server-name was detected. |DB2 |N |DB2 | +|42707 |42 |Syntax Error or Access Rule Violation |707 |A column name in ORDER BY does not identify a column of the result table.|DB2 |N |DB2 | +|42708 |42 |Syntax Error or Access Rule Violation |708 |The locale specified in a SET LOCALE or locale sensitive function was not found.|DB2 |N |DB2 | +|42709 |42 |Syntax Error or Access Rule Violation |709 |A duplicate column name was specified in a key column list. |DB2 |N |DB2 | +|42710 |42 |Syntax Error or Access Rule Violation |710 |A duplicate object or constraint name was detected. |DB2 |N |PostgreSQL DB2 Redshift | +|42711 |42 |Syntax Error or Access Rule Violation |711 |A duplicate column name was detected in the object definition or ALTER TABLE statement.|DB2 |N |DB2 | +|42712 |42 |Syntax Error or Access Rule Violation |712 |A duplicate table designator was detected in the FROM clause or REFERENCING clause of a CREATE TRIGGER statement.|DB2 |N |PostgreSQL DB2 Redshift | +|42713 |42 |Syntax Error or Access Rule Violation |713 |A duplicate object was detected in a list or is the same as an existing object.|DB2 |N |DB2 | +|42714 |42 |Syntax Error or Access Rule Violation |714 |A host variable can be defined only once. |DB2 |N |DB2 | +|42718 |42 |Syntax Error or Access Rule Violation |718 |The local server name is not defined. |DB2 |N |DB2 | +|42721 |42 |Syntax Error or Access Rule Violation |721 |The special register name is unknown at the server. 
|DB2 |N |DB2 | +|42723 |42 |Syntax Error or Access Rule Violation |723 |A routine with the same signature already exists in the schema, module, or compound block where it is defined.|DB2 |N |PostgreSQL DB2 Redshift | +|42724 |42 |Syntax Error or Access Rule Violation |724 |Unable to access an external program used for a user-defined function or a procedure.|DB2 |N |DB2 | +|42725 |42 |Syntax Error or Access Rule Violation |725 |A routine was referenced directly (not by either signature or by specific instance name), but there is more than one specific instance of that routine.|DB2 |N |PostgreSQL DB2 Redshift | +|42726 |42 |Syntax Error or Access Rule Violation |726 |Duplicate names for common table expressions were detected. |DB2 |N |DB2 | +|42732 |42 |Syntax Error or Access Rule Violation |732 |A duplicate schema name in a special register was detected. |DB2 |N |DB2 | +|42734 |42 |Syntax Error or Access Rule Violation |734 |A duplicate parameter-name, SQL variable name, label, or condition-name was detected.|DB2 |N |DB2 | +|42736 |42 |Syntax Error or Access Rule Violation |736 |The label specified on the GOTO, ITERATE, or LEAVE statement is not found or not valid.|DB2 |N |DB2 | +|42737 |42 |Syntax Error or Access Rule Violation |737 |The condition specified is not defined. |DB2 |N |DB2 | +|42749 |42 |Syntax Error or Access Rule Violation |749 |An XML schema document with the same target namespace and schema location already exists for the XML schema.|DB2 |N |DB2 | +|4274A |42 |Syntax Error or Access Rule Violation |74A |An XSROBJECT is not found in the XML schema repository. |DB2 |N |DB2 | +|4274B |42 |Syntax Error or Access Rule Violation |74B |A unique XSROBJECT could not be found in the XML schema repository.|DB2 |N |DB2 | +|4274C |42 |Syntax Error or Access Rule Violation |74C |The specified attribute was not found in the trusted context.|DB2 |N |DB2 | +|4274D |42 |Syntax Error or Access Rule Violation |74D |The specified attribute already exists in the trusted context.|DB2 |N |DB2 | +|4274E |42 |Syntax Error or Access Rule Violation |74E |The specified attribute is not supported in the trusted context.|DB2 |N |DB2 | +|4274M |42 |Syntax Error or Access Rule Violation |74M |An undefined period name was detected. |DB2 |N |DB2 | +|42801 |42 |Syntax Error or Access Rule Violation |801 |Isolation level UR is invalid, because the result table is not read-only.|DB2 |N |DB2 | +|42802 |42 |Syntax Error or Access Rule Violation |802 |The number of target values is not the same as the number of source values.|DB2 |N |DB2 | +|42803 |42 |Syntax Error or Access Rule Violation |803 |A column reference in the SELECT or HAVING clause is invalid, because it is not a grouping column; or a column reference in the GROUP BY clause is invalid.|DB2 |N |PostgreSQL DB2 Redshift | +|42804 |42 |Syntax Error or Access Rule Violation |804 |The result expressions in a CASE expression are not compatible.|DB2 |N |PostgreSQL DB2 Redshift | +|42805 |42 |Syntax Error or Access Rule Violation |805 |An integer in the ORDER BY clause does not identify a column of the result table.|DB2 |N |DB2 | +|42806 |42 |Syntax Error or Access Rule Violation |806 |A value cannot be assigned to a variable, because the data types are not compatible.|DB2 |N |DB2 | +|42807 |42 |Syntax Error or Access Rule Violation |807 |The data-change statement is not permitted on this object. 
|DB2 |N |DB2 | +|42808 |42 |Syntax Error or Access Rule Violation |808 |A column identified in the INSERT or UPDATE operation is not updatable.|DB2 |N |DB2 | +|42809 |42 |Syntax Error or Access Rule Violation |809 |The identified object is not the type of object to which the statement applies.|DB2 |N |PostgreSQL DB2 Redshift | +|42810 |42 |Syntax Error or Access Rule Violation |810 |A base table is not identified in a FOREIGN KEY clause. |DB2 |N |DB2 | +|42811 |42 |Syntax Error or Access Rule Violation |811 |The number of columns specified is not the same as the number of columns in the SELECT clause.|DB2 |N |DB2 | +|42813 |42 |Syntax Error or Access Rule Violation |813 |WITH CHECK OPTION cannot be used for the specified view. |DB2 |N |DB2 | +|42814 |42 |Syntax Error or Access Rule Violation |814 |The column cannot be dropped because it is the only column in the table.|DB2 |N |DB2 | +|42815 |42 |Syntax Error or Access Rule Violation |815 |The data type, length, scale, value, or CCSID is invalid. |DB2 |N |DB2 | +|42816 |42 |Syntax Error or Access Rule Violation |816 |A datetime value or duration in an expression is invalid. |DB2 |N |DB2 | +|42817 |42 |Syntax Error or Access Rule Violation |817 |The column cannot be dropped because a view or constraint is dependent on the column, the column is part of a partitioning key, or the column is a security label column.|DB2 |N |DB2 | +|42818 |42 |Syntax Error or Access Rule Violation |818 |The operands of an operator or function are not compatible or comparable.|DB2 |N |DB2 | +|42819 |42 |Syntax Error or Access Rule Violation |819 |An operand of an arithmetic operation or an operand of a function that requires a number is invalid.|DB2 |N |DB2 | +|42820 |42 |Syntax Error or Access Rule Violation |820 |A numeric constant is too long, or it has a value that is not within the range of its data type.|DB2 |N |DB2 | +|42821 |42 |Syntax Error or Access Rule Violation |821 |A data type for an assignment to a column or variable is not compatible with the data type.|DB2 |N |DB2 | +|42822 |42 |Syntax Error or Access Rule Violation |822 |An expression in the ORDER BY clause or GROUP BY clause is not valid.|DB2 |N |DB2 | +|42823 |42 |Syntax Error or Access Rule Violation |823 |Multiple columns are returned from a subquery that only allows one column.|DB2 |N |DB2 | +|42824 |42 |Syntax Error or Access Rule Violation |824 |An operand of LIKE is not a string, or the first operand is not a column.|DB2 |N |DB2 | +|42825 |42 |Syntax Error or Access Rule Violation |825 |The rows of UNION, INTERSECT, EXCEPT, or VALUES do not have compatible columns.|DB2 |N |DB2 | +|42826 |42 |Syntax Error or Access Rule Violation |826 |The rows of UNION, INTERSECT, EXCEPT, or VALUES do not have the same number of columns.|DB2 |N |DB2 | +|42827 |42 |Syntax Error or Access Rule Violation |827 |The table identified in the UPDATE or DELETE is not the same table designated by the cursor.|DB2 |N |DB2 | +|42828 |42 |Syntax Error or Access Rule Violation |828 |The table designated by the cursor of the UPDATE or DELETE statement cannot be modified, or the cursor is read-only.|DB2 |N |DB2 | +|42829 |42 |Syntax Error or Access Rule Violation |829 |FOR UPDATE OF is invalid, because the result table designated by the cursor cannot be modified.|DB2 |N |DB2 | +|42830 |42 |Syntax Error or Access Rule Violation |830 |The foreign key does not conform to the description of the parent key.|DB2 |N |PostgreSQL DB2 Redshift | +|42831 |42 |Syntax Error or Access Rule Violation |831 |Null values are not allowed 
in a column of a primary key, a column of a unique key, a ROWID column, a row change timestamp column, a row-begin column, a row-end column, or a column of an application period.|DB2 |N |DB2 | +|42832 |42 |Syntax Error or Access Rule Violation |832 |The operation is not allowed on system objects. |DB2 |N |DB2 | +|42834 |42 |Syntax Error or Access Rule Violation |834 |SET NULL cannot be specified, because no column of the foreign key can be assigned the null value.|DB2 |N |DB2 | +|42835 |42 |Syntax Error or Access Rule Violation |835 |Cyclic references cannot be specified between named derived tables.|DB2 |N |DB2 | +|42836 |42 |Syntax Error or Access Rule Violation |836 |The specification of a recursive, named derived table is invalid.|DB2 |N |DB2 | +|42837 |42 |Syntax Error or Access Rule Violation |837 |The column cannot be altered, because its attributes are not compatible with the current column attributes.|DB2 |N |DB2 | +|42842 |42 |Syntax Error or Access Rule Violation |842 |A column or parameter definition is invalid, because a specified option is inconsistent with the column description.|DB2 |N |DB2 | +|42845 |42 |Syntax Error or Access Rule Violation |845 |An invalid use of a NOT DETERMINISTIC or EXTERNAL ACTION function was detected.|DB2 |N |DB2 | +|42846 |42 |Syntax Error or Access Rule Violation |846 |Cast from source type to target type is not supported. |DB2 |N |PostgreSQL DB2 Redshift | +|42849 |42 |Syntax Error or Access Rule Violation |849 |The specified option is not supported for the routine type. |DB2 |N |DB2 | +|42852 |42 |Syntax Error or Access Rule Violation |852 |The privileges specified in GRANT or REVOKE are invalid or inconsistent. (For example, GRANT ALTER on a view.)|DB2 |N |DB2 | +|42855 |42 |Syntax Error or Access Rule Violation |855 |The assignment of the LOB or XML to this variable is not allowed. The target variable for all fetches of a LOB or XML value for this cursor must be the same for all FETCHes.|DB2 |N |DB2 | +|42856 |42 |Syntax Error or Access Rule Violation |856 |The alter of a CCSID to the specified CCSID is not valid. |DB2 |N |DB2 | +|42866 |42 |Syntax Error or Access Rule Violation |866 |The data type in either the RETURNS clause or the CAST FROM clause in the CREATE FUNCTION statement is not appropriate for the data type returned from the sourced function or RETURN statement in the function body.|DB2 |N |DB2 | +|42872 |42 |Syntax Error or Access Rule Violation |872 |FETCH statement clauses are incompatible with the cursor definition.|DB2 |N |DB2 | +|42873 |42 |Syntax Error or Access Rule Violation |873 |An invalid number of rows was specified in a multiple-row FETCH or multiple-row INSERT.|DB2 |N |DB2 | +|42877 |42 |Syntax Error or Access Rule Violation |877 |The column name cannot be qualified. 
|DB2 |N |DB2 | +|42878 |42 |Syntax Error or Access Rule Violation |878 |An invalid function or procedure name was used with the EXTERNAL keyword.|DB2 |N |DB2 | +|42879 |42 |Syntax Error or Access Rule Violation |879 |The data type of one or more input parameters in the CREATE FUNCTION statement is not appropriate for the corresponding data type in the source function.|DB2 |N |DB2 | +|42880 |42 |Syntax Error or Access Rule Violation |880 |The CAST TO and CAST FROM data types are incompatible, or would always result in truncation of a fixed string.|DB2 |N |DB2 | +|42882 |42 |Syntax Error or Access Rule Violation |882 |The specific instance name qualifier is not equal to the function name qualifier.|DB2 |N |DB2 | +|42883 |42 |Syntax Error or Access Rule Violation |883 |No routine was found with a matching signature. |DB2 |N |PostgreSQL DB2 Redshift | +|42884 |42 |Syntax Error or Access Rule Violation |884 |No routine was found with the specified name and compatible arguments.|DB2 |N |DB2 | +|42885 |42 |Syntax Error or Access Rule Violation |885 |The number of input parameters specified on a CREATE FUNCTION statement does not match the number provided by the function named in the SOURCE clause.|DB2 |N |DB2 | +|42886 |42 |Syntax Error or Access Rule Violation |886 |The IN, OUT, or INOUT parameter attributes do not match. |DB2 |N |DB2 | +|42887 |42 |Syntax Error or Access Rule Violation |887 |The function or table-reference is not valid in the context where it occurs.|DB2 |N |DB2 | +|42888 |42 |Syntax Error or Access Rule Violation |888 |The table does not have a primary key. |DB2 |N |DB2 | +|42889 |42 |Syntax Error or Access Rule Violation |889 |The table already has a primary key. |DB2 |N |DB2 | +|42890 |42 |Syntax Error or Access Rule Violation |890 |A column list was specified in the references clause, but the identified parent table does not have a unique constraint with the specified column names.|DB2 |N |DB2 | +|42891 |42 |Syntax Error or Access Rule Violation |891 |A duplicate constraint already exists. |DB2 |N |DB2 | +|42893 |42 |Syntax Error or Access Rule Violation |893 |The object or constraint cannot be dropped, altered, or transferred or authorities cannot be revoked from the object, because other objects are dependent on it.|DB2 |N |DB2 | +|42894 |42 |Syntax Error or Access Rule Violation |894 |The value of a column or sequence attribute is invalid. |DB2 |N |DB2 | +|42895 |42 |Syntax Error or Access Rule Violation |895 |For static SQL, an input variable cannot be used, because its data type is not compatible with the parameter of a procedure or user-defined function.|DB2 |N |DB2 | +|42898 |42 |Syntax Error or Access Rule Violation |898 |An invalid correlated reference or transition table was detected in a trigger.|DB2 |N |DB2 | +|42899 |42 |Syntax Error or Access Rule Violation |899 |Correlated references and column names are not allowed for triggered actions with the FOR EACH STATEMENT clause.|DB2 |N |DB2 | +|428A1 |42 |Syntax Error or Access Rule Violation |8A1 |Unable to access a file referenced by a file reference variable.|DB2 |N |DB2 | +|428B0 |42 |Syntax Error or Access Rule Violation |8B0 |Nesting not valid in ROLLUP, CUBE, or GROUPING SETs. |DB2 |N |DB2 | +|428B3 |42 |Syntax Error or Access Rule Violation |8B3 |An invalid SQLSTATE was specified. |DB2 |N |DB2 | +|428B4 |42 |Syntax Error or Access Rule Violation |8B4 |The part clause of a LOCK TABLE statement is not valid. 
|DB2 |N |DB2 | +|428B7 |42 |Syntax Error or Access Rule Violation |8B7 |A number specified in an SQL statement is out of the valid range.|DB2 |N |DB2 | +|428C1 |42 |Syntax Error or Access Rule Violation |8C1 |The data type or attribute of a column can only be specified once for a table.|DB2 |N |DB2 | +|428C2 |42 |Syntax Error or Access Rule Violation |8C2 |Examination of the function body indicates that the given clause should have been specified on the CREATE FUNCTION statement.|DB2 |N |DB2 | +|428C4 |42 |Syntax Error or Access Rule Violation |8C4 |The number of elements on each side of the predicate operator is not the same.|DB2 |N |DB2 | +|428C7 |42 |Syntax Error or Access Rule Violation |8C7 |A ROWID or reference column specification is not valid or used in an invalid context.|DB2 |N |DB2 | +|428C9 |42 |Syntax Error or Access Rule Violation |8C9 |A column defined as GENERATED ALWAYS cannot be specified as the target column of an insert or update operation.|DB2 |N |PostgreSQL DB2 | +|428D2 |42 |Syntax Error or Access Rule Violation |8D2 |AS LOCATOR cannot be specified for a non-LOB parameter. |DB2 |N |DB2 | +|428D3 |42 |Syntax Error or Access Rule Violation |8D3 |GENERATED is not allowed for the specified data type or attribute of a column.|DB2 |N |DB2 | +|428D4 |42 |Syntax Error or Access Rule Violation |8D4 |A cursor specified in a FOR statement cannot be referenced in an OPEN, CLOSE, or FETCH statement.|DB2 |N |DB2 | +|428D5 |42 |Syntax Error or Access Rule Violation |8D5 |The ending label does not match the beginning label. |DB2 |N |DB2 | +|428D6 |42 |Syntax Error or Access Rule Violation |8D6 |UNDO is not allowed for NOT ATOMIC compound statements. |DB2 |N |DB2 | +|428D7 |42 |Syntax Error or Access Rule Violation |8D7 |The condition value is not allowed. |DB2 |N |DB2 | +|428D8 |42 |Syntax Error or Access Rule Violation |8D8 |The sqlcode or sqlstate variable declaration is not valid. |DB2 |N |DB2 | +|428EC |42 |Syntax Error or Access Rule Violation |8EC |The fullselect specified for the materialized query table is not valid.|DB2 |N |DB2 | +|428EK |42 |Syntax Error or Access Rule Violation |8EK |The schema qualifier is not valid. |DB2 |N |DB2 | +|428EW |42 |Syntax Error or Access Rule Violation |8EW |The table cannot be converted to or from a materialized query table.|DB2 |N |DB2 | +|428F2 |42 |Syntax Error or Access Rule Violation |8F2 |An integer expression must be specified on a RETURN statement in an SQL procedure.|DB2 |N |DB2 | +|428F4 |42 |Syntax Error or Access Rule Violation |8F4 |The SENSITIVITY specified on FETCH is not allowed for the cursor.|DB2 |N |DB2 | +|428F5 |42 |Syntax Error or Access Rule Violation |8F5 |The invocation of a routine is ambiguous. |DB2 |N |DB2 | +|428F9 |42 |Syntax Error or Access Rule Violation |8F9 |A sequence expression cannot be specified in this context. |DB2 |N |DB2 | +|428FA |42 |Syntax Error or Access Rule Violation |8FA |The scale of the decimal number must be zero. |DB2 |N |DB2 | +|428FB |42 |Syntax Error or Access Rule Violation |8FB |Sequence-name must not be a sequence generated by the system.|DB2 |N |DB2 | +|428FC |42 |Syntax Error or Access Rule Violation |8FC |The length of the encryption password is not valid. |DB2 |N |DB2 | +|428FE |42 |Syntax Error or Access Rule Violation |8FE |The data is not a result of the ENCRYPT function. 
|DB2 |N |DB2 | +|428FJ |42 |Syntax Error or Access Rule Violation |8FJ |ORDER BY or FETCH FIRST is not allowed in the outer fullselect of a view or materialized query table.|DB2 |N |DB2 | +|428FL |42 |Syntax Error or Access Rule Violation |8FL |A data change statement is not allowed in the context in which it was specified.|DB2 |N |DB2 | +|428FM |42 |Syntax Error or Access Rule Violation |8FM |An SQL data change statement within a SELECT specified a view which is not a symmetric view.|DB2 |N |DB2 | +|428FP |42 |Syntax Error or Access Rule Violation |8FP |Only one INSTEAD OF trigger is allowed for each kind of operation on a view.|DB2 |N |DB2 | +|428FQ |42 |Syntax Error or Access Rule Violation |8FQ |An INSTEAD OF trigger cannot be created because of how the view is defined.|DB2 |N |DB2 | +|428FR |42 |Syntax Error or Access Rule Violation |8FR |A column cannot be altered as specified. |DB2 |N |DB2 | +|428FS |42 |Syntax Error or Access Rule Violation |8FS |A column cannot be added to an index. |DB2 |N |DB2 | +|428FT |42 |Syntax Error or Access Rule Violation |8FT |The partitioning clause specified on CREATE or ALTER is not valid.|DB2 |N |DB2 | +|428FY |42 |Syntax Error or Access Rule Violation |8FY |A column cannot be added, dropped, or altered in a materialized query table.|DB2 |N |DB2 | +|428G3 |42 |Syntax Error or Access Rule Violation |8G3 |FINAL TABLE is not valid when the target view of the SQL data change statement in a fullselect has an INSTEAD OF trigger defined.|DB2 |N |DB2 | +|428G4 |42 |Syntax Error or Access Rule Violation |8G4 |Invalid use of INPUT SEQUENCE ordering. |DB2 |N |DB2 | +|428G5 |42 |Syntax Error or Access Rule Violation |8G5 |The assignment clause of the UPDATE statement must specify at least one column that is not an INCLUDE column.|DB2 |N |DB2 | +|428G8 |42 |Syntax Error or Access Rule Violation |8G8 |The view cannot be enabled for query optimization. |DB2 |N |DB2 | +|428GB |42 |Syntax Error or Access Rule Violation |8GB |A character could not be converted and substitution characters are not allowed.|DB2 |N |DB2 | +|428GC |42 |Syntax Error or Access Rule Violation |8GC |An invalid string unit was specified for a function. 
|DB2 |N |DB2 | +|428GH |42 |Syntax Error or Access Rule Violation |8GH |The data type of one or more parameters specified in the ADD VERSION clause does not match the corresponding data type in the routine being altered.|DB2 |N |DB2 | +|428GI |42 |Syntax Error or Access Rule Violation |8GI |An XML schema is not complete because an XML schema document is missing.|DB2 |N |DB2 | +|428GJ |42 |Syntax Error or Access Rule Violation |8GJ |The table cannot be truncated because DELETE triggers exist for the table or the table is a parent table of a referential constraint that would be affected by the statement.|DB2 |N |DB2 | +|428GK |42 |Syntax Error or Access Rule Violation |8GK |An ALTER TRUSTED CONTEXT attempted to remove one or more of the minimum required attributes.|DB2 |N |DB2 | +|428GL |42 |Syntax Error or Access Rule Violation |8GL |The system authorization ID specified for a trusted context is already specified in another trusted context.|DB2 |N |DB2 | +|428GM |42 |Syntax Error or Access Rule Violation |8GM |The trusted context is already defined to be used by this authorization ID or PUBLIC.|DB2 |N |DB2 | +|428GN |42 |Syntax Error or Access Rule Violation |8GN |The specified authorization ID or PUBLIC is not defined in the specified trusted context.|DB2 |N |DB2 | +|428GU |42 |Syntax Error or Access Rule Violation |8GU |A table must include at least one column that is not implicitly hidden.|DB2 |N |DB2 | +|428H2 |42 |Syntax Error or Access Rule Violation |8H2 |Data type is not supported in the context where it is being used.|DB2 |N |DB2 | +|428H8 |42 |Syntax Error or Access Rule Violation |8H8 |The object must be defined as secure because another object depends on it for row-level or column-level access control.|DB2 |N |DB2 | +|428H9 |42 |Syntax Error or Access Rule Violation |8H9 |PERMISSION or MASK cannot be altered. |DB2 |N |DB2 | +|428HA |42 |Syntax Error or Access Rule Violation |8HA |An argument of a user-defined function must not reference a column for which a column mask is defined.|DB2 |N |DB2 | +|428HB |42 |Syntax Error or Access Rule Violation |8HB |A permission or mask cannot be created on the specified object.|DB2 |N |DB2 | +|428HC |42 |Syntax Error or Access Rule Violation |8HC |A column mask is already defined for the specified column. |DB2 |N |DB2 | +|428HD |42 |Syntax Error or Access Rule Violation |8HD |The statement cannot be processed because a column mask cannot be applied or the definition of the mask conflicts with the statement.|DB2 |N |DB2 | +|428HJ |42 |Syntax Error or Access Rule Violation |8HJ |The organization clause specified on CREATE or ALTER is not valid.|DB2 |N |DB2 | +|428HK |42 |Syntax Error or Access Rule Violation |8HK |The specified hash space is not valid for the implicitly created table space.|DB2 |N |DB2 | +|428HL |42 |Syntax Error or Access Rule Violation |8HL |Another version of the routine exists and is defined with an incompatible option.|DB2 |N |DB2 | +|428HM |42 |Syntax Error or Access Rule Violation |8HM |The table cannot be used as a system-period temporal table or an archive-enabled table.|DB2 |N |DB2 | +|428HN |42 |Syntax Error or Access Rule Violation |8HN |The period specification is not valid. 
|DB2 |N |DB2 | +|428HW |42 |Syntax Error or Access Rule Violation |8HW |The period specification or period clause in an index or constraint is not valid.|DB2 |N |DB2 | +|428HX |42 |Syntax Error or Access Rule Violation |8HX |The table is not valid for a history table or archive table.|DB2 |N |DB2 | +|428HY |42 |Syntax Error or Access Rule Violation |8HY |The period specification or period condition is not valid. |DB2 |N |DB2 | +|428HZ |42 |Syntax Error or Access Rule Violation |8HZ |The temporal attribute of the table was not valid for the specified ALTER operation.|DB2 |N |DB2 | +|428I1 |42 |Syntax Error or Access Rule Violation |8I1 |The columns updated by the XMLMODIFY function were not specified in the UPDATE SET clause.|DB2 |N |DB2 | +|428I4 |42 |Syntax Error or Access Rule Violation |8I4 |The combination of UNNEST arguments are not valid. |DB2 |N |DB2 | +|428I5 |42 |Syntax Error or Access Rule Violation |8I5 |The attributes of an object at one location do not match the attributes of the same object at another location.|DB2 |N |DB2 | +|428I6 |42 |Syntax Error or Access Rule Violation |8I6 |The archive enabled table is not allowed in this context. |DB2 |N |DB2 | +|428IC |42 |Syntax Error or Access Rule Violation |8IC |An Invalid combination of replication overrides is in use for a data change operation.|DB2 |N |DB2 | +|428ID |42 |Syntax Error or Access Rule Violation |8ID |Model columns specified in an SQL Data Insights function could not be determined or are not usable.|DB2 |N |DB2 | +|42902 |42 |Syntax Error or Access Rule Violation |902 |The object of the INSERT, UPDATE, or DELETE is also identified (possibly implicitly through a view) in a FROM clause.|DB2 |N |DB2 | +|42903 |42 |Syntax Error or Access Rule Violation |903 |Invalid use of an aggregate function or OLAP function. |DB2 |N |DB2 | +|42905 |42 |Syntax Error or Access Rule Violation |905 |DISTINCT is specified more than once in a subselect. |DB2 |N |DB2 | +|42906 |42 |Syntax Error or Access Rule Violation |906 |An aggregate function in a subquery of a HAVING clause includes an expression that applies an operator to a correlated reference.|DB2 |N |DB2 | +|42907 |42 |Syntax Error or Access Rule Violation |907 |The string is too long in the context it was specified. |DB2 |N |DB2 | +|42908 |42 |Syntax Error or Access Rule Violation |908 |The statement does not include a required column list. |DB2 |N |DB2 | +|42909 |42 |Syntax Error or Access Rule Violation |909 |CREATE VIEW includes an operator or operand that is not valid for views.|DB2 |N |DB2 | +|42911 |42 |Syntax Error or Access Rule Violation |911 |A decimal divide operation is invalid, because the result would have a negative scale.|DB2 |N |DB2 | +|42912 |42 |Syntax Error or Access Rule Violation |912 |A column cannot be updated, because it is not identified in the UPDATE clause of the select-statement of the cursor.|DB2 |N |DB2 | +|42914 |42 |Syntax Error or Access Rule Violation |914 |The DELETE is invalid, because a table referenced in a subquery can be affected by the operation.|DB2 |N |DB2 | +|42915 |42 |Syntax Error or Access Rule Violation |915 |An invalid referential constraint has been detected. 
|DB2 |N |DB2 | +|42917 |42 |Syntax Error or Access Rule Violation |917 |The object cannot be explicitly dropped, altered, or replaced.|DB2 |N |DB2 | +|42918 |42 |Syntax Error or Access Rule Violation |918 |A user-defined data type cannot be created with a system-defined data type name (for example, INTEGER).|DB2 |N |DB2 | +|42924 |42 |Syntax Error or Access Rule Violation |924 |An alias resolved to another alias rather than a table or view at the remote location.|DB2 |N |DB2 | +|42925 |42 |Syntax Error or Access Rule Violation |925 |Recursive named derived tables cannot specify SELECT DISTINCT and must specify UNION ALL.|DB2 |N |DB2 | +|42927 |42 |Syntax Error or Access Rule Violation |927 |The function cannot be altered to NOT DETERMINISTIC or EXTERNAL ACTION because it is referenced by one or more existing views.|DB2 |N |DB2 | +|42932 |42 |Syntax Error or Access Rule Violation |932 |The program preparation assumptions are incorrect. |DB2 |N |DB2 | +|42939 |42 |Syntax Error or Access Rule Violation |939 |The name cannot be used, because the specified identifier is reserved for system use.|DB2 |N |PostgreSQL DB2 Redshift | +|42945 |42 |Syntax Error or Access Rule Violation |945 |ALTER CCSID is not allowed on a table space or database that contains a view.|DB2 |N |DB2 | +|42961 |42 |Syntax Error or Access Rule Violation |961 |The server name specified does not match the current server.|DB2 |N |DB2 | +|42962 |42 |Syntax Error or Access Rule Violation |962 |The column type is not allowed in an index, a key, generated column, or a constraint.|DB2 |N |DB2 | +|42963 |42 |Syntax Error or Access Rule Violation |963 |Invalid specification of a security label column. |DB2 |N |DB2 | +|42969 |42 |Syntax Error or Access Rule Violation |969 |The package was not created. |DB2 |N |DB2 | +|42972 |42 |Syntax Error or Access Rule Violation |972 |An expression in a join-condition or ON clause of a MERGE statement references columns in more than one of the operand tables.|DB2 |N |DB2 | +|42986 |42 |Syntax Error or Access Rule Violation |986 |The source table of a rename operation is referenced in a context where it is not supported.|DB2 |N |DB2 | +|42987 |42 |Syntax Error or Access Rule Violation |987 |The statement or routine is not allowed in a trigger. |DB2 |N |DB2 | +|42988 |42 |Syntax Error or Access Rule Violation |988 |The operation is not allowed with mixed ASCII data. |DB2 |N |DB2 | +|42993 |42 |Syntax Error or Access Rule Violation |993 |The column, as defined, is too large to be logged. |DB2 |N |DB2 | +|42995 |42 |Syntax Error or Access Rule Violation |995 |The requested function does not apply to global temporary tables.|DB2 |N |DB2 | +|42996 |42 |Syntax Error or Access Rule Violation |996 |A specified column may not be used in a partition key. 
|DB2 |N |DB2 | +|42997 |42 |Syntax Error or Access Rule Violation |997 |Capability is not supported by this version of the Db2 application requester, Db2 application server, or the combination of the two.|DB2 |N |DB2 | +|429B1 |42 |Syntax Error or Access Rule Violation |9B1 |A procedure specifying COMMIT ON RETURN cannot be the target of a nested CALL statement.|DB2 |N |DB2 | +|429BB |42 |Syntax Error or Access Rule Violation |9BB |The data type of a column, parameter, or SQL variable is not supported.|DB2 |N |DB2 | +|429BD |42 |Syntax Error or Access Rule Violation |9BD |RETURN must be the last SQL statement of the atomic compound statement within an SQL row or table function.|DB2 |N |DB2 | +|429BI |42 |Syntax Error or Access Rule Violation |9BI |The condition area is full and cannot handle more errors for a NOT ATOMIC statement.|DB2 |N |DB2 | +|429BN |42 |Syntax Error or Access Rule Violation |9BN |A CREATE statement cannot be processed when the value of CURRENT SCHEMA differs from CURRENT SQLID.|DB2 |N |DB2 | +|429BQ |42 |Syntax Error or Access Rule Violation |9BQ |The specified alter of the data type or attribute is not allowed.|DB2 |N |DB2 | +|429BS |42 |Syntax Error or Access Rule Violation |9BS |Invalid index definition involving an XMLPATTERN clause or a column defined with a data type of XML.|DB2 |N |DB2 | +|429BV |42 |Syntax Error or Access Rule Violation |9BV |Invalid specification of a ROW CHANGE TIMESTAMP column. |DB2 |N |DB2 | +|429BW |42 |Syntax Error or Access Rule Violation |9BW |The statement cannot be processed due to related implicitly created objects.|DB2 |N |DB2 | +|429BX |42 |Syntax Error or Access Rule Violation |9BX |The expression for an index key is not valid. |DB2 |N |DB2 | +|429BY |42 |Syntax Error or Access Rule Violation |9BY |The statement is not allowed when using a trusted connection.|DB2 |N |DB2 | +|429C1 |42 |Syntax Error or Access Rule Violation |9C1 |A data type cannot be determined for an untyped expression. |DB2 |N |DB2 | +|429CB |42 |Syntax Error or Access Rule Violation |9CB |The attributes of the table or column are not supported for the table type.|DB2 |N |DB2 | +|42K01 |42 |Syntax Error or Access Rule Violation |K01 |data type not fully specified |Spark |N |Spark | +|42K02 |42 |Syntax Error or Access Rule Violation |K02 |data source not found |Spark |N |Spark | +|42K03 |42 |Syntax Error or Access Rule Violation |K03 |File not found |Spark |N |Spark | +|42K04 |42 |Syntax Error or Access Rule Violation |K04 |Duplicate file |Spark |N |Spark | +|42K05 |42 |Syntax Error or Access Rule Violation |K05 |Name is not valid |Spark |N |Spark | +|42K06 |42 |Syntax Error or Access Rule Violation |K06 |Invalid type for options |Spark |N |Spark | +|42K07 |42 |Syntax Error or Access Rule Violation |K07 |Not a valid schema literal |Spark |N |Spark | +|42K08 |42 |Syntax Error or Access Rule Violation |K08 |Not a constant |Spark |N |Spark | +|42K09 |42 |Syntax Error or Access Rule Violation |K09 |Data type mismatch |Spark |N |Spark | +|42K0A |42 |Syntax error or Access Rule violation |K0A |Invalid UNPIVOT clause |Spark |N |Spark | +|42K0B |42 |Syntax error or Access Rule violation |K0B |Legacy feature blocked |Spark |N |Spark | +|42KD0 |42 |Syntax error or Access Rule violation |KD0 |Ambiguous name reference. |Databricks |N |Databricks | +|42KD1 |42 |Syntax error or Access Rule violation |KD1 |Operation not supported in READ ONLY session mode. 
|Databricks |N |Databricks | +|42KD2 |42 |Syntax error or Access Rule violation |KD2 |The source and target table names of a SYNC operation must be the same.|Databricks |N |Databricks | +|42KD3 |42 |Syntax error or Access Rule violation |KD3 |A column cannot be added as specified. |Databricks |N |Databricks | +|42KD4 |42 |Syntax error or Access Rule violation |KD4 |Operation not supported because table schema has changed. |Databricks |N |Databricks | +|42KD5 |42 |Syntax error or Access Rule violation |KD5 |Cannot create file or path. |Databricks |N |Databricks | +|42KD6 |42 |Syntax error or Access Rule violation |KD6 |No partition information found. |Databricks |N |Databricks | +|42KD7 |42 |Syntax error or Access Rule violation |KD7 |Table signature mismatch. |Databricks |N |Databricks | +|42KD8 |42 |Syntax error or Access Rule violation |KD8 |Column position out of range. |Databricks |N |Databricks | +|42KD9 |42 |Syntax error or Access Rule violation |KD9 |Cannot infer table schema. |Databricks |N |Databricks | +|42KDA |42 |Syntax error or Access Rule violation |KDA |Failed to merge file into table schema. |Databricks |N |Databricks | +|42P01 |42 |Syntax error or Access Rule violation |P01 |undefined_table |PostgreSQL |N |PostgreSQL Redshift | +|42P02 |42 |Syntax Error or Access Rule Violation |P02 |undefined_parameter |PostgreSQL |N |PostgreSQL Redshift | +|42P03 |42 |Syntax Error or Access Rule Violation |P03 |duplicate_cursor |PostgreSQL |N |PostgreSQL Redshift | +|42P04 |42 |Syntax Error or Access Rule Violation |P04 |duplicate_database |PostgreSQL |N |PostgreSQL Redshift | +|42P05 |42 |Syntax Error or Access Rule Violation |P05 |duplicate_prepared_statement |PostgreSQL |N |PostgreSQL Redshift | +|42P06 |42 |Syntax Error or Access Rule Violation |P06 |duplicate_schema |PostgreSQL |N |PostgreSQL Redshift | +|42P07 |42 |Syntax Error or Access Rule Violation |P07 |duplicate_table |PostgreSQL |N |PostgreSQL Redshift | +|42P08 |42 |Syntax Error or Access Rule Violation |P08 |ambiguous_parameter |PostgreSQL |N |PostgreSQL Redshift | +|42P09 |42 |Syntax Error or Access Rule Violation |P09 |ambiguous_alias |PostgreSQL |N |PostgreSQL Redshift | +|42P10 |42 |Syntax Error or Access Rule Violation |P10 |invalid_column_reference |PostgreSQL |N |PostgreSQL Redshift | +|42P11 |42 |Syntax Error or Access Rule Violation |P11 |invalid_cursor_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P12 |42 |Syntax Error or Access Rule Violation |P12 |invalid_database_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P13 |42 |Syntax Error or Access Rule Violation |P13 |invalid_function_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P14 |42 |Syntax Error or Access Rule Violation |P14 |invalid_prepared_statement_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P15 |42 |Syntax Error or Access Rule Violation |P15 |invalid_schema_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P16 |42 |Syntax Error or Access Rule Violation |P16 |invalid_table_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P17 |42 |Syntax Error or Access Rule Violation |P17 |invalid_object_definition |PostgreSQL |N |PostgreSQL Redshift | +|42P18 |42 |Syntax Error or Access Rule Violation |P18 |indeterminate_datatype |PostgreSQL |N |PostgreSQL Redshift | +|42P19 |42 |Syntax Error or Access Rule Violation |P19 |invalid_recursion |PostgreSQL |N |PostgreSQL | +|42P20 |42 |Syntax Error or Access Rule Violation |P20 |windowing_error |PostgreSQL |N |PostgreSQL | +|42P21 |42 |Syntax Error or Access Rule Violation |P21 
|collation_mismatch |PostgreSQL |N |PostgreSQL | +|42P22 |42 |Syntax Error or Access Rule Violation |P22 |indeterminate_collation |PostgreSQL |N |PostgreSQL | +|42S01 |42 |Syntax error or Access rule violation |S01 |Base table or view already exists |SQL Server |N |SQL Server | +|42S02 |42 |Syntax error or Access rule violation |S02 |Base table or view not found |SQL Server |N |SQL Server | +|42S11 |42 |Syntax error or Access rule violation |S11 |Index already exists |SQL Server |N |SQL Server | +|42S12 |42 |Syntax error or Access rule violation |S12 |Index not found |SQL Server |N |SQL Server | +|42S21 |42 |Syntax error or Access rule violation |S21 |Column already exists |SQL Server |N |SQL Server | +|42S22 |42 |Syntax error or Access rule violation |S22 |Column not found |SQL Server |N |SQL Server | +|44000 |44 |with check option violation |000 |(no subclass) |SQL/Foundation |Y |SQL/Foundation PostgreSQL DB2 Redshift Oracle SQL Server | +|45000 |45 |unhandled user-defined exception |000 |(no subclass) |SQL/PSM |Y |SQL/PSM | +|46000 |46 |Java DDL 1 |000 |(no subclass) |SQL/JRT |Y |SQL/JRT SQL/JRT SQL/OLB | +|46001 |46 |Java DDL 1 |001 |invalid URL |SQL/JRT |Y |SQL/JRT DB2 | +|46002 |46 |Java DDL 1 |002 |invalid JAR name |SQL/JRT |Y |SQL/JRT DB2 | +|46003 |46 |Java DDL 1 |003 |invalid class deletion |SQL/JRT |Y |SQL/JRT DB2 | +|46005 |46 |Java DDL 1 |005 |invalid replacement |SQL/JRT |Y |SQL/JRT | +|46007 |46 |Java™ Errors |007 |A Java function has a Java method with an invalid signature.|DB2 |N |DB2 | +|46008 |46 |Java™ Errors |008 |A Java function could not map to a single Java method. |DB2 |N |DB2 | +|4600A |46 |Java DDL 1 |00A |attempt to replace uninstalled JAR |SQL/JRT |Y |SQL/JRT | +|4600B |46 |Java DDL 1 |00B |attempt to remove uninstalled JAR |SQL/JRT |Y |SQL/JRT | +|4600C |46 |Java DDL 1 |00C |invalid JAR removal |SQL/JRT |Y |SQL/JRT DB2 | +|4600D |46 |Java DDL 1 |00D |invalid path |SQL/JRT |Y |SQL/JRT DB2 | +|4600E |46 |Java DDL 1 |00E |self-referencing path |SQL/JRT |Y |SQL/JRT DB2 | +|46102 |46 |Java execution 1 |102 |invalid JAR name in path |SQL/JRT |Y |SQL/JRT | +|46103 |46 |Java execution 1 |103 |unresolved class name |SQL/JRT |Y |SQL/JRT DB2 | +|46110 |46 |OLB-specific error |110 |unsupported feature |SQL/OLB |Y |SQL/OLB | +|46120 |46 |OLB-specific error |120 |invalid class declaration |SQL/OLB |Y |SQL/OLB | +|46121 |46 |OLB-specific error |121 |invalid column name |SQL/OLB |Y |SQL/OLB | +|46122 |46 |OLB-specific error |122 |invalid number of columns |SQL/OLB |Y |SQL/OLB | +|46130 |46 |OLB-specific error |130 |invalid profile state |SQL/OLB |Y |SQL/OLB | +|46501 |46 |Java™ Errors |501 |The install or remove jar procedure specified the use of a deployment descriptor.|DB2 |N |DB2 | +|46502 |46 |Java™ Errors |502 |A user-defined procedure has returned a DYNAMIC RESULT SET of an invalid class. The parameter is not a Db2 result set.|DB2 |N |DB2 | +|51002 |51 |Invalid Application State |002 |The package corresponding to an SQL statement execution request was not found.|DB2 |N |DB2 | +|51003 |51 |Invalid Application State |003 |Consistency tokens do not match. |DB2 |N |DB2 | +|51004 |51 |Invalid Application State |004 |An address in the SQLDA is invalid. |DB2 |N |DB2 | +|51005 |51 |Invalid Application State |005 |The previous system error has disabled this function. |DB2 |N |DB2 | +|51006 |51 |Invalid Application State |006 |A valid connection has not been established. 
|DB2 |N |DB2 | +|51008 |51 |Invalid Application State |008 |The release number of the program or package is not valid. |DB2 |N |DB2 | +|51015 |51 |Invalid Application State |015 |An attempt was made to execute a section that was found to be in error at bind time.|DB2 |N |DB2 | +|51021 |51 |Invalid Application State |021 |SQL statements cannot be executed until the application process executes a rollback operation.|DB2 |N |DB2 | +|51024 |51 |Invalid Application State |024 |An object cannot be used, because it has been marked inoperative.|DB2 |N |DB2 | +|51030 |51 |Invalid Application State |030 |The procedure referenced in a DESCRIBE PROCEDURE or ASSOCIATE LOCATOR statement has not yet been called within the application process.|DB2 |N |DB2 | +|51032 |51 |Invalid Application State |032 |A valid CCSID has not yet been specified for this Db2 for z/OS® subsystem.|DB2 |N |DB2 | +|51033 |51 |Invalid Application State |033 |The operation is not allowed because it operates on a result set that was not created by the current server.|DB2 |N |DB2 | +|51034 |51 |Invalid Application State |034 |The routine defined with MODIFIES SQL DATA is not valid in the context in which it is invoked.|DB2 |N |DB2 | +|51035 |51 |Invalid Application State |035 |A PREVIOUS VALUE expression cannot be used because a value has not been generated for the sequence yet in this session.|DB2 |N |DB2 | +|51036 |51 |Invalid Application State |036 |An implicit connect to a remote server is not allowed because a savepoint is outstanding.|DB2 |N |DB2 | +|51039 |51 |Invalid Application State |039 |The ENCRYPTION PASSWORD value is not set. |DB2 |N |DB2 | +|51043 |51 |Invalid Application State |043 |Procedure cannot be called because the nested environment already called an autonomous procedure.|DB2 |N |DB2 | +|53000 |53 |Insufficient Resources |000 |insufficient_resources |PostgreSQL |N |PostgreSQL Redshift | +|53001 |53 |Invalid Operand or Inconsistent Specification |001 |A clause is invalid, because the table space is a workfile. |DB2 |N |DB2 | +|53004 |53 |Invalid Operand or Inconsistent Specification |004 |DSNDB07 is the implicit workfile database. |DB2 |N |DB2 | +|53014 |53 |Invalid Operand or Inconsistent Specification |014 |The specified OBID is invalid. |DB2 |N |DB2 | +|53022 |53 |Invalid Operand or Inconsistent Specification |022 |Variable or parameter is not allowed. |DB2 |N |DB2 | +|53035 |53 |Invalid Operand or Inconsistent Specification |035 |Key limits must be specified in the CREATE or ALTER INDEX statement.|DB2 |N |DB2 | +|53036 |53 |Invalid Operand or Inconsistent Specification |036 |The number of PARTITION specifications is not the same as the number of partitions.|DB2 |N |DB2 | +|53037 |53 |Invalid Operand or Inconsistent Specification |037 |A partitioned index cannot be created on a table. |DB2 |N |DB2 | +|53038 |53 |Invalid Operand or Inconsistent Specification |038 |The number of key limit values is zero or greater than the number of columns in the key.|DB2 |N |DB2 | +|53039 |53 |Invalid Operand or Inconsistent Specification |039 |The PARTITION clause of the ALTER statement is omitted or invalid.|DB2 |N |DB2 | +|53040 |53 |Invalid Operand or Inconsistent Specification |040 |The buffer pool cannot be changed as specified. |DB2 |N |DB2 | +|53041 |53 |Invalid Operand or Inconsistent Specification |041 |The page size of the buffer pool is invalid. |DB2 |N |DB2 | +|53043 |53 |Invalid Operand or Inconsistent Specification |043 |Columns with different field procedures cannot be compared. 
|DB2 |N |DB2 | +|53044 |53 |Invalid Operand or Inconsistent Specification |044 |The columns have a field procedure, but the field types are not compatible.|DB2 |N |DB2 | +|53045 |53 |Invalid Operand or Inconsistent Specification |045 |The data type of the key limit constant is not the same as the data type of the column.|DB2 |N |DB2 | +|53088 |53 |Invalid Operand or Inconsistent Specification |088 |LOCKMAX is inconsistent with the specified LOCKSIZE. |DB2 |N |DB2 | +|53089 |53 |Invalid Operand or Inconsistent Specification |089 |The number of variable parameters for a stored procedure is not equal to the number of expected variable parameters.|DB2 |N |DB2 | +|53090 |53 |Invalid Operand or Inconsistent Specification |090 |Only data from one encoding scheme, either ASCII, EBCDIC or Unicode, can be referenced in the same SQL statement.|DB2 |N |DB2 | +|53091 |53 |Invalid Operand or Inconsistent Specification |091 |The encoding scheme specified is not the same as the encoding scheme currently in use for the containing table space.|DB2 |N |DB2 | +|53092 |53 |Invalid Operand or Inconsistent Specification |092 |Type 1 index cannot be created for a table using the ASCII encoding scheme.|DB2 |N |DB2 | +|53093 |53 |Invalid Operand or Inconsistent Specification |093 |The CCSID ASCII or UNICODE clause is not supported for this database or table space.|DB2 |N |DB2 | +|53094 |53 |Invalid Operand or Inconsistent Specification |094 |The PLAN_TABLE cannot be created with the FOR ASCII clause. |DB2 |N |DB2 | +|53095 |53 |Invalid Operand or Inconsistent Specification |095 |CREATE or ALTER statement cannot define an object with the specified encoding scheme.|DB2 |N |DB2 | +|53096 |53 |Invalid Operand or Inconsistent Specification |096 |The PARTITION clause was specified on CREATE AUXILIARY TABLE, but the base table is not partitioned.|DB2 |N |DB2 | +|53098 |53 |Invalid Operand or Inconsistent Specification |098 |The auxiliary table cannot be created because a column was specified that is not a LOB column.|DB2 |N |DB2 | +|53099 |53 |Invalid Operand or Inconsistent Specification |099 |A WLM ENVIRONMENT name must be specified on the CREATE FUNCTION statement.|DB2 |N |DB2 | +|530A1 |53 |Invalid Operand or Inconsistent Specification |0A1 |An ALTER TABLE statement specified FLOAT as the new data type for a column, but there is an existing index or constraint that restricts the use of FLOAT.|DB2 |N |DB2 | +|530A2 |53 |Invalid Operand or Inconsistent Specification |0A2 |The PARTITIONING clause is not allowed on the specified index.|DB2 |N |DB2 | +|530A3 |53 |Invalid Operand or Inconsistent Specification |0A3 |The specified option is not allowed for the internal representation of the routine specified|DB2 |N |DB2 | +|530A4 |53 |Invalid Operand or Inconsistent Specification |0A4 |The options specified on ALTER statement are not the same as those specified when the object was created.|DB2 |N |DB2 | +|530A5 |53 |Invalid Operand or Inconsistent Specification |0A5 |The REGENERATE option is only valid for an index with key expressions.|DB2 |N |DB2 | +|530A7 |53 |Invalid Operand or Inconsistent Specification |0A7 |EXCHANGE DATA is not allowed because the tables do not have a defined clone relationship.|DB2 |N |DB2 | +|530A8 |53 |Invalid Operand or Inconsistent Specification |0A8 |A system parameter is incompatible with the specified SQL statement.|DB2 |N |DB2 | +|530A9 |53 |Invalid Operand or Inconsistent Specification |0A9 |A temporal table is not allowed in this context. 
|DB2 |N |DB2 | +|53100 |53 |Insufficient Resources |100 |disk_full |PostgreSQL |N |PostgreSQL Redshift | +|53200 |53 |Insufficient Resources |200 |out_of_memory |PostgreSQL |N |PostgreSQL Redshift | +|53300 |53 |Insufficient Resources |300 |too_many_connections |PostgreSQL |N |PostgreSQL Redshift | +|53400 |53 |Insufficient Resources |400 |configuration_limit_exceeded |PostgreSQL |N |PostgreSQL | +|54000 |54 |Program Limit Exceeded |000 |program_limit_exceeded |PostgreSQL |N |PostgreSQL Redshift | +|54001 |54 |SQL or Product Limit Exceeded |001 |The statement is too long or too complex. |DB2 |N |PostgreSQL DB2 Redshift | +|54002 |54 |SQL or Product Limit Exceeded |002 |A string constant is too long. |DB2 |N |DB2 | +|54004 |54 |SQL or Product Limit Exceeded |004 |The statement has too many table names or too many items in a SELECT or INSERT list.|DB2 |N |DB2 | +|54005 |54 |SQL or Product Limit Exceeded |005 |The sort key is too long, or has too many columns. |DB2 |N |DB2 | +|54006 |54 |SQL or Product Limit Exceeded |006 |The result string is too long. |DB2 |N |DB2 | +|54008 |54 |SQL or Product Limit Exceeded |008 |The key is too long, a column of the key is too long, or the key has too many columns.|DB2 |N |DB2 | +|54010 |54 |SQL or Product Limit Exceeded |010 |The record length of the table is too long. |DB2 |N |DB2 | +|54011 |54 |SQL or Product Limit Exceeded |011 |Too many columns were specified for a table, view, or table function.|DB2 |N |PostgreSQL DB2 Redshift | +|54012 |54 |SQL or Product Limit Exceeded |012 |The literal is too long. |DB2 |N |DB2 | +|54023 |54 |Program Limit Exceeded |023 |too_many_arguments |PostgreSQL |N |PostgreSQL Redshift | +|54024 |54 |SQL or Product Limit Exceeded |024 |The check constraint, generated column, or key expression is too long.|DB2 |N |DB2 | +|54025 |54 |SQL or Product Limit Exceeded |025 |The table description exceeds the maximum size of the object descriptor.|DB2 |N |DB2 | +|54027 |54 |SQL or Product Limit Exceeded |027 |The catalog has the maximum number of user-defined indexes. |DB2 |N |DB2 | +|54035 |54 |SQL or Product Limit Exceeded |035 |An internal object limit exceeded. |DB2 |N |DB2 | +|54038 |54 |SQL or Product Limit Exceeded |038 |Maximum depth of nested routines or triggers was exceeded. |DB2 |N |DB2 | +|54041 |54 |SQL or Product Limit Exceeded |041 |The maximum number of internal identifiers has been reached.|DB2 |N |DB2 | +|54042 |54 |SQL or Product Limit Exceeded |042 |Only one index is allowed on an auxiliary table. |DB2 |N |DB2 | +|54051 |54 |SQL or Product Limit Exceeded |051 |Value specified on FETCH ABSOLUTE or RELATIVE is invalid. |DB2 |N |DB2 | +|54054 |54 |SQL or Product Limit Exceeded |054 |The number of data partitions is exceeded. |DB2 |N |DB2 | +|54055 |54 |SQL or Product Limit Exceeded |055 |The maximum number of versions has been reached for a table or index.|DB2 |N |DB2 | +|54058 |54 |SQL or Product Limit Exceeded |058 |The internal representation of an XML path is too long. |DB2 |N |DB2 | +|54065 |54 |SQL or Product Limit Exceeded |065 |The maximum of 99999 implicitly generated object names has been exceeded.|DB2 |N |DB2 | +|54068 |54 |SQL or Product Limit Exceeded |068 |Seamless automatic client reroute retry limit exceeded. |DB2 |N |DB2 | +|55000 |55 |Object Not In Prerequisite State |000 |object_not_in_prerequisite_state |PostgreSQL |N |PostgreSQL Redshift | +|55002 |55 |Object Not in Prerequisite State |002 |The explanation table is not defined properly. 
|DB2 |N |DB2 | +|55003 |55 |Object Not in Prerequisite State |003 |The DDL registration table is not defined properly. |DB2 |N |DB2 | +|55004 |55 |Object Not in Prerequisite State |004 |The database cannot be accessed, because it is no longer a shared database.|DB2 |N |DB2 | +|55006 |55 |Object Not in Prerequisite State |006 |The object cannot be dropped, because it is currently in use by the same application process.|DB2 |N |PostgreSQL DB2 Redshift | +|55007 |55 |Object Not in Prerequisite State |007 |The object cannot be altered, because it is currently in use by the same application process.|DB2 |N |DB2 | +|55011 |55 |Object Not in Prerequisite State |011 |The operation is disallowed, because the workfile database is not in the stopped state.|DB2 |N |DB2 | +|55012 |55 |Object Not in Prerequisite State |012 |A clustering index is not valid on the table. |DB2 |N |DB2 | +|55014 |55 |Object Not in Prerequisite State |014 |The table does not have an index to enforce the uniqueness of the primary key.|DB2 |N |DB2 | +|55015 |55 |Object Not in Prerequisite State |015 |The ALTER statement cannot be executed, because the pageset is not in the stopped state.|DB2 |N |DB2 | +|55016 |55 |Object Not in Prerequisite State |016 |The ALTER statement is invalid, because the pageset has user-managed data sets.|DB2 |N |DB2 | +|55017 |55 |Object Not in Prerequisite State |017 |The table cannot be created in the table space, because it already contains a table.|DB2 |N |DB2 | +|55019 |55 |Object Not in Prerequisite State |019 |The object is in an invalid state for the operation. |DB2 |N |DB2 | +|55020 |55 |Object Not in Prerequisite State |020 |A work file database is already defined for the member. |DB2 |N |DB2 | +|55023 |55 |Object Not in Prerequisite State |023 |An error occurred calling a procedure. |DB2 |N |DB2 | +|55030 |55 |Object Not in Prerequisite State |030 |A package specified in a remote BIND REPLACE operation must not have a system list.|DB2 |N |DB2 | +|55035 |55 |Object Not in Prerequisite State |035 |The table cannot be dropped, because it is protected. |DB2 |N |DB2 | +|55048 |55 |Object Not in Prerequisite State |048 |Encrypted data cannot be encrypted. |DB2 |N |DB2 | +|55058 |55 |Object Not in Prerequisite State |058 |The DEBUG MODE cannot be changed for a routine that was created with DISABLE DEBUG MODE.|DB2 |N |DB2 | +|55059 |55 |Object Not in Prerequisite State |059 |The currently active version for a routine cannot be dropped.|DB2 |N |DB2 | +|55063 |55 |Object Not in Prerequisite State |063 |The XML schema is not in the correct state for the operation.|DB2 |N |DB2 | +|55078 |55 |Object Not in Prerequisite State |078 |The table is already in the specified state. |DB2 |N |DB2 | +|55079 |55 |Object Not in Prerequisite State |079 |The operation cannot be performed because the XML column is not in the versioning format.|DB2 |N |DB2 | +|55P02 |55 |Object Not In Prerequisite State |P02 |cant_change_runtime_param |PostgreSQL |N |PostgreSQL Redshift | +|55P03 |55 |Object Not In Prerequisite State |P03 |lock_not_available |PostgreSQL |N |PostgreSQL Redshift | +|55P04 |55 |Object Not In Prerequisite State |P04 |unsafe_new_enum_value_usage |PostgreSQL |N |PostgreSQL | +|56010 |56 |Miscellaneous SQL or Product Error |010 |The subtype of a string variable is not the same as the subtype at bind time, and the difference cannot be resolved by character conversion.|DB2 |N |DB2 | +|56016 |56 |Miscellaneous SQL or Product Error |016 |The ranges specified for data partitions are not valid. 
|DB2 |N |DB2 | +|56018 |56 |Miscellaneous SQL or Product Error |018 |A column cannot be added to the table, because it has an edit procedure.|DB2 |N |DB2 | +|56023 |56 |Miscellaneous SQL or Product Error |023 |An invalid reference to a remote object has been detected. |DB2 |N |DB2 | +|56025 |56 |Miscellaneous SQL or Product Error |025 |An invalid use of AT ALL LOCATIONS in GRANT or REVOKE has been detected.|DB2 |N |DB2 | +|56027 |56 |Miscellaneous SQL or Product Error |027 |A nullable column of a foreign key with a delete rule of SET NULL cannot be part of the key of a partitioned index.|DB2 |N |DB2 | +|56031 |56 |Miscellaneous SQL or Product Error |031 |The clause or scalar function is invalid, because mixed and DBCS data are not supported on this system.|DB2 |N |DB2 | +|56036 |56 |Miscellaneous SQL or Product Error |036 |Specific and non-specific volume IDs are not allowed in a storage group.|DB2 |N |DB2 | +|56038 |56 |Miscellaneous SQL or Product Error |038 |The requested feature is not supported in this environment. |DB2 |N |DB2 | +|56040 |56 |Miscellaneous SQL or Product Error |040 |CURRENT SQLID cannot be used in a statement that references remote objects.|DB2 |N |DB2 | +|56045 |56 |Miscellaneous SQL or Product Error |045 |The application must issue a rollback operation to back out the change that was made at the read-only application server.|DB2 |N |DB2 | +|56052 |56 |Miscellaneous SQL or Product Error |052 |The remote requester tried to bind, rebind, or free a trigger package.|DB2 |N |DB2 | +|56053 |56 |Miscellaneous SQL or Product Error |053 |The parent of a table in a read-only shared database must also be a table in a read-only shared database.|DB2 |N |DB2 | +|56054 |56 |Miscellaneous SQL or Product Error |054 |User-defined data sets for objects in a shared database must be defined with SHAREOPTIONS(1,3).|DB2 |N |DB2 | +|56055 |56 |Miscellaneous SQL or Product Error |055 |The database is defined as SHARE READ, but the table space or indexspace has not been defined on the owning system.|DB2 |N |DB2 | +|56056 |56 |Miscellaneous SQL or Product Error |056 |The description of an object in a SHARE READ database must be consistent with its description in the OWNER system.|DB2 |N |DB2 | +|56057 |56 |Miscellaneous SQL or Product Error |057 |A database cannot be altered from SHARE READ to SHARE OWNER.|DB2 |N |DB2 | +|56059 |56 |Miscellaneous SQL or Product Error |059 |An error occurred when binding a triggered SQL statement. |DB2 |N |DB2 | +|56060 |56 |Miscellaneous SQL or Product Error |060 |An LE function failed. 
|DB2 |N |DB2 | +|56062 |56 |Miscellaneous SQL or Product Error |062 |A distributed operation is invalid, because the unit of work was started before DDF.|DB2 |N |DB2 | +|56064 |56 |Miscellaneous SQL or Product Error |064 |The bind operation is disallowed, because the program depends on functions of a release from which fallback has occurred.|DB2 |N |DB2 | +|56065 |56 |Miscellaneous SQL or Product Error |065 |The bind operation is disallowed, because the DBRM has been modified or was created for a different release.|DB2 |N |DB2 | +|56066 |56 |Miscellaneous SQL or Product Error |066 |The rebind operation is disallowed, because the plan or package depends on functions of a release from which fallback has occurred.|DB2 |N |DB2 | +|56067 |56 |Miscellaneous SQL or Product Error |067 |The rebind operation is disallowed, because the value of SYSPACKAGE.IBMREQD is invalid.|DB2 |N |DB2 | +|56072 |56 |Miscellaneous SQL or Product Error |072 |Execution failed due to the function not supported by a downlevel server that will not affect the execution of subsequent SQL statements.|DB2 |N |DB2 | +|56073 |56 |Miscellaneous SQL or Product Error |073 |Execution failed due to the function not supported by a downlevel server that will affect the execution of subsequent SQL statements.|DB2 |N |DB2 | +|56080 |56 |Miscellaneous SQL or Product Error |080 |The data type is not allowed in Db2 private protocol processing.|DB2 |N |DB2 | +|56084 |56 |Miscellaneous SQL or Product Error |084 |An unsupported SQLTYPE was encountered in a select list or input list.|DB2 |N |DB2 | +|56088 |56 |Miscellaneous SQL or Product Error |088 |ALTER FUNCTION failed because functions cannot modify data when they are processed in parallel.|DB2 |N |DB2 | +|56089 |56 |Miscellaneous SQL or Product Error |089 |Specified option requires type 2 indexes. |DB2 |N |DB2 | +|56090 |56 |Miscellaneous SQL or Product Error |090 |The alter of an index or table is not allowed. |DB2 |N |DB2 | +|56095 |56 |Miscellaneous SQL or Product Error |095 |A bind option is invalid. |DB2 |N |DB2 | +|56096 |56 |Miscellaneous SQL or Product Error |096 |Bind options are incompatible. |DB2 |N |DB2 | +|560A1 |56 |Miscellaneous SQL or Product Error |0A1 |The table space name is not valid. |DB2 |N |DB2 | +|560A2 |56 |Miscellaneous SQL or Product Error |0A2 |A LOB table and its associated base table space must be in the same database.|DB2 |N |DB2 | +|560A3 |56 |Miscellaneous SQL or Product Error |0A3 |The table is not compatible with the database. |DB2 |N |DB2 | +|560A4 |56 |Miscellaneous SQL or Product Error |0A4 |The operation is not allowed on an auxiliary table. |DB2 |N |DB2 | +|560A5 |56 |Miscellaneous SQL or Product Error |0A5 |An auxiliary table already exists for the specified column or partition.|DB2 |N |DB2 | +|560A6 |56 |Miscellaneous SQL or Product Error |0A6 |A table cannot have a LOB column unless it also has a ROWID column or cannot have an XML column unless it also has a DOCID.|DB2 |N |DB2 | +|560A7 |56 |Miscellaneous SQL or Product Error |0A7 |GBPCACHE NONE cannot be specified for a table space or index in GRECP.|DB2 |N |DB2 | +|560A8 |56 |Miscellaneous SQL or Product Error |0A8 |An 8K or 16K buffer pool pagesize is invalid for a WORKFILE object.|DB2 |N |DB2 | +|560A9 |56 |Miscellaneous SQL or Product Error |0A9 |A discontinued parameter, option, or clause was specified. |DB2 |N |DB2 | +|560AB |56 |Miscellaneous SQL or Product Error |0AB |The data type is not supported in an SQL routine. 
|DB2 |N |DB2 | +|560AD |56 |Miscellaneous SQL or Product Error |0AD |A view name was specified after LIKE in addition to the INCLUDING IDENTITY COLUMN ATTRIBUTES clause.|DB2 |N |DB2 | +|560AE |56 |Miscellaneous SQL or Product Error |0AE |The specified table or view is not allowed in a LIKE clause.|DB2 |N |DB2 | +|560B1 |56 |Miscellaneous SQL or Product Error |0B1 |Procedure failed because a result set was scrollable but the cursor was not positioned before the first row.|DB2 |N |DB2 | +|560B2 |56 |Miscellaneous SQL or Product Error |0B2 |Open failed because the cursor is scrollable but the client does not support scrollable cursors.|DB2 |N |DB2 | +|560B3 |56 |Miscellaneous SQL or Product Error |0B3 |Procedure failed because one or more result sets cannot be returned by the procedure to the calling application.|DB2 |N |DB2 | +|560B5 |56 |Miscellaneous SQL or Product Error |0B5 |Local special register is not valid as used. |DB2 |N |DB2 | +|560B8 |56 |Miscellaneous SQL or Product Error |0B8 |The SQL statement cannot be executed because it was precompiled at a level that is incompatible with the current value of the ENCODING bind option or special register.|DB2 |N |DB2 | +|560B9 |56 |Miscellaneous SQL or Product Error |0B9 |Hexadecimal constant GX is not allowed. |DB2 |N |DB2 | +|560BF |56 |Miscellaneous SQL or Product Error |0BF |The cryptographic facility has not been installed. |DB2 |N |DB2 | +|560C3 |56 |Miscellaneous SQL or Product Error |0C3 |An AFTER trigger cannot modify a row being inserted for an INSERT statement.|DB2 |N |DB2 | +|560C5 |56 |Miscellaneous SQL or Product Error |0C5 |The package must be bound or rebound to be successfully executed.|DB2 |N |DB2 | +|560C7 |56 |Miscellaneous SQL or Product Error |0C7 |ALTER VIEW failed. |DB2 |N |DB2 | +|560CC |56 |Miscellaneous SQL or Product Error |0CC |ALTER INDEX failed. |DB2 |N |DB2 | +|560CG |56 |Miscellaneous SQL or Product Error |0CG |An XML value contains a combination of XML nodes that causes an internal identifier limit to be exceeded.|DB2 |N |DB2 | +|560CH |56 |Miscellaneous SQL or Product Error |0CH |The maximum number of children nodes for an XML node in an XML value is exceeded.|DB2 |N |DB2 | +|560CK |56 |Miscellaneous SQL or Product Error |0CK |Explain monitored statements failed. |DB2 |N |DB2 | +|560CM |56 |Miscellaneous SQL or Product Error |0CM |An error occurred in a key expression evaluation. |DB2 |N |DB2 | +|560CU |56 |Miscellaneous SQL or Product Error |0CU |The VARCHAR option is not consistent with the option specified when the procedure was created.|DB2 |N |DB2 | +|560CV |56 |Miscellaneous SQL or Product Error |0CV |Invalid table reference for table locator. |DB2 |N |DB2 | +|560CY |56 |Miscellaneous SQL or Product Error |0CY |A period specification or period clause is not valid as specified.|DB2 |N |DB2 | +|560D5 |56 |Miscellaneous SQL or Product Error |0D5 |The statement cannot be executed by the query accelerator. 
|DB2 |N |DB2 | +|560DC |56 |Miscellaneous SQL or Product Error |0DC |An error was detected while using the z/OS Unicode Services.|DB2 |N |DB2 | +|57000 |57 |Operator Intervention |000 |operator_intervention |PostgreSQL |N |PostgreSQL Redshift | +|57001 |57 |Resource Not Available or Operator Intervention |001 |The table is unavailable, because it does not have a primary index.|DB2 |N |DB2 | +|57002 |57 |Resource Not Available or Operator Intervention |002 |GRANT and REVOKE are invalid, because authorization has been disabled.|DB2 |N |DB2 | +|57003 |57 |Resource Not Available or Operator Intervention |003 |The specified buffer pool has not been activated. |DB2 |N |DB2 | +|57004 |57 |Resource Not Available or Operator Intervention |004 |The table is unavailable, because it lacks a partitioned index.|DB2 |N |DB2 | +|57005 |57 |Resource Not Available or Operator Intervention |005 |The statement cannot be executed, because a utility or a governor time limit was exceeded.|DB2 |N |DB2 | +|57006 |57 |Resource Not Available or Operator Intervention |006 |The object cannot be created, because a DROP or CREATE is pending.|DB2 |N |DB2 | +|57007 |57 |Resource Not Available or Operator Intervention |007 |The object cannot be used, because an operation is pending. |DB2 |N |DB2 | +|57008 |57 |Resource Not Available or Operator Intervention |008 |The date or time local format exit has not been installed. |DB2 |N |DB2 | +|57010 |57 |Resource Not Available or Operator Intervention |010 |A field procedure could not be loaded. |DB2 |N |DB2 | +|57011 |57 |Resource Not Available or Operator Intervention |011 |Virtual storage or database resource is not available. |DB2 |N |DB2 | +|57012 |57 |Resource Not Available or Operator Intervention |012 |A non-database resource is not available. This will not affect the successful execution of subsequent statements.|DB2 |N |DB2 | +|57013 |57 |Resource Not Available or Operator Intervention |013 |A non-database resource is not available. This will affect the successful execution of subsequent statements.|DB2 |N |DB2 | +|57014 |57 |Resource Not Available or Operator Intervention |014 |Processing was canceled as requested. |DB2 |N |PostgreSQL DB2 Redshift | +|57015 |57 |Resource Not Available or Operator Intervention |015 |Connection to the local Db2 not established. |DB2 |N |DB2 | +|57017 |57 |Resource Not Available or Operator Intervention |017 |Character conversion is not defined. |DB2 |N |DB2 | +|57018 |57 |Resource Not Available or Operator Intervention |018 |A DDL registration table or its unique index does not exist.|DB2 |N |DB2 | +|57023 |57 |Resource Not Available or Operator Intervention |023 |The DDL statement cannot be executed, because a DROP is pending of a DDL registration table.|DB2 |N |DB2 | +|57033 |57 |Resource Not Available or Operator Intervention |033 |Deadlock or timeout occurred without automatic rollback. |DB2 |N |DB2 | +|57051 |57 |Resource Not Available or Operator Intervention |051 |The estimated CPU cost exceeds the resource limit. 
|DB2 |N |DB2 | +|57053 |57 |Resource Not Available or Operator Intervention |053 |A table is not available in a routine or trigger because of violated nested SQL statement rules.|DB2 |N |DB2 | +|57054 |57 |Resource Not Available or Operator Intervention |054 |A table is not available until the auxiliary tables and indexes for its externally stored columns have been created.|DB2 |N |DB2 | +|57057 |57 |Resource Not Available or Operator Intervention |057 |The SQL statement cannot be executed due to a prior condition in a DRDA chain of SQL statements.|DB2 |N |DB2 | +|57062 |57 |Resource Not Available or Operator Intervention |062 |Adjustment not allowed for a period as a result of a data change operation.|DB2 |N |DB2 | +|57P01 |57 |Operator Intervention |P01 |admin_shutdown |PostgreSQL |N |PostgreSQL Redshift | +|57P02 |57 |Operator Intervention |P02 |crash_shutdown |PostgreSQL |N |PostgreSQL Redshift | +|57P03 |57 |Operator Intervention |P03 |cannot_connect_now |PostgreSQL |N |PostgreSQL Redshift | +|57P04 |57 |Operator Intervention |P04 |database_dropped |PostgreSQL |N |PostgreSQL | +|57P05 |57 |Operator Intervention |P05 |idle_session_timeout |PostgreSQL |N |PostgreSQL | +|58000 |58 |System Error (error external to PostgreSQL itself)|000 |system_error |PostgreSQL |N |PostgreSQL Redshift | +|58001 |58 |System Error |001 |The database cannot be created, because the assigned DBID is a duplicate.|DB2 |N |DB2 | +|58002 |58 |System Error |002 |An exit has returned an error or invalid data. |DB2 |N |DB2 | +|58003 |58 |System Error |003 |An invalid section number was detected. |DB2 |N |DB2 | +|58004 |58 |System Error |004 |A system error (that does not necessarily preclude the successful execution of subsequent SQL statements) occurred.|DB2 |N |DB2 | +|58006 |58 |System Error |006 |A system error occurred during connection. |DB2 |N |DB2 | +|58008 |58 |System Error |008 |Execution failed due to a distribution protocol error that will not affect the successful execution of subsequent DDM commands or SQL statements.|DB2 |N |DB2 | +|58009 |58 |System Error |009 |Execution failed due to a distribution protocol error that caused deallocation of the conversation.|DB2 |N |DB2 | +|58010 |58 |System Error |010 |Execution failed due to a distribution protocol error that will affect the successful execution of subsequent DDM commands or SQL statements.|DB2 |N |DB2 | +|58011 |58 |System Error |011 |The DDM command is invalid while the bind process in progress.|DB2 |N |DB2 | +|58012 |58 |System Error |012 |The bind process with the specified package name and consistency token is not active.|DB2 |N |DB2 | +|58013 |58 |System Error |013 |The SQLCODE is inconsistent with the reply message. |DB2 |N |DB2 | +|58014 |58 |System Error |014 |The DDM command is not supported. |DB2 |N |DB2 | +|58015 |58 |System Error |015 |The DDM object is not supported. |DB2 |N |DB2 | +|58016 |58 |System Error |016 |The DDM parameter is not supported. |DB2 |N |DB2 | +|58017 |58 |System Error |017 |The DDM parameter value is not supported. |DB2 |N |DB2 | +|58018 |58 |System Error |018 |The DDM reply message is not supported. 
|DB2 |N |DB2 | +|58026 |58 |System Error |026 |The number of variables in the statement is not equal to the number of variables in SQLSTTVRB.|DB2 |N |DB2 | +|58030 |58 |System Error (error external to PostgreSQL itself)|030 |io_error |PostgreSQL |N |PostgreSQL Redshift | +|58P01 |58 |System Error (error external to PostgreSQL itself)|P01 |undefined_file |PostgreSQL |N |PostgreSQL Redshift | +|58P02 |58 |System Error (error external to PostgreSQL itself)|P02 |duplicate_file |PostgreSQL |N |PostgreSQL Redshift | +|5UA01 |5U |Common Utilities and Tools |A01 |The task cannot be removed because it is currently executing.|DB2 |N |DB2 | +|60000 |60 |system error |000 |system error |Oracle |N |Oracle | +|61000 |61 |shared server and detached process errors |000 |shared server and detached process errors |Oracle |N |Oracle | +|62000 |62 |shared server and detached process errors |000 |shared server and detached process errors |Oracle |N |Oracle | +|63000 |63 |Oracle*XA and two-task interface errors |000 |Oracle*XA and two-task interface errors |Oracle |N |Oracle | +|64000 |64 |control file, database file, and redo file errors; archival and media recovery errors|000 |control file, database file, and redo file errors; archival and media recovery errors|Oracle |N |Oracle | +|65000 |65 |PL/SQL errors |000 |PL/SQL errors |Oracle |N |Oracle | +|66000 |66 |Oracle Net driver errors |000 |Oracle Net driver errors |Oracle |N |Oracle | +|67000 |67 |licensing errors |000 |licensing errors |Oracle |N |Oracle | +|69000 |69 |SQL*Connect errors |000 |SQL*Connect errors |Oracle |N |Oracle | +|72000 |72 |Snapshot Failure |000 |snapshot_too_old |PostgreSQL |N |PostgreSQL Oracle | +|82100 |82 |out of memory (could not allocate) |100 |out of memory (could not allocate) |Oracle |N |Oracle | +|82101 |82 |out of memory (could not allocate) |101 |inconsistent cursor cache (UCE/CUC mismatch) |Oracle |N |Oracle | +|82102 |82 |out of memory (could not allocate) |102 |inconsistent cursor cache (no CUC entry for UCE) |Oracle |N |Oracle | +|82103 |82 |out of memory (could not allocate) |103 |inconsistent cursor cache (out-or-range CUC ref) |Oracle |N |Oracle | +|82104 |82 |out of memory (could not allocate) |104 |inconsistent cursor cache (no CUC available) |Oracle |N |Oracle | +|82105 |82 |out of memory (could not allocate) |105 |inconsistent cursor cache (no CUC entry in cache) |Oracle |N |Oracle | +|82106 |82 |out of memory (could not allocate) |106 |inconsistent cursor cache (invalid cursor number) |Oracle |N |Oracle | +|82107 |82 |out of memory (could not allocate) |107 |program too old for runtime library; re-precompile |Oracle |N |Oracle | +|82108 |82 |out of memory (could not allocate) |108 |invalid descriptor passed to runtime library |Oracle |N |Oracle | +|82109 |82 |out of memory (could not allocate) |109 |inconsistent host cache (out-or-range SIT ref) |Oracle |N |Oracle | +|82110 |82 |out of memory (could not allocate) |110 |inconsistent host cache (invalid SQL type) |Oracle |N |Oracle | +|82111 |82 |out of memory (could not allocate) |111 |heap consistency error |Oracle |N |Oracle | +|82113 |82 |out of memory (could not allocate) |113 |code generation internal consistency failed |Oracle |N |Oracle | +|82114 |82 |out of memory (could not allocate) |114 |reentrant code generator gave invalid context |Oracle |N |Oracle | +|82117 |82 |out of memory (could not allocate) |117 |invalid OPEN or PREPARE for this connection |Oracle |N |Oracle | +|82118 |82 |out of memory (could not allocate) |118 |application context not 
found |Oracle |N |Oracle | +|82119 |82 |out of memory (could not allocate) |119 |unable to obtain error message text |Oracle |N |Oracle | +|82120 |82 |out of memory (could not allocate) |120 |Precompiler/SQLLIB version mismatch |Oracle |N |Oracle | +|82121 |82 |out of memory (could not allocate) |121 |NCHAR error; fetched number of bytes is odd |Oracle |N |Oracle | +|82122 |82 |out of memory (could not allocate) |122 |EXEC TOOLS interface not available |Oracle |N |Oracle | +|82123 |82 |out of memory (could not allocate) |123 |runtime context in use |Oracle |N |Oracle | +|82124 |82 |out of memory (could not allocate) |124 |unable to allocate runtime context |Oracle |N |Oracle | +|82125 |82 |out of memory (could not allocate) |125 |unable to initialize process for use with threads |Oracle |N |Oracle | +|82126 |82 |out of memory (could not allocate) |126 |invalid runtime context |Oracle |N |Oracle | +|F0000 |F0 |Configuration File Error |000 |config_file_error |PostgreSQL |N |PostgreSQL Redshift | +|F0001 |F0 |Configuration File Error |001 |lock_file_exists |PostgreSQL |N |PostgreSQL Redshift | +|HV000 |HV |FDW-specific condition |000 |(no subclass) |SQL/MED |Y |SQL/MED PostgreSQL | +|HV001 |HV |FDW-specific condition |001 |memory allocation error |SQL/MED |Y |SQL/MED PostgreSQL | +|HV002 |HV |FDW-specific condition |002 |dynamic parameter value needed |SQL/MED |Y |SQL/MED PostgreSQL | +|HV004 |HV |FDW-specific condition |004 |invalid data type |SQL/MED |Y |SQL/MED PostgreSQL | +|HV005 |HV |FDW-specific condition |005 |column name not found |SQL/MED |Y |SQL/MED PostgreSQL | +|HV006 |HV |FDW-specific condition |006 |invalid data type descriptors |SQL/MED |Y |SQL/MED PostgreSQL | +|HV007 |HV |FDW-specific condition |007 |invalid column name |SQL/MED |Y |SQL/MED PostgreSQL | +|HV008 |HV |FDW-specific condition |008 |invalid column number |SQL/MED |Y |SQL/MED PostgreSQL | +|HV009 |HV |FDW-specific condition |009 |invalid use of null pointer |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00A |HV |FDW-specific condition |00A |invalid string format |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00B |HV |FDW-specific condition |00B |invalid handle |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00C |HV |FDW-specific condition |00C |invalid option index |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00D |HV |FDW-specific condition |00D |invalid option name |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00J |HV |FDW-specific condition |00J |option name not found |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00K |HV |FDW-specific condition |00K |reply handle |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00L |HV |FDW-specific condition |00L |unable to create execution |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00M |HV |FDW-specific condition |00M |unable to create reply |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00N |HV |FDW-specific condition |00N |unable to establish connection |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00P |HV |FDW-specific condition |00P |no schemas |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00Q |HV |FDW-specific condition |00Q |schema not found |SQL/MED |Y |SQL/MED PostgreSQL | +|HV00R |HV |FDW-specific condition |00R |table not found |SQL/MED |Y |SQL/MED PostgreSQL | +|HV010 |HV |FDW-specific condition |010 |function sequence error |SQL/MED |Y |SQL/MED PostgreSQL | +|HV014 |HV |FDW-specific condition |014 |limit on number of handles exceeded |SQL/MED |Y |SQL/MED PostgreSQL | +|HV021 |HV |FDW-specific condition |021 |inconsistent descriptor information |SQL/MED |Y |SQL/MED PostgreSQL | +|HV024 |HV |FDW-specific condition |024 |invalid attribute value |SQL/MED 
|Y |SQL/MED PostgreSQL | +|HV090 |HV |FDW-specific condition |090 |invalid string length or buffer length |SQL/MED |Y |SQL/MED PostgreSQL | +|HV091 |HV |FDW-specific condition |091 |invalid descriptor field identifier |SQL/MED |Y |SQL/MED PostgreSQL | +|HW000 |HW |datalink exception |000 |(no subclass) |SQL/MED |Y |SQL/MED | +|HW001 |HW |datalink exception |001 |external file not linked |SQL/MED |Y |SQL/MED | +|HW002 |HW |datalink exception |002 |external file already linked |SQL/MED |Y |SQL/MED | +|HW003 |HW |datalink exception |003 |referenced file does not exist |SQL/MED |Y |SQL/MED | +|HW004 |HW |datalink exception |004 |invalid write token |SQL/MED |Y |SQL/MED | +|HW005 |HW |datalink exception |005 |invalid datalink construction |SQL/MED |Y |SQL/MED | +|HW006 |HW |datalink exception |006 |invalid write permission for update |SQL/MED |Y |SQL/MED | +|HW007 |HW |datalink exception |007 |referenced file not valid |SQL/MED |Y |SQL/MED | +|HY000 |HY |CLI-specific condition |000 |(no subclass) |SQL/CLI |Y |SQL/CLI SQL Server | +|HY001 |HY |CLI-specific condition |001 |memory allocation error |SQL/CLI |Y |SQL/CLI SQL Server | +|HY003 |HY |CLI-specific condition |003 |invalid data type in application descriptor |SQL/CLI |Y |SQL/CLI SQL Server | +|HY004 |HY |CLI-specific condition |004 |invalid data type |SQL/CLI |Y |SQL/CLI SQL Server | +|HY007 |HY |CLI-specific condition |007 |associated statement is not prepared |SQL/CLI |Y |SQL/CLI SQL Server | +|HY008 |HY |CLI-specific condition |008 |operation canceled |SQL/CLI |Y |SQL/CLI SQL Server | +|HY009 |HY |CLI-specific condition |009 |invalid use of null pointer |SQL/CLI |Y |SQL/CLI SQL Server | +|HY010 |HY |CLI-specific condition |010 |function sequence error |SQL/CLI |Y |SQL/CLI SQL Server | +|HY011 |HY |CLI-specific condition |011 |attribute cannot be set now |SQL/CLI |Y |SQL/CLI SQL Server | +|HY012 |HY |CLI-specific condition |012 |invalid transaction operation code |SQL/CLI |Y |SQL/CLI SQL Server | +|HY013 |HY |CLI-specific condition |013 |memory management error |SQL/CLI |Y |SQL/CLI SQL Server | +|HY014 |HY |CLI-specific condition |014 |limit on number of handles exceeded |SQL/CLI |Y |SQL/CLI SQL Server | +|HY015 |HY |CLI-specific condition |015 |No cursor name available |SQL Server |N |SQL Server | +|HY016 |HY |CLI-specific condition |016 |Cannot modify an implementation row descriptor |SQL Server |N |SQL Server | +|HY017 |HY |CLI-specific condition |017 |invalid use of automatically-allocated descriptor handle |SQL/CLI |Y |SQL/CLI SQL Server | +|HY018 |HY |CLI-specific condition |018 |server declined the cancellation request |SQL/CLI |Y |SQL/CLI SQL Server | +|HY019 |HY |CLI-specific condition |019 |non-string data cannot be sent in pieces |SQL/CLI |Y |SQL/CLI SQL Server | +|HY020 |HY |CLI-specific condition |020 |attempt to concatenate a null value |SQL/CLI |Y |SQL/CLI SQL Server | +|HY021 |HY |CLI-specific condition |021 |inconsistent descriptor information |SQL/CLI |Y |SQL/CLI SQL Server | +|HY024 |HY |CLI-specific condition |024 |invalid attribute value |SQL/CLI |Y |SQL/CLI SQL Server | +|HY055 |HY |CLI-specific condition |055 |non-string data cannot be used with string routine |SQL/CLI |Y |SQL/CLI | +|HY090 |HY |CLI-specific condition |090 |invalid string length or buffer length |SQL/CLI |Y |SQL/CLI SQL Server | +|HY091 |HY |CLI-specific condition |091 |invalid descriptor field identifier |SQL/CLI |Y |SQL/CLI SQL Server | +|HY092 |HY |CLI-specific condition |092 |invalid attribute identifier |SQL/CLI |Y |SQL/CLI SQL Server | 
+|HY093 |HY |CLI-specific condition |093 |invalid datalink value |SQL/MED |Y |SQL/MED | +|HY095 |HY |CLI-specific condition |095 |invalid FunctionId specified |SQL/CLI |Y |SQL/CLI SQL Server | +|HY096 |HY |CLI-specific condition |096 |invalid information type |SQL/CLI |Y |SQL/CLI SQL Server | +|HY097 |HY |CLI-specific condition |097 |column type out of range |SQL/CLI |Y |SQL/CLI SQL Server | +|HY098 |HY |CLI-specific condition |098 |scope out of range |SQL/CLI |Y |SQL/CLI SQL Server | +|HY099 |HY |CLI-specific condition |099 |nullable type out of range |SQL/CLI |Y |SQL/CLI SQL Server | +|HY100 |HY |CLI-specific condition |100 |Uniqueness option type out of range |SQL Server |N |SQL Server | +|HY101 |HY |CLI-specific condition |101 |Accuracy option type out of range |SQL Server |N |SQL Server | +|HY103 |HY |CLI-specific condition |103 |invalid retrieval code |SQL/CLI |Y |SQL/CLI SQL Server | +|HY104 |HY |CLI-specific condition |104 |invalid LengthPrecision value |SQL/CLI |Y |SQL/CLI SQL Server | +|HY105 |HY |CLI-specific condition |105 |invalid parameter mode |SQL/CLI |Y |SQL/CLI SQL Server | +|HY106 |HY |CLI-specific condition |106 |invalid fetch orientation |SQL/CLI |Y |SQL/CLI SQL Server | +|HY107 |HY |CLI-specific condition |107 |row value out of range |SQL/CLI |Y |SQL/CLI SQL Server | +|HY109 |HY |CLI-specific condition |109 |invalid cursor position |SQL/CLI |Y |SQL/CLI SQL Server | +|HY110 |HY |CLI-specific condition |110 |Invalid driver completion |SQL Server |N |SQL Server | +|HY111 |HY |CLI-specific condition |111 |Invalid bookmark value |SQL Server |N |SQL Server | +|HYC00 |HY |CLI-specific condition |C00 |optional feature not implemented |SQL/CLI |Y |SQL/CLI SQL Server | +|HYT00 |HY |CLI-specific condition |T00 |Timeout expired |SQL Server |N |SQL Server | +|HYT01 |HY |CLI-specific condition |T01 |Connection timeout expired |SQL Server |N |SQL Server | +|HZ000 |HZ |RDA-specific condition |000 |(no subclass) |RDA/SQL |Y |RDA/SQL Oracle | +|HZ301 |HZ |RDA-specific condition |301 |attribute not permitted |RDA/SQL |Y |RDA/SQL | +|HZ302 |HZ |RDA-specific condition |302 |authentication failure |RDA/SQL |Y |RDA/SQL | +|HZ303 |HZ |RDA-specific condition |303 |duplicate request ident |RDA/SQL |Y |RDA/SQL | +|HZ304 |HZ |RDA-specific condition |304 |encoding not supported |RDA/SQL |Y |RDA/SQL | +|HZ305 |HZ |RDA-specific condition |305 |feature not supported – multiple server transactions |RDA/SQL |Y |RDA/SQL | +|HZ306 |HZ |RDA-specific condition |306 |invalid attribute type |RDA/SQL |Y |RDA/SQL | +|HZ307 |HZ |RDA-specific condition |307 |invalid fetch count |RDA/SQL |Y |RDA/SQL | +|HZ308 |HZ |RDA-specific condition |308 |invalid message type |RDA/SQL |Y |RDA/SQL | +|HZ309 |HZ |RDA-specific condition |309 |invalid operation sequence |RDA/SQL |Y |RDA/SQL | +|HZ310 |HZ |RDA-specific condition |310 |invalid transaction operation code |RDA/SQL |Y |RDA/SQL | +|HZ311 |HZ |RDA-specific condition |311 |mismatch between descriptor and row |RDA/SQL |Y |RDA/SQL | +|HZ312 |HZ |RDA-specific condition |312 |no connection handle available |RDA/SQL |Y |RDA/SQL | +|HZ313 |HZ |RDA-specific condition |313 |number of values does not match number of item descriptors |RDA/SQL |Y |RDA/SQL | +|HZ314 |HZ |RDA-specific condition |314 |transaction cannot commit |RDA/SQL |Y |RDA/SQL | +|HZ315 |HZ |RDA-specific condition |315 |transaction state unknown |RDA/SQL |Y |RDA/SQL | +|HZ316 |HZ |RDA-specific condition |316 |transport failure |RDA/SQL |Y |RDA/SQL | +|HZ317 |HZ |RDA-specific condition |317 |unexpected parameter 
descriptor |RDA/SQL |Y |RDA/SQL | +|HZ318 |HZ |RDA-specific condition |318 |unexpected row descriptor |RDA/SQL |Y |RDA/SQL | +|HZ319 |HZ |RDA-specific condition |319 |unexpected rows |RDA/SQL |Y |RDA/SQL | +|HZ320 |HZ |RDA-specific condition |320 |version not supported |RDA/SQL |Y |RDA/SQL | +|HZ321 |HZ |RDA-specific condition |321 |TCP/IP error |RDA/SQL |Y |RDA/SQL | +|HZ322 |HZ |RDA-specific condition |322 |TLS alert |RDA/SQL |Y |RDA/SQL | +|IM001 |IM |ODBC driver |001 |Driver does not support this function |SQL Server |N |SQL Server | +|IM002 |IM |ODBC driver |002 |Data source name not found and no default driver specified |SQL Server |N |SQL Server | +|IM003 |IM |ODBC driver |003 |Specified driver could not be loaded |SQL Server |N |SQL Server | +|IM004 |IM |ODBC driver |004 |Driver's SQLAllocHandle on SQL_HANDLE_ENV failed |SQL Server |N |SQL Server | +|IM005 |IM |ODBC driver |005 |Driver's SQLAllocHandle on SQL_HANDLE_DBC failed |SQL Server |N |SQL Server | +|IM006 |IM |ODBC driver |006 |Driver's SQLSetConnectAttr failed |SQL Server |N |SQL Server | +|IM007 |IM |ODBC driver |007 |No data source or driver specified; dialog prohibited |SQL Server |N |SQL Server | +|IM008 |IM |ODBC driver |008 |Dialog failed |SQL Server |N |SQL Server | +|IM009 |IM |ODBC driver |009 |Unable to load translation DLL |SQL Server |N |SQL Server | +|IM010 |IM |ODBC driver |010 |Data source name too long |SQL Server |N |SQL Server | +|IM011 |IM |ODBC driver |011 |Driver name too long |SQL Server |N |SQL Server | +|IM012 |IM |ODBC driver |012 |DRIVER keyword syntax error |SQL Server |N |SQL Server | +|IM013 |IM |ODBC driver |013 |Trace file error |SQL Server |N |SQL Server | +|IM014 |IM |ODBC driver |014 |Invalid name of File DSN |SQL Server |N |SQL Server | +|IM015 |IM |ODBC driver |015 |Corrupt file data source |SQL Server |N |SQL Server | +|P0000 |P0 |PL/pgSQL Error |000 |plpgsql_error |PostgreSQL |N |PostgreSQL Redshift | +|P0001 |P0 |PL/pgSQL Error |001 |raise_exception |PostgreSQL |N |PostgreSQL Redshift | +|P0002 |P0 |PL/pgSQL Error |002 |no_data_found |PostgreSQL |N |PostgreSQL Redshift | +|P0003 |P0 |PL/pgSQL Error |003 |too_many_rows |PostgreSQL |N |PostgreSQL Redshift | +|P0004 |P0 |PL/pgSQL Error |004 |assert_failure |PostgreSQL |N |PostgreSQL | +|XX000 |XX |Internal Error |000 |internal_error |PostgreSQL |N |PostgreSQL Redshift | +|XX001 |XX |Internal Error |001 |data_corrupted |PostgreSQL |N |PostgreSQL Redshift | +|XX002 |XX |Internal Error |002 |index_corrupted |PostgreSQL |N |PostgreSQL Redshift | diff --git a/core/src/main/resources/error/error-classes.json b/core/src/main/resources/error/error-classes.json index 89a9d5af587d7..febed9283d89d 100644 --- a/core/src/main/resources/error/error-classes.json +++ b/core/src/main/resources/error/error-classes.json @@ -1,243 +1,5548 @@ { - "AMBIGUOUS_FIELD_NAME" : { - "message" : [ "Field name is ambiguous and has matching fields in the struct." ], + "AMBIGUOUS_COLUMN_OR_FIELD" : { + "message" : [ + "Column or field is ambiguous and has matches." + ], + "sqlState" : "42702" + }, + "AMBIGUOUS_LATERAL_COLUMN_ALIAS" : { + "message" : [ + "Lateral column alias is ambiguous and has matches." + ], + "sqlState" : "42702" + }, + "AMBIGUOUS_REFERENCE" : { + "message" : [ + "Reference is ambiguous, could be: ." + ], + "sqlState" : "42704" + }, + "AMBIGUOUS_REFERENCE_TO_FIELDS" : { + "message" : [ + "Ambiguous reference to the field . It appears times in the schema." + ], "sqlState" : "42000" }, "ARITHMETIC_OVERFLOW" : { - "message" : [ ". 
If necessary set to \"false\" (except for ANSI interval type) to bypass this error." ], + "message" : [ + ". If necessary set to \"false\" to bypass this error." + ], "sqlState" : "22003" }, "CANNOT_CAST_DATATYPE" : { - "message" : [ "Cannot cast to ." ], - "sqlState" : "22005" + "message" : [ + "Cannot cast to ." + ], + "sqlState" : "42846" + }, + "CANNOT_CONSTRUCT_PROTOBUF_DESCRIPTOR" : { + "message" : [ + "Error constructing FileDescriptor for ." + ] + }, + "CANNOT_CONVERT_PROTOBUF_FIELD_TYPE_TO_SQL_TYPE" : { + "message" : [ + "Cannot convert Protobuf to SQL because schema is incompatible (protobufType = , sqlType = )." + ] + }, + "CANNOT_CONVERT_PROTOBUF_MESSAGE_TYPE_TO_SQL_TYPE" : { + "message" : [ + "Unable to convert of Protobuf to SQL type ." + ] + }, + "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_ENUM_TYPE" : { + "message" : [ + "Cannot convert SQL to Protobuf because cannot be written since it's not defined in ENUM ." + ] + }, + "CANNOT_CONVERT_SQL_TYPE_TO_PROTOBUF_FIELD_TYPE" : { + "message" : [ + "Cannot convert SQL to Protobuf because schema is incompatible (protobufType = , sqlType = )." + ] }, - "CANNOT_CHANGE_DECIMAL_PRECISION" : { - "message" : [ " cannot be represented as Decimal(, ). If necessary set to \"false\" to bypass this error." ], - "sqlState" : "22005" + "CANNOT_DECODE_URL" : { + "message" : [ + "Cannot decode url : ." + ], + "sqlState" : "22546" + }, + "CANNOT_LOAD_FUNCTION_CLASS" : { + "message" : [ + "Cannot load class when registering the function , please make sure it is on the classpath." + ] + }, + "CANNOT_LOAD_PROTOBUF_CLASS" : { + "message" : [ + "Could not load Protobuf class with name . ." + ] + }, + "CANNOT_MERGE_INCOMPATIBLE_DATA_TYPE" : { + "message" : [ + "Failed to merge incompatible data types and ." + ], + "sqlState" : "42825" + }, + "CANNOT_MODIFY_CONFIG" : { + "message" : [ + "Cannot modify the value of the Spark config: .", + "See also '/sql-migration-guide.html#ddl-statements'." + ], + "sqlState" : "46110" }, "CANNOT_PARSE_DECIMAL" : { - "message" : [ "Cannot parse decimal" ], - "sqlState" : "42000" + "message" : [ + "Cannot parse decimal." + ], + "sqlState" : "22018" }, - "CANNOT_UP_CAST_DATATYPE" : { - "message" : [ "Cannot up cast from to .\n
" ] + "CANNOT_PARSE_JSON_FIELD" : { + "message" : [ + "Cannot parse the field name and the value of the JSON token type to target Spark data type ." + ], + "sqlState" : "2203G" + }, + "CANNOT_PARSE_PROTOBUF_DESCRIPTOR" : { + "message" : [ + "Error parsing file descriptor byte[] into Descriptor object." + ] + }, + "CANNOT_PARSE_TIMESTAMP" : { + "message" : [ + ". If necessary set to \"false\" to bypass this error." + ], + "sqlState" : "22007" }, - "CANNOT_USE_MIXTURE" : { - "message" : [ "Cannot use a mixture of aggregate function and group aggregate pandas UDF" ] + "CANNOT_READ_FILE_FOOTER" : { + "message" : [ + "Could not read footer for file: ." + ] + }, + "CANNOT_RECOGNIZE_HIVE_TYPE" : { + "message" : [ + "Cannot recognize hive type string: , column: ." + ], + "sqlState" : "429BB" + }, + "CANNOT_RESTORE_PERMISSIONS_FOR_PATH" : { + "message" : [ + "Failed to set permissions on created path back to ." + ] + }, + "CANNOT_UP_CAST_DATATYPE" : { + "message" : [ + "Cannot up cast from to .", + "
" + ] }, "CAST_INVALID_INPUT" : { - "message" : [ "The value of the type cannot be cast to because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. If necessary set to \"false\" to bypass this error." ], - "sqlState" : "42000" + "message" : [ + "The value of the type cannot be cast to because it is malformed. Correct the value as per the syntax, or change its target type. Use `try_cast` to tolerate malformed input and return NULL instead. If necessary set to \"false\" to bypass this error." + ], + "sqlState" : "22018" }, "CAST_OVERFLOW" : { - "message" : [ "The value of the type cannot be cast to due to an overflow. Use `try_cast` to tolerate overflow and return NULL instead. If necessary set to \"false\" to bypass this error." ], - "sqlState" : "22005" + "message" : [ + "The value of the type cannot be cast to due to an overflow. Use `try_cast` to tolerate overflow and return NULL instead. If necessary set to \"false\" to bypass this error." + ], + "sqlState" : "22003" }, "CAST_OVERFLOW_IN_TABLE_INSERT" : { - "message" : [ "Fail to insert a value of type into the type column due to an overflow. Use `try_cast` on the input value to tolerate overflow and return NULL instead." ], - "sqlState" : "22005" + "message" : [ + "Fail to insert a value of type into the type column due to an overflow. Use `try_cast` on the input value to tolerate overflow and return NULL instead." + ], + "sqlState" : "22003" + }, + "COLUMN_ALREADY_EXISTS" : { + "message" : [ + "The column already exists. Consider to choose another name or rename the existing column." + ], + "sqlState" : "42711" + }, + "COLUMN_NOT_FOUND" : { + "message" : [ + "The column cannot be found. Verify the spelling and correctness of the column name according to the SQL config ." + ], + "sqlState" : "42703" + }, + "COMPARATOR_RETURNS_NULL" : { + "message" : [ + "The comparator has returned a NULL for a comparison between and . It should return a positive integer for \"greater than\", 0 for \"equal\" and a negative integer for \"less than\". To revert to deprecated behavior where NULL is treated as 0 (equal), you must set \"spark.sql.legacy.allowNullComparisonResultInArraySort\" to \"true\"." + ] }, "CONCURRENT_QUERY" : { - "message" : [ "Another instance of this query was just started by a concurrent session." ] + "message" : [ + "Another instance of this query was just started by a concurrent session." + ] + }, + "CONNECT" : { + "message" : [ + "Generic Spark Connect error." + ], + "subClass" : { + "INTERCEPTOR_CTOR_MISSING" : { + "message" : [ + "Cannot instantiate GRPC interceptor because is missing a default constructor without arguments." + ] + }, + "INTERCEPTOR_RUNTIME_ERROR" : { + "message" : [ + "Error instantiating GRPC interceptor: " + ] + }, + "PLUGIN_CTOR_MISSING" : { + "message" : [ + "Cannot instantiate Spark Connect plugin because is missing a default constructor without arguments." + ] + }, + "PLUGIN_RUNTIME_ERROR" : { + "message" : [ + "Error instantiating Spark Connect plugin: " + ] + } + } + }, + "CONVERSION_INVALID_INPUT" : { + "message" : [ + "The value () cannot be converted to because it is malformed. Correct the value as per the syntax, or change its format. Use to tolerate malformed input and return NULL instead." + ], + "sqlState" : "22018" + }, + "CREATE_TABLE_COLUMN_OPTION_DUPLICATE" : { + "message" : [ + "CREATE TABLE column specifies option \"\" more than once, which is invalid." 
+ ], + "sqlState" : "42710" + }, + "DATATYPE_MISMATCH" : { + "message" : [ + "Cannot resolve due to data type mismatch:" + ], + "subClass" : { + "ARRAY_FUNCTION_DIFF_TYPES" : { + "message" : [ + "Input to should have been followed by a value with same element type, but it's [, ]." + ] + }, + "BINARY_ARRAY_DIFF_TYPES" : { + "message" : [ + "Input to function should have been two with same element type, but it's [, ]." + ] + }, + "BINARY_OP_DIFF_TYPES" : { + "message" : [ + "the left and right operands of the binary operator have incompatible types ( and )." + ] + }, + "BINARY_OP_WRONG_TYPE" : { + "message" : [ + "the binary operator requires the input type , not ." + ] + }, + "BLOOM_FILTER_BINARY_OP_WRONG_TYPE" : { + "message" : [ + "The Bloom filter binary input to should be either a constant value or a scalar subquery expression, but it's ." + ] + }, + "BLOOM_FILTER_WRONG_TYPE" : { + "message" : [ + "Input to function should have been followed by value with , but it's []." + ] + }, + "CANNOT_CONVERT_TO_JSON" : { + "message" : [ + "Unable to convert column of type to JSON." + ] + }, + "CANNOT_DROP_ALL_FIELDS" : { + "message" : [ + "Cannot drop all fields in struct." + ] + }, + "CAST_WITHOUT_SUGGESTION" : { + "message" : [ + "cannot cast to ." + ] + }, + "CAST_WITH_CONF_SUGGESTION" : { + "message" : [ + "cannot cast to with ANSI mode on.", + "If you have to cast to , you can set as ." + ] + }, + "CAST_WITH_FUNC_SUGGESTION" : { + "message" : [ + "cannot cast to .", + "To convert values from to , you can use the functions instead." + ] + }, + "CREATE_MAP_KEY_DIFF_TYPES" : { + "message" : [ + "The given keys of function should all be the same type, but they are ." + ] + }, + "CREATE_MAP_VALUE_DIFF_TYPES" : { + "message" : [ + "The given values of function should all be the same type, but they are ." + ] + }, + "CREATE_NAMED_STRUCT_WITHOUT_FOLDABLE_STRING" : { + "message" : [ + "Only foldable `STRING` expressions are allowed to appear at odd position, but they are ." + ] + }, + "DATA_DIFF_TYPES" : { + "message" : [ + "Input to should all be the same type, but it's ." + ] + }, + "FILTER_NOT_BOOLEAN" : { + "message" : [ + "Filter expression of type is not a boolean." + ] + }, + "HASH_MAP_TYPE" : { + "message" : [ + "Input to the function cannot contain elements of the \"MAP\" type. In Spark, same maps may have different hashcode, thus hash expressions are prohibited on \"MAP\" elements. To restore previous behavior set \"spark.sql.legacy.allowHashOnMapType\" to \"true\"." + ] + }, + "INPUT_SIZE_NOT_ONE" : { + "message" : [ + "Length of should be 1." + ] + }, + "INVALID_ARG_VALUE" : { + "message" : [ + "The value must to be a literal of , but got ." + ] + }, + "INVALID_JSON_MAP_KEY_TYPE" : { + "message" : [ + "Input schema can only contain STRING as a key type for a MAP." + ] + }, + "INVALID_JSON_SCHEMA" : { + "message" : [ + "Input schema must be a struct, an array or a map." + ] + }, + "INVALID_MAP_KEY_TYPE" : { + "message" : [ + "The key of map cannot be/contain ." + ] + }, + "INVALID_ORDERING_TYPE" : { + "message" : [ + "The does not support ordering on type ." + ] + }, + "IN_SUBQUERY_DATA_TYPE_MISMATCH" : { + "message" : [ + "The data type of one or more elements in the left hand side of an IN subquery is not compatible with the data type of the output of the subquery. Mismatched columns: [], left side: [], right side: []." 
+        ]
+      },
+      "IN_SUBQUERY_LENGTH_MISMATCH" : {
+        "message" : [
+          "The number of columns in the left hand side of an IN subquery does not match the number of columns in the output of subquery. Left hand side columns(length: ): [], right hand side columns(length: ): []."
+        ]
+      },
+      "MAP_CONCAT_DIFF_TYPES" : {
+        "message" : [
+          "The should all be of type map, but it's ."
+        ]
+      },
+      "MAP_FUNCTION_DIFF_TYPES" : {
+        "message" : [
+          "Input to should have been followed by a value with same key type, but it's [, ]."
+        ]
+      },
+      "MAP_ZIP_WITH_DIFF_TYPES" : {
+        "message" : [
+          "Input to the should have been two maps with compatible key types, but it's [, ]."
+        ]
+      },
+      "NON_FOLDABLE_INPUT" : {
+        "message" : [
+          "the input should be a foldable expression; however, got ."
+        ]
+      },
+      "NON_STRING_TYPE" : {
+        "message" : [
+          "all arguments must be strings."
+        ]
+      },
+      "NULL_TYPE" : {
+        "message" : [
+          "Null typed values cannot be used as arguments of ."
+        ]
+      },
+      "PARAMETER_CONSTRAINT_VIOLATION" : {
+        "message" : [
+          "The () must be the ()."
+        ]
+      },
+      "RANGE_FRAME_INVALID_TYPE" : {
+        "message" : [
+          "The data type used in the order specification does not match the data type which is used in the range frame."
+        ]
+      },
+      "RANGE_FRAME_MULTI_ORDER" : {
+        "message" : [
+          "A range window frame with value boundaries cannot be used in a window specification with multiple order by expressions: ."
+        ]
+      },
+      "RANGE_FRAME_WITHOUT_ORDER" : {
+        "message" : [
+          "A range window frame cannot be used in an unordered window specification."
+        ]
+      },
+      "SEQUENCE_WRONG_INPUT_TYPES" : {
+        "message" : [
+          " uses the wrong parameter type. The parameter type must conform to:",
+          "1. The start and stop expressions must resolve to the same type.",
+          "2. If start and stop expressions resolve to the type, then the step expression must resolve to the type.",
+          "3. Otherwise, if start and stop expressions resolve to the type, then the step expression must resolve to the same type."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_DIFF_TYPES" : {
+        "message" : [
+          "Window frame bounds and do not have the same type: <> ."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_INVALID_BOUND" : {
+        "message" : [
+          "Window frame upper bound does not follow the lower bound ."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_UNACCEPTED_TYPE" : {
+        "message" : [
+          "The data type of the bound does not match the expected data type ."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_WITHOUT_FOLDABLE" : {
+        "message" : [
+          "Window frame bound is not a literal."
+        ]
+      },
+      "SPECIFIED_WINDOW_FRAME_WRONG_COMPARISON" : {
+        "message" : [
+          "The lower bound of a window frame must be to the upper bound."
+        ]
+      },
+      "STACK_COLUMN_DIFF_TYPES" : {
+        "message" : [
+          "The data type of the column () do not have the same type: () <> ()."
+        ]
+      },
+      "UNEXPECTED_CLASS_TYPE" : {
+        "message" : [
+          "class not found."
+        ]
+      },
+      "UNEXPECTED_INPUT_TYPE" : {
+        "message" : [
+          "Parameter requires the type, however has the type ."
+        ]
+      },
+      "UNEXPECTED_NULL" : {
+        "message" : [
+          "The must not be null."
+        ]
+      },
+      "UNEXPECTED_RETURN_TYPE" : {
+        "message" : [
+          "The requires return type, but the actual is type."
+        ]
+      },
+      "UNEXPECTED_STATIC_METHOD" : {
+        "message" : [
+          "cannot find a static method that matches the argument types in ."
+        ]
+      },
+      "UNSUPPORTED_INPUT_TYPE" : {
+        "message" : [
+          "The input of can't be type data."
+        ]
+      },
+      "VALUE_OUT_OF_RANGE" : {
+        "message" : [
+          "The must be between (current value = )."
+        ]
+      },
+      "WRONG_NUM_ENDPOINTS" : {
+        "message" : [
+          "The number of endpoints must be >= 2 to construct intervals but the actual number is ."
+        ]
+      }
+    },
+    "sqlState" : "42K09"
+  },
+  "DATATYPE_MISSING_SIZE" : {
+    "message" : [
+      "DataType requires a length parameter, for example (10). Please specify the length."
+    ],
+    "sqlState" : "42K01"
+  },
+  "DATA_SOURCE_NOT_FOUND" : {
+    "message" : [
+      "Failed to find the data source: . Please find packages at `https://spark.apache.org/third-party-projects.html`."
+    ],
+    "sqlState" : "42K02"
   },
   "DATETIME_OVERFLOW" : {
-    "message" : [ "Datetime operation overflow: ." ],
+    "message" : [
+      "Datetime operation overflow: ."
+    ],
     "sqlState" : "22008"
   },
+  "DECIMAL_PRECISION_EXCEEDS_MAX_PRECISION" : {
+    "message" : [
+      "Decimal precision exceeds max precision ."
+    ],
+    "sqlState" : "22003"
+  },
+  "DEFAULT_DATABASE_NOT_EXISTS" : {
+    "message" : [
+      "Default database does not exist, please create it first or change default database to ``."
+    ],
+    "sqlState" : "42704"
+  },
   "DIVIDE_BY_ZERO" : {
-    "message" : [ "Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set to \"false\" (except for ANSI interval type) to bypass this error." ],
+    "message" : [
+      "Division by zero. Use `try_divide` to tolerate divisor being 0 and return NULL instead. If necessary set to \"false\" to bypass this error."
+    ],
     "sqlState" : "22012"
   },
+  "DUPLICATED_MAP_KEY" : {
+    "message" : [
+      "Duplicate map key was found, please check the input data. If you want to remove the duplicated keys, you can set to \"LAST_WIN\" so that the key inserted at last takes precedence."
+    ],
+    "sqlState" : "23505"
+  },
   "DUPLICATE_KEY" : {
-    "message" : [ "Found duplicate keys " ],
-    "sqlState" : "23000"
+    "message" : [
+      "Found duplicate keys ."
+    ],
+    "sqlState" : "23505"
+  },
+  "EMPTY_JSON_FIELD_VALUE" : {
+    "message" : [
+      "Failed to parse an empty string for data type ."
+    ],
+    "sqlState" : "42604"
+  },
+  "ENCODER_NOT_FOUND" : {
+    "message" : [
+      "Not found an encoder of the type to Spark SQL internal representation. Consider to change the input type to one of supported at '/sql-ref-datatypes.html'."
+    ]
   },
   "FAILED_EXECUTE_UDF" : {
-    "message" : [ "Failed to execute user defined function (: () => )" ]
+    "message" : [
+      "Failed to execute user defined function (: () => )."
+    ],
+    "sqlState" : "39000"
+  },
+  "FAILED_FUNCTION_CALL" : {
+    "message" : [
+      "Failed preparing of the function for call. Please, double check function's arguments."
+    ],
+    "sqlState" : "38000"
   },
   "FAILED_RENAME_PATH" : {
-    "message" : [ "Failed to rename to as destination already exists" ],
-    "sqlState" : "22023"
+    "message" : [
+      "Failed to rename to as destination already exists."
+    ],
+    "sqlState" : "42K04"
+  },
+  "FIELD_NOT_FOUND" : {
+    "message" : [
+      "No such struct field in ."
+    ],
+    "sqlState" : "42704"
+  },
+  "FORBIDDEN_OPERATION" : {
+    "message" : [
+      "The operation is not allowed on the : ."
+    ],
+    "sqlState" : "42809"
+  },
+  "GENERATED_COLUMN_WITH_DEFAULT_VALUE" : {
+    "message" : [
+      "A column cannot have both a default value and a generation expression but column has default value: () and generation expression: ()."
+    ]
   },
   "GRAPHITE_SINK_INVALID_PROTOCOL" : {
-    "message" : [ "Invalid Graphite protocol: " ]
+    "message" : [
+      "Invalid Graphite protocol: ."
+    ]
   },
   "GRAPHITE_SINK_PROPERTY_MISSING" : {
-    "message" : [ "Graphite sink requires '' property." ]
+    "message" : [
+      "Graphite sink requires '' property."
+    ]
   },
   "GROUPING_COLUMN_MISMATCH" : {
-    "message" : [ "Column of grouping () can't be found in grouping columns " ],
-    "sqlState" : "42000"
+    "message" : [
+      "Column of grouping () can't be found in grouping columns ."
+    ],
+    "sqlState" : "42803"
   },
   "GROUPING_ID_COLUMN_MISMATCH" : {
-    "message" : [ "Columns of grouping_id () does not match grouping columns ()" ],
-    "sqlState" : "42000"
+    "message" : [
+      "Columns of grouping_id () does not match grouping columns ()."
+    ],
+    "sqlState" : "42803"
   },
   "GROUPING_SIZE_LIMIT_EXCEEDED" : {
-    "message" : [ "Grouping sets size cannot be greater than " ]
+    "message" : [
+      "Grouping sets size cannot be greater than ."
+    ],
+    "sqlState" : "54000"
+  },
+  "GROUP_BY_AGGREGATE" : {
+    "message" : [
+      "Aggregate functions are not allowed in GROUP BY, but found ."
+    ],
+    "sqlState" : "42903"
+  },
+  "GROUP_BY_POS_AGGREGATE" : {
+    "message" : [
+      "GROUP BY refers to an expression that contains an aggregate function. Aggregate functions are not allowed in GROUP BY."
+    ],
+    "sqlState" : "42903"
+  },
+  "GROUP_BY_POS_OUT_OF_RANGE" : {
+    "message" : [
+      "GROUP BY position is not in select list (valid range is [1, ])."
+    ],
+    "sqlState" : "42805"
+  },
+  "IDENTIFIER_TOO_MANY_NAME_PARTS" : {
+    "message" : [
+      " is not a valid identifier as it has more than 2 name parts."
+    ],
+    "sqlState" : "42601"
   },
   "INCOMPARABLE_PIVOT_COLUMN" : {
-    "message" : [ "Invalid pivot column . Pivot columns must be comparable." ],
-    "sqlState" : "42000"
+    "message" : [
+      "Invalid pivot column . Pivot columns must be comparable."
+    ],
+    "sqlState" : "42818"
+  },
+  "INCOMPATIBLE_COLUMN_TYPE" : {
+    "message" : [
+      " can only be performed on tables with compatible column types. The column of the table is type which is not compatible with at the same column of the first table.."
+    ],
+    "sqlState" : "42825"
   },
   "INCOMPATIBLE_DATASOURCE_REGISTER" : {
-    "message" : [ "Detected an incompatible DataSourceRegister. Please remove the incompatible library from classpath or upgrade it. Error: " ]
+    "message" : [
+      "Detected an incompatible DataSourceRegister. Please remove the incompatible library from classpath or upgrade it. Error: "
+    ]
+  },
+  "INCOMPATIBLE_JOIN_TYPES" : {
+    "message" : [
+      "The join types and are incompatible."
+    ],
+    "sqlState" : "42613"
+  },
+  "INCOMPATIBLE_VIEW_SCHEMA_CHANGE" : {
+    "message" : [
+      "The SQL query of view has an incompatible schema change and column cannot be resolved. Expected columns named but got .",
+      "Please try to re-create the view by running: ."
+    ]
+  },
+  "INCOMPLETE_TYPE_DEFINITION" : {
+    "message" : [
+      "Incomplete complex type:"
+    ],
+    "subClass" : {
+      "ARRAY" : {
+        "message" : [
+          "The definition of \"ARRAY\" type is incomplete. You must provide an element type. For example: \"ARRAY\"."
+        ]
+      },
+      "MAP" : {
+        "message" : [
+          "The definition of \"MAP\" type is incomplete. You must provide a key type and a value type. For example: \"MAP\"."
+        ]
+      },
+      "STRUCT" : {
+        "message" : [
+          "The definition of \"STRUCT\" type is incomplete. You must provide at least one field type. For example: \"STRUCT\"."
+        ]
+      }
+    },
+    "sqlState" : "42K01"
   },
   "INCONSISTENT_BEHAVIOR_CROSS_VERSION" : {
-    "message" : [ "You may get a different result due to the upgrading to Spark >= :" ]
+    "message" : [
+      "You may get a different result due to the upgrading to"
+    ],
+    "subClass" : {
+      "DATETIME_PATTERN_RECOGNITION" : {
+        "message" : [
+          "Spark >= 3.0:",
+          "Fail to recognize pattern in the DateTimeFormatter. 1) You can set to \"LEGACY\" to restore the behavior before Spark 3.0. 2) You can form a valid datetime pattern with the guide from '/sql-ref-datetime-pattern.html'."
+        ]
+      },
+      "PARSE_DATETIME_BY_NEW_PARSER" : {
+        "message" : [
+          "Spark >= 3.0:",
+          "Fail to parse in the new parser. You can set to \"LEGACY\" to restore the behavior before Spark 3.0, or set to \"CORRECTED\" and treat it as an invalid datetime string."
+        ]
+      },
+      "READ_ANCIENT_DATETIME" : {
+        "message" : [
+          "Spark >= 3.0:",
+          "reading dates before 1582-10-15 or timestamps before 1900-01-01T00:00:00Z",
+          "from files can be ambiguous, as the files may be written by",
+          "Spark 2.x or legacy versions of Hive, which uses a legacy hybrid calendar",
+          "that is different from Spark 3.0+'s Proleptic Gregorian calendar.",
+          "See more details in SPARK-31404. You can set the SQL config or",
+          "the datasource option