
Commit

update branch
laguilarlyft committed Nov 16, 2023
2 parents a96161b + 17a8e67 commit cf294bd
Showing 5,822 changed files with 366,609 additions and 99,208 deletions.
The diff you're trying to view is too large. We only load the first 3000 changed files.
.asf.yaml: 5 changes (5 additions, 0 deletions)
@@ -31,3 +31,8 @@ github:
merge: false
squash: true
rebase: true

notifications:
pullrequests: [email protected]
issues: [email protected]
commits: [email protected]
.github/labeler.yml: 24 changes (16 additions, 8 deletions)
@@ -84,12 +84,12 @@ SPARK SHELL:
- "repl/**/*"
- "bin/spark-shell*"
SQL:
#- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming.py", "!python/pyspark/sql/tests/test_streaming.py"]
#- any: ["**/sql/**/*", "!python/pyspark/sql/avro/**/*", "!python/pyspark/sql/streaming/**/*", "!python/pyspark/sql/tests/streaming/test_streaming.py"]
- "**/sql/**/*"
- "common/unsafe/**/*"
#- "!python/pyspark/sql/avro/**/*"
#- "!python/pyspark/sql/streaming.py"
#- "!python/pyspark/sql/tests/test_streaming.py"
#- "!python/pyspark/sql/streaming/**/*"
#- "!python/pyspark/sql/tests/streaming/test_streaming.py"
- "bin/spark-sql*"
- "bin/beeline*"
- "sbin/*thriftserver*.sh"
@@ -103,7 +103,7 @@ SQL:
- "**/*schema.R"
- "**/*types.R"
AVRO:
- "external/avro/**/*"
- "connector/avro/**/*"
- "python/pyspark/sql/avro/**/*"
DSTREAM:
- "streaming/**/*"
@@ -123,13 +123,15 @@ MLLIB:
- "python/pyspark/mllib/**/*"
STRUCTURED STREAMING:
- "**/sql/**/streaming/**/*"
- "external/kafka-0-10-sql/**/*"
- "python/pyspark/sql/streaming.py"
- "python/pyspark/sql/tests/test_streaming.py"
- "connector/kafka-0-10-sql/**/*"
- "python/pyspark/sql/streaming/**/*"
- "python/pyspark/sql/tests/streaming/test_streaming.py"
- "**/*streaming.R"
PYTHON:
- "bin/pyspark*"
- "**/python/**/*"
PANDAS API ON SPARK:
- "python/pyspark/pandas/**/*"
R:
- "**/r/**/*"
- "**/R/**/*"
@@ -149,4 +151,10 @@ WEB UI:
- "**/*UI.scala"
DEPLOY:
- "sbin/**/*"

CONNECT:
- "connector/connect/**/*"
- "**/sql/sparkconnect/**/*"
- "python/pyspark/sql/**/connect/**/*"
PROTOBUF:
- "connector/protobuf/**/*"
- "python/pyspark/sql/protobuf/**/*"
.github/workflows/benchmark.yml: 98 changes (86 additions, 12 deletions)
@@ -30,6 +30,10 @@ on:
description: 'JDK version: 8, 11 or 17'
required: true
default: '8'
scala:
description: 'Scala version: 2.12 or 2.13'
required: true
default: '2.12'
failfast:
description: 'Failfast: true or false'
required: true
@@ -50,11 +54,69 @@ jobs:
steps:
- name: Generate matrix
id: set-matrix
run: echo "::set-output name=matrix::["`seq -s, 1 $SPARK_BENCHMARK_NUM_SPLITS`"]"
run: echo "matrix=["`seq -s, 1 $SPARK_BENCHMARK_NUM_SPLITS`"]" >> $GITHUB_OUTPUT

# Any TPC-DS related updates on this job need to be applied to tpcds-1g job of build_and_test.yml as well
tpcds-1g-gen:
name: "Generate an input dataset for TPCDSQueryBenchmark with SF=1"
if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*')
runs-on: ubuntu-20.04
env:
SPARK_LOCAL_IP: localhost
steps:
- name: Checkout Spark repository
uses: actions/checkout@v3
# In order to get diff files
with:
fetch-depth: 0
- name: Cache Scala, SBT and Maven
uses: actions/cache@v3
with:
path: |
build/apache-maven-*
build/scala-*
build/*.jar
~/.sbt
key: build-${{ hashFiles('**/pom.xml', 'project/build.properties', 'build/mvn', 'build/sbt', 'build/sbt-launch-lib.bash', 'build/spark-build-info') }}
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v3
with:
path: ~/.cache/coursier
key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
benchmark-coursier-${{ github.event.inputs.jdk }}
- name: Cache TPC-DS generated data
id: cache-tpcds-sf-1
uses: actions/cache@v3
with:
path: ./tpcds-sf-1
key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
- name: Checkout tpcds-kit repository
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
uses: actions/checkout@v3
with:
repository: databricks/tpcds-kit
ref: 2a5078a782192ddb6efbcead8de9973d6ab4f069
path: ./tpcds-kit
- name: Build tpcds-kit
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: cd tpcds-kit/tools && make OS=LINUX
- name: Install Java ${{ github.event.inputs.jdk }}
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
uses: actions/setup-java@v3
with:
distribution: temurin
java-version: ${{ github.event.inputs.jdk }}
- name: Generate TPC-DS (SF=1) table data
if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
run: build/sbt "sql/Test/runMain org.apache.spark.sql.GenTPCDSData --dsdgenDir `pwd`/tpcds-kit/tools --location `pwd`/tpcds-sf-1 --scaleFactor 1 --numPartitions 1 --overwrite"

benchmark:
name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
needs: matrix-gen
name: "Run benchmarks: ${{ github.event.inputs.class }} (JDK ${{ github.event.inputs.jdk }}, Scala ${{ github.event.inputs.scala }}, ${{ matrix.split }} out of ${{ github.event.inputs.num-splits }} splits)"
if: always()
needs: [matrix-gen, tpcds-1g-gen]
# Ubuntu 20.04 is the latest LTS. The next LTS is 22.04.
runs-on: ubuntu-20.04
strategy:
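Two of the changes in this hunk interact: the matrix-gen step now writes its output by appending a name=value pair to the file referenced by GITHUB_OUTPUT (the ::set-output workflow command it replaces is deprecated by GitHub Actions), and the benchmark job declares needs: [matrix-gen, tpcds-1g-gen] so it can consume that output. A minimal sketch of the pattern follows, with illustrative job and step wiring rather than the exact contents of benchmark.yml.

# Minimal sketch of the set-output -> GITHUB_OUTPUT pattern; not the full workflow.
jobs:
  matrix-gen:
    runs-on: ubuntu-20.04
    outputs:
      matrix: ${{ steps.set-matrix.outputs.matrix }}
    steps:
      - name: Generate matrix
        id: set-matrix
        # Deprecated: echo "::set-output name=matrix::[1,2,3]"
        # Current:    append a name=value pair to the $GITHUB_OUTPUT file.
        run: echo "matrix=[1,2,3]" >> "$GITHUB_OUTPUT"

  benchmark:
    needs: matrix-gen
    runs-on: ubuntu-20.04
    strategy:
      matrix:
        split: ${{ fromJson(needs.matrix-gen.outputs.matrix) }}
    steps:
      - name: Run one split
        run: echo "Running split ${{ matrix.split }}"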
@@ -69,14 +131,15 @@ jobs:
SPARK_LOCAL_IP: localhost
# To prevent spark.test.home not being set. See more detail in SPARK-36007.
SPARK_HOME: ${{ github.workspace }}
SPARK_TPCDS_DATA: ${{ github.workspace }}/tpcds-sf-1
steps:
- name: Checkout Spark repository
uses: actions/checkout@v2
uses: actions/checkout@v3
# In order to get diff files
with:
fetch-depth: 0
- name: Cache Scala, SBT and Maven
uses: actions/cache@v2
uses: actions/cache@v3
with:
path: |
build/apache-maven-*
@@ -87,19 +150,28 @@ jobs:
restore-keys: |
build-
- name: Cache Coursier local repository
uses: actions/cache@v2
uses: actions/cache@v3
with:
path: ~/.cache/coursier
key: benchmark-coursier-${{ github.event.inputs.jdk }}-${{ hashFiles('**/pom.xml', '**/plugins.sbt') }}
restore-keys: |
benchmark-coursier-${{ github.event.inputs.jdk }}
- name: Install Java ${{ github.event.inputs.jdk }}
uses: actions/setup-java@v1
uses: actions/setup-java@v3
with:
distribution: temurin
java-version: ${{ github.event.inputs.jdk }}
- name: Cache TPC-DS generated data
if: contains(github.event.inputs.class, 'TPCDSQueryBenchmark') || contains(github.event.inputs.class, '*')
id: cache-tpcds-sf-1
uses: actions/cache@v3
with:
path: ./tpcds-sf-1
key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
- name: Run benchmarks
run: |
./build/sbt -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl test:package
dev/change-scala-version.sh ${{ github.event.inputs.scala }}
./build/sbt -Pscala-${{ github.event.inputs.scala }} -Pyarn -Pmesos -Pkubernetes -Phive -Phive-thriftserver -Phadoop-cloud -Pkinesis-asl -Pspark-ganglia-lgpl Test/package
# Make less noisy
cp conf/log4j2.properties.template conf/log4j2.properties
sed -i 's/rootLogger.level = info/rootLogger.level = warn/g' conf/log4j2.properties
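The Cache TPC-DS generated data step added in this hunk uses the same path and cache key as the tpcds-1g-gen job above; that shared key is how the generated dataset reaches each benchmark split without being regenerated. A condensed sketch of the hand-off follows, with the generation command elided and other steps omitted.

# Condensed sketch of the TPC-DS cache hand-off between jobs; keys and paths
# are taken from the diff above, everything else is omitted.
jobs:
  tpcds-1g-gen:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v3
      - name: Cache TPC-DS generated data
        id: cache-tpcds-sf-1
        uses: actions/cache@v3
        with:
          path: ./tpcds-sf-1
          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}
      - name: Generate TPC-DS (SF=1) table data
        if: steps.cache-tpcds-sf-1.outputs.cache-hit != 'true'
        run: echo "generate ./tpcds-sf-1 here (see the real step above)"

  benchmark:
    needs: tpcds-1g-gen
    runs-on: ubuntu-20.04
    env:
      SPARK_TPCDS_DATA: ${{ github.workspace }}/tpcds-sf-1
    steps:
      - name: Restore TPC-DS generated data
        uses: actions/cache@v3
        with:
          path: ./tpcds-sf-1
          key: tpcds-${{ hashFiles('.github/workflows/benchmark.yml', 'sql/core/src/test/scala/org/apache/spark/sql/TPCDSSchema.scala') }}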
@@ -109,13 +181,15 @@ jobs:
--jars "`find . -name '*-SNAPSHOT-tests.jar' -o -name '*avro*-SNAPSHOT.jar' | paste -sd ',' -`" \
"`find . -name 'spark-core*-SNAPSHOT-tests.jar'`" \
"${{ github.event.inputs.class }}"
# Revert to default Scala version to clean up unnecessary git diff
dev/change-scala-version.sh 2.12
# To keep the directory structure and file permissions, tar them
# See also https://github.com/actions/upload-artifact#maintaining-file-permissions-and-case-sensitive-files
echo "Preparing the benchmark results:"
tar -cvf benchmark-results-${{ github.event.inputs.jdk }}.tar `git diff --name-only` `git ls-files --others --exclude-standard`
tar -cvf benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar `git diff --name-only` `git ls-files --others --exclude=tpcds-sf-1 --exclude-standard`
- name: Upload benchmark results
uses: actions/upload-artifact@v2
uses: actions/upload-artifact@v3
with:
name: benchmark-results-${{ github.event.inputs.jdk }}-${{ matrix.split }}
path: benchmark-results-${{ github.event.inputs.jdk }}.tar
name: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}-${{ matrix.split }}
path: benchmark-results-${{ github.event.inputs.jdk }}-${{ github.event.inputs.scala }}.tar
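Taken together, the benchmark.yml changes thread the new scala input from workflow_dispatch through the sbt invocation and into the artifact name, so 2.12 and 2.13 runs produce separately named results. Below is a condensed, hypothetical workflow showing just that flow; the real benchmark.yml above carries many more inputs, profiles and steps.

# Condensed sketch of how the scala input flows through the job; not the real workflow.
name: benchmark-scala-sketch
on:
  workflow_dispatch:
    inputs:
      scala:
        description: 'Scala version: 2.12 or 2.13'
        required: true
        default: '2.12'

jobs:
  benchmark:
    runs-on: ubuntu-20.04
    steps:
      - uses: actions/checkout@v3
      - name: Package with the requested Scala version
        run: |
          dev/change-scala-version.sh ${{ github.event.inputs.scala }}
          ./build/sbt -Pscala-${{ github.event.inputs.scala }} Test/package
          # Bundle whatever results the run produced (placeholder file here).
          tar -cf benchmark-results-${{ github.event.inputs.scala }}.tar README.md
      - name: Upload benchmark results
        uses: actions/upload-artifact@v3
        with:
          # The Scala version in the name keeps 2.12 and 2.13 artifacts separate.
          name: benchmark-results-${{ github.event.inputs.scala }}
          path: benchmark-results-${{ github.event.inputs.scala }}.tar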


0 comments on commit cf294bd
