From 4e9338554479599cbc6b7fa2af2fa25b75f9c074 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sat, 16 Mar 2024 19:37:13 -0700
Subject: [PATCH 01/37] Update git ignore.

---
 .gitignore | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.gitignore b/.gitignore
index 7b3e525..3f1b383 100644
--- a/.gitignore
+++ b/.gitignore
@@ -85,3 +85,7 @@ spark_expectations_sample_rules.json
 # more python
 pyspark_venv.tar.gz
 pyspark_venv/
+
+# accel stuff
+accelerators/*.jar
+accelerators/arrow-datafusion-comet
\ No newline at end of file

From b0c548027b139070aefeb0496c4bce967fd6bc25 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sat, 16 Mar 2024 19:38:23 -0700
Subject: [PATCH 02/37] Start adding some shims for running accelerators.

---
 accelerators/gluten_config.properties |  5 +++++
 accelerators/run_gluten.sh            |  1 +
 accelerators/setup.sh                 | 11 +++++++++++
 3 files changed, 17 insertions(+)
 create mode 100644 accelerators/gluten_config.properties
 create mode 100644 accelerators/run_gluten.sh
 create mode 100644 accelerators/setup.sh

diff --git a/accelerators/gluten_config.properties b/accelerators/gluten_config.properties
new file mode 100644
index 0000000..eab3946
--- /dev/null
+++ b/accelerators/gluten_config.properties
@@ -0,0 +1,5 @@
+spark.plugins=io.glutenproject.GlutenPlugin
+spark.memory.offHeap.enabled=true
+spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager
+# This static allocation is one of the hardest parts of using Gluten
+spark.memory.offHeap.size=20g

diff --git a/accelerators/run_gluten.sh b/accelerators/run_gluten.sh
new file mode 100644
index 0000000..b646f55
--- /dev/null
+++ b/accelerators/run_gluten.sh
@@ -0,0 +1 @@
+${SPARK_HOME}/bin/spark-shell --master local --jars ${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar --spark-properties=gluten_config.properties

diff --git a/accelerators/setup.sh b/accelerators/setup.sh
new file mode 100644
index 0000000..ea8b91c
--- /dev/null
+++ b/accelerators/setup.sh
@@ -0,0 +1,11 @@
+ACCEL_JARS=./
+SPARK_MAJOR_VERSION=3.4
+
+if [ ! -f "${GLUTEN_JAR}" ]; then
+  wget https://github.com/apache/incubator-gluten/releases/download/v1.1.1/gluten-velox-bundle-spark3.4_2.12-1.1.1.jar
+fi
+if [ ! -d arrow-datafusion-comet ]; then
+  git clone https://github.com/apache/incubator-gluten.git
+  cd arrow-datafusion-comet
+  make all PROFILES="-Pspark-3.4"
+fi

From 8c547e588e895e82cbe9143053bcae6d214966a7 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Wed, 22 Nov 2023 20:10:16 -0800
Subject: [PATCH 03/37] Try and set up velox

Try and fix dependency setup issue.
OK, if we can't install libgoogle-glog, let's see if we can still build.
Try explicitly installing libunwind-dev as suggested by
https://github.com/kadalu-tech/pkgs/pull/2/files#r1001042597
OK, try and make velox optional.
Get the build to include our Gluten UDF if and only if gluten is present.
Refactor the gluten UDF build to produce a separate object file so we can
conditionally do things based on that.
Start work to selectively integrate gluten into the examples.
Add script to set up gluten.
Make some progress on integrating gluten into our examples.
Simplify build options.
Start adding a Gluten 3.4 example, but it only works on old Ubuntu anyways.
Give up on Gluten on modern systems because I don't have time for that.
But maybe we can get it to work with ClickHouse.
Style fixes.
Work on trying to get something with gluten to run.
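For context, the properties file added in PATCH 02 is meant to be fed to a stock Spark launcher. A minimal sketch of the kind of launch this series is working toward, assuming a local Spark 3.4 install and the bundle jar fetched by setup.sh (the jar path here is illustrative, and note that plain spark-shell exposes --properties-file rather than a --spark-properties flag):

${SPARK_HOME}/bin/spark-shell \
  --master 'local[4]' \
  --jars ./gluten-velox-bundle-spark3.4_2.12-1.1.1.jar \
  --properties-file accelerators/gluten_config.properties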
bloop --- .github/workflows/ci.yml | 24 ++++++ .gitignore | 7 +- c | 2 + env_setup.sh | 0 gluten_spark_34_ex.sh | 34 ++++++++ native/src/CMakeLists.txt | 23 +++++- native/src/c/gluten/GlutenUDF.cpp | 82 +++++++++++++++++++ run_sql_examples.sh | 50 +++++++++-- setup_gluten.sh | 23 ++++++ sql/gluten_only_nonpartitioned_table_join.sql | 12 +++ 10 files changed, 250 insertions(+), 7 deletions(-) create mode 100644 c mode change 100644 => 100755 env_setup.sh create mode 100755 gluten_spark_34_ex.sh create mode 100644 native/src/c/gluten/GlutenUDF.cpp create mode 100755 setup_gluten.sh create mode 100644 sql/gluten_only_nonpartitioned_table_join.sql diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 9392efc..3d865d3 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,6 +64,30 @@ jobs: - name: Run sql examples run: ./run_sql_examples.sh + run-gluten-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched + - name: Setup gluten + run: + ./setup_gluten.sh + - name: Run sql examples w/ gluten + run: + ./run_sql_examples.sh run-target-examples: runs-on: ubuntu-latest steps: diff --git a/.gitignore b/.gitignore index 3f1b383..babcd0a 100644 --- a/.gitignore +++ b/.gitignore @@ -88,4 +88,9 @@ pyspark_venv/ # accel stuff accelerators/*.jar -accelerators/arrow-datafusion-comet \ No newline at end of file +accelerators/arrow-datafusion-comet +# ignore gluten +gluten +gluten*.jar +spark-3*hadoop*/ +spark-3*hadoop*.tgz diff --git a/c b/c new file mode 100644 index 0000000..cb4d93b --- /dev/null +++ b/c @@ -0,0 +1,2 @@ +bloop + diff --git a/env_setup.sh b/env_setup.sh old mode 100644 new mode 100755 diff --git a/gluten_spark_34_ex.sh b/gluten_spark_34_ex.sh new file mode 100755 index 0000000..0da6f52 --- /dev/null +++ b/gluten_spark_34_ex.sh @@ -0,0 +1,34 @@ +#!/bin/bash + +set -ex + +# Note: this does not work on Ubuntu 23, only on 22 +# You might get something like: +# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 + + +SPARK_VERSION=3.4.2 +HADOOP_VERSION=3 +SPARK_DIR=spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} +SPARK_FILE=${SPARK_DIR}.tgz + +if [ ! -d ${SPARK_DIR} ]; then + wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE} + tar -xvf ${SPARK_FILE} +fi + +GLUTEN_JAR=gluten-velox-bundle-spark3.4_2.12-1.1.0.jar + +if [ ! 
-f ${GLUTEN_JAR} ]; then + wget https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR} +fi + +SPARK_HOME=${SPARK_DIR} +export SPARK_HOME +PATH=$(pwd)/${SPARK_DIR}/bin:$PATH +spark-sql --master local[5] \ + --conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --jars ${GLUTEN_JAR} \ + -e "SELECT 1" diff --git a/native/src/CMakeLists.txt b/native/src/CMakeLists.txt index 04acf78..e976645 100644 --- a/native/src/CMakeLists.txt +++ b/native/src/CMakeLists.txt @@ -18,6 +18,28 @@ set(PROJECT_VERSION_MAJOR 0) set(PROJECT_VERSION_MINOR 0) set(PROJECT_VERSION_PATCH 0) +set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) + +#tag::velox[] +set (GLUTEN_LIB_NAME ${PROJECT_NAME}-gluten-${PROJECT_VERSION_MAJOR}) +# For gluten+velox, you can leave out if not using gluten +set(GLUTEN_HOME ../../gluten) +set(CMAKE_FIND_DEBUG_MODE TRUE) +find_library(VELOX_LIBRARY NAMES velox HINTS + ${GLUTEN_HOME}/cpp/build/releases NO_DEFAULT_PATH) +# End gluten specific + +if(VELOX_LIBRARY) + file(GLOB GLUTEN_UDF_FILES + "./c/gluten/*.cpp") + add_library(${GLUTEN_LIB_NAME} SHARED ${GLUTEN_UDF_FILES}) + target_include_directories(${GLUTEN_LIB_NAME} PRIVATE ${GLUTEN_HOME}/cpp ${GLUTEN_HOME}/ep/build-velox/build/velox_ep) + target_link_libraries(${GLUTEN_LIB_NAME} PRIVATE ${VELOX_LIBRARY}) +else() + message(WARNING "Velox library not found. Specific path not added.") +endif() +#end::velox[] + # Setup JNI find_package(JNI REQUIRED) if (JNI_FOUND) @@ -45,6 +67,5 @@ file(GLOB LIB_SRC # Setup installation targets # (required by sbt-jni) major version should always be appended to library name # -set (LIB_NAME ${PROJECT_NAME}${PROJECT_VERSION_MAJOR}) add_library(${LIB_NAME} SHARED ${LIB_SRC}) install(TARGETS ${LIB_NAME} LIBRARY DESTINATION .) diff --git a/native/src/c/gluten/GlutenUDF.cpp b/native/src/c/gluten/GlutenUDF.cpp new file mode 100644 index 0000000..14019f4 --- /dev/null +++ b/native/src/c/gluten/GlutenUDF.cpp @@ -0,0 +1,82 @@ +// Filename MyUDF.cpp + +#include +#include +#include + + +namespace { +using namespace facebook::velox; + +template +class PlusConstantFunction : public exec::VectorFunction { + public: + explicit PlusConstantFunction(int32_t addition) : addition_(addition) {} + + void apply( + const SelectivityVector& rows, + std::vector& args, + const TypePtr& /* outputType */, + exec::EvalCtx& context, + VectorPtr& result) const override { + using nativeType = typename TypeTraits::NativeType; + VELOX_CHECK_EQ(args.size(), 1); + + auto& arg = args[0]; + + // The argument may be flat or constant. 
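+    // A flat vector stores one value per row, while a constant vector stores
+    // a single value shared by every row; that is why the constant branch
+    // below reads valueAt(0) once instead of walking a raw values buffer.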
+ VELOX_CHECK(arg->isFlatEncoding() || arg->isConstantEncoding()); + + BaseVector::ensureWritable(rows, createScalarType(), context.pool(), result); + + auto* flatResult = result->asFlatVector(); + auto* rawResult = flatResult->mutableRawValues(); + + flatResult->clearNulls(rows); + + if (arg->isConstantEncoding()) { + auto value = arg->as>()->valueAt(0); + rows.applyToSelected([&](auto row) { rawResult[row] = value + addition_; }); + } else { + auto* rawInput = arg->as>()->rawValues(); + + rows.applyToSelected([&](auto row) { rawResult[row] = rawInput[row] + addition_; }); + } + } + + private: + const int32_t addition_; +}; + +static std::vector> integerSignatures() { + // integer -> integer + return {exec::FunctionSignatureBuilder().returnType("integer").argumentType("integer").build()}; +} + +static std::vector> bigintSignatures() { + // bigint -> bigint + return {exec::FunctionSignatureBuilder().returnType("bigint").argumentType("bigint").build()}; +} + +} // namespace + +const int kNumMyUdf = 2; +gluten::UdfEntry myUdf[kNumMyUdf] = {{"myudf1", "integer"}, {"myudf2", "bigint"}}; + +DEFINE_GET_NUM_UDF { + return kNumMyUdf; +} + +DEFINE_GET_UDF_ENTRIES { + for (auto i = 0; i < kNumMyUdf; ++i) { + udfEntries[i] = myUdf[i]; + } +} + +DEFINE_REGISTER_UDF { + facebook::velox::exec::registerVectorFunction( + "myudf1", integerSignatures(), std::make_unique>(5)); + facebook::velox::exec::registerVectorFunction( + "myudf2", bigintSignatures(), std::make_unique>(5)); + std::cout << "registered myudf1, myudf2" << std::endl; +} diff --git a/run_sql_examples.sh b/run_sql_examples.sh index c054b31..f3b25f3 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -4,9 +4,34 @@ set -o pipefail source env_setup.sh +# Check if we gluten and gluten UDFs present +GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so +NATIVE_LIB_DIR=./native/src/ +NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" +GLUTEN_HOME=./gluten +if [ -d ${GLUTEN_HOME} ]; then + GLUTEN_EXISTS="true" + gluten_jvm_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-velox-bundle-spark3.5_2.12-ubuntu_*-*-SNAPSHOT.jar) #TBD + gluten_jvm_package_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-package*-*-SNAPSHOT.jar) + GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ + --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" + # Enable UDF seperately. 
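+  # The native UDF library is shipped to the executors via --files and loaded
+  # by Velox through spark.gluten.sql.columnar.backend.velox.udfLibraryPaths,
+  # so the .so only has to exist locally at submit time.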
+ if [ -f "${NATIVE_LIB_PATH}" ]; then + GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ + --jars ${gluten_jvm_jar},${gluten_jvm_package_jar} \ + --conf spark.jars=${gluten_jvm_jar} \ + --conf spark.gluten.loadLibFromJar=true \ + --files ${NATIVE_LIB_PATH}" + fi +fi + function run_example () { local sql_file="$1" - # shellcheck disable=SC2046 + local extra="$2" + # shellcheck disable=SC2046,SC2086 spark-sql --master local[5] \ --conf spark.eventLog.enabled=true \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ @@ -15,6 +40,7 @@ function run_example () { --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ --conf spark.sql.catalog.local.type=hadoop \ --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ + ${extra} \ $(cat "${sql_file}.conf" || echo "") \ --name "${sql_file}" \ -f "${sql_file}" | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" @@ -25,12 +51,26 @@ function run_example () { # ${SPARK_PATH}/sbin/start-history-server.sh if [ $# -eq 1 ]; then - run_example "sql/$1" + if [[ "$1" != *"gluten_only"* ]]; then + run_example "sql/$1" + else + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" "$GLUTEN_SPARK_EXTRA" + fi else # For each SQL for sql_file in sql/*.sql; do - echo "Processing ${sql_file}" - # shellcheck disable=SC2046 - run_example "$sql_file" + if [[ "$sql_file" != *"gluten_only"* ]]; then + echo "Processing ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" + elif [[ "$GLUTEN_EXISTS" == "true" ]]; then + echo "Processing gluten ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" "$GLUTEN_SPARK_EXTRA" + else + echo "Skipping $sql_file since we did not find gluten and this is a gluten only example." + fi done fi diff --git a/setup_gluten.sh b/setup_gluten.sh new file mode 100755 index 0000000..8b085cc --- /dev/null +++ b/setup_gluten.sh @@ -0,0 +1,23 @@ +#!/bin/bash +set -ex + +# Setup deps +sudo apt-get update && sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev libsodium-dev libsnappy-dev nasm && sudo apt install -y libunwind-dev && sudo apt-get install -y libgoogle-glog-dev && sudo apt-get -y install docker-compose + +# Try gluten w/clickhouse +#if [ ! -d gluten ]; then +# git clone https://github.com/oap-project/gluten.git +# cd gluten +# bash ./ep/build-clickhouse/src/build_clickhouse.sh +#fi + +# Build gluten +if [ ! -d gluten ]; then + # We need Spark 3.5 w/scala212 + git clone git@github.com:holdenk/gluten.git || git clone https://github.com/holdenk/gluten.git + cd gluten + git checkout add-spark35-scala213-hack + ./dev/builddeps-veloxbe.sh + mvn clean package -Pbackends-velox -Pspark-3.5 -DskipTests + cd .. 
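+  # Note: the -Pspark-3.5 profile above has to match the Spark version the
+  # examples run against; Gluten bundles are built per Spark minor version.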
+fi diff --git a/sql/gluten_only_nonpartitioned_table_join.sql b/sql/gluten_only_nonpartitioned_table_join.sql new file mode 100644 index 0000000..572437c --- /dev/null +++ b/sql/gluten_only_nonpartitioned_table_join.sql @@ -0,0 +1,12 @@ +CREATE TABLE IF NOT EXISTS local.udevelopers ( + username string, + firstname string, + lastname string) +USING iceberg; +CREATE TABLE IF NOT EXISTS local.uprojects ( + creator string, + uprojectname string) +USING iceberg; +INSERT INTO local.udevelopers VALUES("krisnova", "Kris", "Nova"); +INSERT INTO local.uprojects VALUES("krisnova", "aurae"); +SELECT * FROM local.udevelopers INNER JOIN local.uprojects ON local.uprojects.creator = local.udevelopers.username; From a14aa7b4d46f4afc4cbfdb0a2902c40c4fbd2658 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 17 Mar 2024 20:33:46 -0700 Subject: [PATCH 04/37] Update accel stuff --- .github/workflows/ci.yml | 48 +++++++++---------- accelerators/gluten_spark_34_ex.sh | 13 +++++ accelerators/setup.sh | 23 +++++++++ .../setup_gluten_from_src.sh | 0 gluten_spark_34_ex.sh | 34 ------------- 5 files changed, 60 insertions(+), 58 deletions(-) create mode 100755 accelerators/gluten_spark_34_ex.sh rename setup_gluten.sh => accelerators/setup_gluten_from_src.sh (100%) delete mode 100755 gluten_spark_34_ex.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 3d865d3..5747679 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,30 +64,30 @@ jobs: - name: Run sql examples run: ./run_sql_examples.sh - run-gluten-sql-examples: - runs-on: ubuntu-latest - steps: - - name: Checkout - uses: actions/checkout@v2 - - name: Cache Spark and friends - uses: actions/cache@v3 - with: - path: | - spark*.tgz - iceberg*.jar - key: spark-artifacts - - name: Cache Data - uses: actions/cache@v3 - with: - path: | - data/fetched/* - key: data-fetched - - name: Setup gluten - run: - ./setup_gluten.sh - - name: Run sql examples w/ gluten - run: - ./run_sql_examples.sh +# run-gluten-sql-examples: +# runs-on: ubuntu-latest +# steps: +# - name: Checkout +# uses: actions/checkout@v2 +# - name: Cache Spark and friends +# uses: actions/cache@v3 +# with: +# path: | +# spark*.tgz +# iceberg*.jar +# key: spark-artifacts +# - name: Cache Data +# uses: actions/cache@v3 +# with: +# path: | +# data/fetched/* +# key: data-fetched +# - name: Setup gluten +# run: +# ./setup_gluten.sh +# - name: Run sql examples w/ gluten +# run: +# ./run_sql_examples.sh run-target-examples: runs-on: ubuntu-latest steps: diff --git a/accelerators/gluten_spark_34_ex.sh b/accelerators/gluten_spark_34_ex.sh new file mode 100755 index 0000000..3f98c6e --- /dev/null +++ b/accelerators/gluten_spark_34_ex.sh @@ -0,0 +1,13 @@ +#!/bin/bash + +source setup.sh + +SPARK_HOME=${SPARK_DIR} +export SPARK_HOME +PATH=$(pwd)/${SPARK_DIR}/bin:$PATH +spark-sql --master local[5] \ + --conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --jars ${GLUTEN_JAR} \ + -e "SELECT 1" diff --git a/accelerators/setup.sh b/accelerators/setup.sh index ea8b91c..5f9cf4e 100644 --- a/accelerators/setup.sh +++ b/accelerators/setup.sh @@ -1,6 +1,29 @@ ACCEL_JARS=./ SPARK_MAJOR_VERSION=3.4 +set -ex + +# Note: this does not work on Ubuntu 23, only on 22 +# You might get something like: +# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 + + +SPARK_VERSION=3.4.2 +HADOOP_VERSION=3 
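+# Derive the upstream tarball name (e.g. spark-3.4.2-bin-hadoop3) so the
+# download below can be skipped when the unpacked directory already exists.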
+SPARK_DIR=spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} +SPARK_FILE=${SPARK_DIR}.tgz + +if [ ! -d ${SPARK_DIR} ]; then + wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE} + tar -xvf ${SPARK_FILE} +fi + +GLUTEN_JAR=gluten-velox-bundle-spark3.4_2.12-1.1.0.jar + +if [ ! -f ${GLUTEN_JAR} ]; then + wget https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR} +fi + if [ ! -f "${GLUTEN_JAR}" ]; then wget https://github.com/apache/incubator-gluten/releases/download/v1.1.1/gluten-velox-bundle-spark3.4_2.12-1.1.1.jar fi diff --git a/setup_gluten.sh b/accelerators/setup_gluten_from_src.sh similarity index 100% rename from setup_gluten.sh rename to accelerators/setup_gluten_from_src.sh diff --git a/gluten_spark_34_ex.sh b/gluten_spark_34_ex.sh deleted file mode 100755 index 0da6f52..0000000 --- a/gluten_spark_34_ex.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash - -set -ex - -# Note: this does not work on Ubuntu 23, only on 22 -# You might get something like: -# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 - - -SPARK_VERSION=3.4.2 -HADOOP_VERSION=3 -SPARK_DIR=spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} -SPARK_FILE=${SPARK_DIR}.tgz - -if [ ! -d ${SPARK_DIR} ]; then - wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE} - tar -xvf ${SPARK_FILE} -fi - -GLUTEN_JAR=gluten-velox-bundle-spark3.4_2.12-1.1.0.jar - -if [ ! -f ${GLUTEN_JAR} ]; then - wget https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR} -fi - -SPARK_HOME=${SPARK_DIR} -export SPARK_HOME -PATH=$(pwd)/${SPARK_DIR}/bin:$PATH -spark-sql --master local[5] \ - --conf spark.plugins=io.glutenproject.GlutenPlugin \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=5g \ - --jars ${GLUTEN_JAR} \ - -e "SELECT 1" From 8c90b729e1037fcfa934a45fcbe2727fbfacf722 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Sun, 17 Mar 2024 21:14:05 -0700 Subject: [PATCH 05/37] Get Gluten + Spark3.4 to party (note: this fails because of Gluten segfault) re-enable gluten-sql-ex Add cache accel cache. 
Lets go for 3.5.1 Update shell for style Fix gluten jar dl Add full path for SPARK_PATH Only use pre-built for 20.04 Build deps with sudo Ignore incubator gluten More work getting gluten and comet --- .github/workflows/ci.yml | 74 +++++++++++++++++--------- .gitignore | 1 + accelerators/comet_env_setup.sh | 8 +++ accelerators/comet_ex.sh | 6 +++ accelerators/gluten_env_setup.sh | 44 +++++++++++++++ accelerators/gluten_spark_34_ex.sh | 19 +++++-- accelerators/install_rust_if_needed.sh | 9 ++++ accelerators/run_gluten.sh | 4 +- accelerators/setup.sh | 34 ------------ accelerators/setup_comet.sh | 16 ++++++ accelerators/setup_gluten_deps.sh | 8 +++ accelerators/setup_gluten_from_src.sh | 2 +- accelerators/setup_gluten_spark34.sh | 51 ++++++++++++++++++ env_setup.sh | 5 +- run_sql_examples.sh | 42 ++++----------- target-validator/runme.sh | 2 +- 16 files changed, 225 insertions(+), 100 deletions(-) create mode 100644 accelerators/comet_env_setup.sh create mode 100755 accelerators/comet_ex.sh create mode 100755 accelerators/gluten_env_setup.sh create mode 100644 accelerators/install_rust_if_needed.sh mode change 100644 => 100755 accelerators/run_gluten.sh delete mode 100644 accelerators/setup.sh create mode 100755 accelerators/setup_comet.sh create mode 100755 accelerators/setup_gluten_deps.sh create mode 100755 accelerators/setup_gluten_spark34.sh diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 5747679..1bbcb25 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -64,30 +64,48 @@ jobs: - name: Run sql examples run: ./run_sql_examples.sh -# run-gluten-sql-examples: -# runs-on: ubuntu-latest -# steps: -# - name: Checkout -# uses: actions/checkout@v2 -# - name: Cache Spark and friends -# uses: actions/cache@v3 -# with: -# path: | -# spark*.tgz -# iceberg*.jar -# key: spark-artifacts -# - name: Cache Data -# uses: actions/cache@v3 -# with: -# path: | -# data/fetched/* -# key: data-fetched -# - name: Setup gluten -# run: -# ./setup_gluten.sh -# - name: Run sql examples w/ gluten -# run: -# ./run_sql_examples.sh + run-gluten-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched + - name: Run gluten + run: + cd accelerators; ./gluten_spark_34_ex.sh + run-comet-sql-examples: + runs-on: ubuntu-latest + steps: + - name: Checkout + uses: actions/checkout@v2 + - name: Cache Spark and friends + uses: actions/cache@v3 + with: + path: | + spark*.tgz + iceberg*.jar + key: spark-artifacts + - name: Cache Data + uses: actions/cache@v3 + with: + path: | + data/fetched/* + key: data-fetched + - name: Run comet + run: + cd accelerators; ./comet_ex.sh run-target-examples: runs-on: ubuntu-latest steps: @@ -100,6 +118,12 @@ jobs: spark*.tgz iceberg*.jar key: spark-artifacts + - name: Cache Accel + uses: actions/cache@v3 + with: + path: | + accelerators/*.jar + key: accelerators-artifacts - name: Cache Data uses: actions/cache@v3 with: @@ -138,7 +162,7 @@ jobs: - name: Shellcheck run: | sudo apt-get install -y shellcheck - shellcheck $(find -name "*.sh") + shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh") - name: Setup JDK uses: actions/setup-java@v3 with: diff --git a/.gitignore b/.gitignore index babcd0a..8d1365c 100644 --- a/.gitignore +++ b/.gitignore @@ -94,3 +94,4 @@ gluten gluten*.jar 
spark-3*hadoop*/ spark-3*hadoop*.tgz +accelerators/incubator-gluten diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh new file mode 100644 index 0000000..f6a5050 --- /dev/null +++ b/accelerators/comet_env_setup.sh @@ -0,0 +1,8 @@ +#!/bin/bash + +SPARK_EXTRA="--jars ${COMET_JAR} \ +--conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ +--conf spark.comet.enabled=true \ +--conf spark.comet.exec.enabled=true \ +--conf spark.comet.exec.all.enabled=true" +export SPARK_EXTRA diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh new file mode 100755 index 0000000..1f95323 --- /dev/null +++ b/accelerators/comet_ex.sh @@ -0,0 +1,6 @@ +#!/bin/bash + +set -ex +source setup_comet.sh +source comet_env_setup.sh +USE_COMET="true" ../run_sql_examples.sh diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh new file mode 100755 index 0000000..ca06eb8 --- /dev/null +++ b/accelerators/gluten_env_setup.sh @@ -0,0 +1,44 @@ +#!/bin/bash + +# Check if we gluten and gluten UDFs present +GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so +NATIVE_LIB_DIR=$(pwd)/../native/src/ +NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" +GLUTEN_HOME=incubator-gluten +source /etc/lsb-release +if [ "$SPARK_MAJOR" == "3.4" && "$DISTRIB_RELEASE" == "20.04" ]; then + GLUTEN_EXISTS="true" + gluten_jvm_jar=$(ls accelerators/gluten-velox-bundle-spark3.4_2.12-1.1.0.jar) + GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ + --jars ${gluten_jvm_jar}" +else + if [ -d ${GLUTEN_HOME} ]; then + GLUTEN_EXISTS="true" + gluten_jvm_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-velox-bundle-spark3.5_2.12-ubuntu_*-*-SNAPSHOT.jar) #TBD + gluten_jvm_package_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-package*-*-SNAPSHOT.jar) + GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ + --jars ${gluten_jvm_jar},${gluten_jvm_package_jar} \ + --conf spark.memory.offHeap.enabled=true \ + --conf spark.memory.offHeap.size=5g \ + --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager" + # Enable UDF seperately. 
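+    # The UDF-specific flags are only appended below when the compiled native
+    # library actually exists, so a missing .so degrades to plain Gluten
+    # instead of failing at launch.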
+ fi +fi +if [ -f "${NATIVE_LIB_PATH}" ]; then + if [ "$GLUTEN_EXISTS" == "true" ]; then + GLUTEN_UDF_EXISTS="true" + GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ + --conf spark.jars=${gluten_jvm_jar} \ + --conf spark.gluten.loadLibFromJar=true \ + --files ${NATIVE_LIB_PATH} \ + --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" + fi +fi +SPARK_EXTRA=GLUTEN_SPARK_EXTRA + +export SPARK_EXTRA +export GLUTEN_UDF_EXISTS +export GLUTEN_EXISTS diff --git a/accelerators/gluten_spark_34_ex.sh b/accelerators/gluten_spark_34_ex.sh index 3f98c6e..0f98ab8 100755 --- a/accelerators/gluten_spark_34_ex.sh +++ b/accelerators/gluten_spark_34_ex.sh @@ -1,13 +1,22 @@ #!/bin/bash -source setup.sh +set -ex + +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +cd "${SCRIPT_DIR}" +source "${SCRIPT_DIR}/setup_gluten_spark34.sh" -SPARK_HOME=${SPARK_DIR} export SPARK_HOME -PATH=$(pwd)/${SPARK_DIR}/bin:$PATH -spark-sql --master local[5] \ +PATH="$(pwd)/${SPARK_DIR}/bin:$PATH" +export PATH +"${SPARK_HOME}/bin/spark-sql" --master local[5] \ --conf spark.plugins=io.glutenproject.GlutenPlugin \ --conf spark.memory.offHeap.enabled=true \ --conf spark.memory.offHeap.size=5g \ - --jars ${GLUTEN_JAR} \ + --jars "${GLUTEN_JAR}" \ + --conf spark.eventLog.enabled=true \ -e "SELECT 1" + +source gluten_env_setup.sh +cd .. +./run_sql_examples.sh || echo "Expected to fail" diff --git a/accelerators/install_rust_if_needed.sh b/accelerators/install_rust_if_needed.sh new file mode 100644 index 0000000..4858508 --- /dev/null +++ b/accelerators/install_rust_if_needed.sh @@ -0,0 +1,9 @@ +#!/bin/bash +if [ -f $HOME/.cargo/env ]; then + source $HOME/.cargo/env +fi + +if ! command -v cargo; then + curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y + source $HOME/.cargo/env +fi diff --git a/accelerators/run_gluten.sh b/accelerators/run_gluten.sh old mode 100644 new mode 100755 index b646f55..34ddb3b --- a/accelerators/run_gluten.sh +++ b/accelerators/run_gluten.sh @@ -1 +1,3 @@ -${SPARK_HOME}/bin/spark-shell --master local --jars ${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar --spark-properties=gluten_config.properties +#!/bin/bash + +"${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties diff --git a/accelerators/setup.sh b/accelerators/setup.sh deleted file mode 100644 index 5f9cf4e..0000000 --- a/accelerators/setup.sh +++ /dev/null @@ -1,34 +0,0 @@ -ACCEL_JARS=./ -SPARK_MAJOR_VERSION=3.4 - -set -ex - -# Note: this does not work on Ubuntu 23, only on 22 -# You might get something like: -# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 - - -SPARK_VERSION=3.4.2 -HADOOP_VERSION=3 -SPARK_DIR=spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION} -SPARK_FILE=${SPARK_DIR}.tgz - -if [ ! -d ${SPARK_DIR} ]; then - wget https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE} - tar -xvf ${SPARK_FILE} -fi - -GLUTEN_JAR=gluten-velox-bundle-spark3.4_2.12-1.1.0.jar - -if [ ! -f ${GLUTEN_JAR} ]; then - wget https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR} -fi - -if [ ! -f "${GLUTEN_JAR}" ]; then - wget https://github.com/apache/incubator-gluten/releases/download/v1.1.1/gluten-velox-bundle-spark3.4_2.12-1.1.1.jar -fi -if [ ! 
-d arrow-datafusion-comet ]; then - git clone https://github.com/apache/incubator-gluten.git - cd arrow-datafusion-comet - make all PROFILES="-Pspark-3.4" -fi diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh new file mode 100755 index 0000000..43eee38 --- /dev/null +++ b/accelerators/setup_comet.sh @@ -0,0 +1,16 @@ +#!/bin/bash + +set -ex + +source install_rust_if_needed.sh + +if [ ! -d arrow-datafusion-comet ]; then + git clone https://github.com/apache/arrow-datafusion-comet.git +fi + +if [ -z $(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar) ]; then + cd arrow-datafusion-comet + make clean release PROFILES="-Pspark-3.4" +fi +COMET_JAR="$(pwd)/$(ls incubator-comet/spark/target/comet-spark-spark*.jar)" +export COMET_JAR diff --git a/accelerators/setup_gluten_deps.sh b/accelerators/setup_gluten_deps.sh new file mode 100755 index 0000000..b805fd1 --- /dev/null +++ b/accelerators/setup_gluten_deps.sh @@ -0,0 +1,8 @@ +#!/bin/bash +set -ex + +sudo apt-get update +sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev libsodium-dev libsnappy-dev nasm && sudo apt install -y libunwind-dev +sudo apt-get install -y libgoogle-glog-dev +sudo apt-get -y install docker-compose +sudo apt-get install -y libre2-9 diff --git a/accelerators/setup_gluten_from_src.sh b/accelerators/setup_gluten_from_src.sh index 8b085cc..4788e05 100755 --- a/accelerators/setup_gluten_from_src.sh +++ b/accelerators/setup_gluten_from_src.sh @@ -2,7 +2,7 @@ set -ex # Setup deps -sudo apt-get update && sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev libsodium-dev libsnappy-dev nasm && sudo apt install -y libunwind-dev && sudo apt-get install -y libgoogle-glog-dev && sudo apt-get -y install docker-compose +source setup_gluten_deps.sh # Try gluten w/clickhouse #if [ ! -d gluten ]; then diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh new file mode 100755 index 0000000..b74ba81 --- /dev/null +++ b/accelerators/setup_gluten_spark34.sh @@ -0,0 +1,51 @@ +#!/bin/bash + +mkdir -p /tmp/spark-events +SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd ) +ACCEL_JARS=${SCRIPT_DIR} +SPARK_MAJOR_VERSION=3.4 +SCALA_VERSION=${SCALA_VERSION:-"2.12"} + +set -ex + +# Note: this does not work on Ubuntu 23, only on 22 +# You might get something like: +# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function, std::equal_to, std::allocator > > const&)>)+0x23 + + +SPARK_VERSION=3.4.2 +SPARK_MAJOR=3.4 +HADOOP_VERSION=3 +SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_FILE="${SPARK_DIR}.tgz" + +export SPARK_MAJOR +export SPARK_VERSION + +source setup_gluten_deps.sh + +cd .. +source /etc/lsb-release +# Pre-baked only +if [ "$DISTRIB_RELEASE" == "20.04" ]; then + source ./env_setup.sh + cd "${SCRIPT_DIR}" + + GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" + + export GLUTEN_JAR + + if [ ! 
-f "${GLUTEN_JAR_PATH}" ]; then + wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" & + fi + + wait +else + if [ ! -d incubator-gluten ]; then + git clone https://github.com/apache/incubator-gluten.git + fi + cd incubator-gluten + sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON + mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests +fi diff --git a/env_setup.sh b/env_setup.sh index 2f4e834..42722ac 100755 --- a/env_setup.sh +++ b/env_setup.sh @@ -3,10 +3,10 @@ # Download Spark and iceberg if not present SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} -SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.0"} +SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"} SCALA_VERSION=${SCALA_VERSION:-"2.12"} HADOOP_VERSION="3" -SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" +SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}" SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz" ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"} if [ ! -f "${SPARK_FILE}" ]; then @@ -18,6 +18,7 @@ if [ ! -f "${ICEBERG_FILE}" ]; then wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" & fi wait +sleep 1 # Setup the env if [ ! -d "${SPARK_PATH}" ]; then tar -xf "${SPARK_FILE}" diff --git a/run_sql_examples.sh b/run_sql_examples.sh index f3b25f3..2b50061 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -4,35 +4,11 @@ set -o pipefail source env_setup.sh -# Check if we gluten and gluten UDFs present -GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so -NATIVE_LIB_DIR=./native/src/ -NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" -GLUTEN_HOME=./gluten -if [ -d ${GLUTEN_HOME} ]; then - GLUTEN_EXISTS="true" - gluten_jvm_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-velox-bundle-spark3.5_2.12-ubuntu_*-*-SNAPSHOT.jar) #TBD - gluten_jvm_package_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-package*-*-SNAPSHOT.jar) - GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=5g \ - --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ - --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" - # Enable UDF seperately. 
- if [ -f "${NATIVE_LIB_PATH}" ]; then - GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ - --jars ${gluten_jvm_jar},${gluten_jvm_package_jar} \ - --conf spark.jars=${gluten_jvm_jar} \ - --conf spark.gluten.loadLibFromJar=true \ - --files ${NATIVE_LIB_PATH}" - fi -fi - function run_example () { local sql_file="$1" local extra="$2" # shellcheck disable=SC2046,SC2086 - spark-sql --master local[5] \ + ${SPARK_HOME}/bin/spark-sql --master local[5] \ --conf spark.eventLog.enabled=true \ --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ @@ -40,7 +16,7 @@ function run_example () { --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ --conf spark.sql.catalog.local.type=hadoop \ --conf "spark.sql.catalog.local.warehouse=$PWD/warehouse" \ - ${extra} \ + ${extra} ${SPARK_EXTRA} \ $(cat "${sql_file}.conf" || echo "") \ --name "${sql_file}" \ -f "${sql_file}" | tee -a "${sql_file}.out" || ls "${sql_file}.expected_to_fail" @@ -56,21 +32,25 @@ if [ $# -eq 1 ]; then else echo "Processing gluten ${sql_file}" # shellcheck disable=SC2046 - run_example "$sql_file" "$GLUTEN_SPARK_EXTRA" + run_example "$sql_file" fi else # For each SQL for sql_file in sql/*.sql; do - if [[ "$sql_file" != *"gluten_only"* ]]; then + if [[ "$sql_file" != *"_only"* ]]; then echo "Processing ${sql_file}" # shellcheck disable=SC2046 run_example "$sql_file" - elif [[ "$GLUTEN_EXISTS" == "true" ]]; then + elif [[ "$sql_file" != *"gluten_only"* && "$GLUTEN_EXISTS" == "true" ]]; then echo "Processing gluten ${sql_file}" # shellcheck disable=SC2046 - run_example "$sql_file" "$GLUTEN_SPARK_EXTRA" + run_example "$sql_file" + elif [[ "$sql_file" != *"gluten_udf_only"* && "$GLUTEN_UDF_EXISTS" == "true" ]]; then + echo "Processing gluten UDF ${sql_file}" + # shellcheck disable=SC2046 + run_example "$sql_file" else - echo "Skipping $sql_file since we did not find gluten and this is a gluten only example." + echo "Skipping $sql_file since we did not find gluten and this is restricted example." fi done fi diff --git a/target-validator/runme.sh b/target-validator/runme.sh index 52ebe14..b6236dd 100755 --- a/target-validator/runme.sh +++ b/target-validator/runme.sh @@ -15,4 +15,4 @@ sbt -Dspark="${SPARK_VERSION}" clean assembly JAR_PATH="$(pwd)/target/scala-2.12/data-validator-assembly-${SPARK_VERSION}_0.15.0.jar" export JAR_PATH cd .. -spark-submit --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." +"${SPARK_HOME}/bin/spark-submit" --master local "$JAR_PATH" --config ex.yaml || echo "Failed as expected." From 9e607f745e7c47e98d20a46436b30f99547fd121 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 11:08:50 -0700 Subject: [PATCH 06/37] Fix comet resolution --- accelerators/comet_ex.sh | 8 ++++++-- accelerators/setup_comet.sh | 5 ++--- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh index 1f95323..403eef9 100755 --- a/accelerators/comet_ex.sh +++ b/accelerators/comet_ex.sh @@ -1,6 +1,10 @@ #!/bin/bash - set -ex + +SPARK_MAJOR=3.4 +export SPARK_MAJOR + source setup_comet.sh source comet_env_setup.sh -USE_COMET="true" ../run_sql_examples.sh +cd .. 
+USE_COMET="true" ./run_sql_examples.sh diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh index 43eee38..5cffc34 100755 --- a/accelerators/setup_comet.sh +++ b/accelerators/setup_comet.sh @@ -1,7 +1,6 @@ #!/bin/bash set -ex - source install_rust_if_needed.sh if [ ! -d arrow-datafusion-comet ]; then @@ -10,7 +9,7 @@ fi if [ -z $(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar) ]; then cd arrow-datafusion-comet - make clean release PROFILES="-Pspark-3.4" + make clean release PROFILES="-Pspark-${SPARK_MAJOR}" fi -COMET_JAR="$(pwd)/$(ls incubator-comet/spark/target/comet-spark-spark*.jar)" +COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" export COMET_JAR From 0ac92c7802a1c9dcd7d328de007075ee3c6387aa Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 11:17:05 -0700 Subject: [PATCH 07/37] Multiple extensions (Iceberg and Comet) --- accelerators/comet_env_setup.sh | 9 ++++++++- run_sql_examples.sh | 6 +++++- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh index f6a5050..2018b89 100644 --- a/accelerators/comet_env_setup.sh +++ b/accelerators/comet_env_setup.sh @@ -1,8 +1,15 @@ #!/bin/bash SPARK_EXTRA="--jars ${COMET_JAR} \ ---conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions \ --conf spark.comet.enabled=true \ --conf spark.comet.exec.enabled=true \ --conf spark.comet.exec.all.enabled=true" +# Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set +# EXTRA_EXTENSIONS so it can be appended to iceberg +if [ -z "$EXTRA_EXTENSIONS" ]; then + EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions" +else + EXTRA_EXTENSIONS="org.apache.comet.CometSparkSessionExtensions,$EXTRA_EXTENSIONS" +fi +export EXTRA_EXTENSIONS export SPARK_EXTRA diff --git a/run_sql_examples.sh b/run_sql_examples.sh index 2b50061..d18a54f 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -7,10 +7,14 @@ source env_setup.sh function run_example () { local sql_file="$1" local extra="$2" + EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions + if [ ! 
-z "$EXTRA_EXTENSIONS" ]; then + EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" + fi # shellcheck disable=SC2046,SC2086 ${SPARK_HOME}/bin/spark-sql --master local[5] \ --conf spark.eventLog.enabled=true \ - --conf spark.sql.extensions=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions \ + --conf spark.sql.extensions=$EXTENSIONS \ --conf spark.sql.catalog.spark_catalog=org.apache.iceberg.spark.SparkSessionCatalog \ --conf spark.sql.catalog.spark_catalog.type=hive \ --conf spark.sql.catalog.local=org.apache.iceberg.spark.SparkCatalog \ From 1fe48976cb255a600cda1725a9555d8fa3436fc9 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 11:22:41 -0700 Subject: [PATCH 08/37] Turn on Comet shuffle --- accelerators/comet_env_setup.sh | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh index 2018b89..db01605 100644 --- a/accelerators/comet_env_setup.sh +++ b/accelerators/comet_env_setup.sh @@ -3,7 +3,10 @@ SPARK_EXTRA="--jars ${COMET_JAR} \ --conf spark.comet.enabled=true \ --conf spark.comet.exec.enabled=true \ ---conf spark.comet.exec.all.enabled=true" +--conf spark.comet.exec.all.enabled=true \ +--conf spark.shuffle.manager=org.apache.spark.sql.comet.execution.shuffle.CometShuffleManager \ +--conf spark.comet.exec.shuffle.enabled=true \ +--conf spark.comet.columnar.shuffle.enabled=true" # Instead of --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions we set # EXTRA_EXTENSIONS so it can be appended to iceberg if [ -z "$EXTRA_EXTENSIONS" ]; then From f4fe0396a523ca27dc7d5daa2cf2be1ae9a2c00c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 11:24:52 -0700 Subject: [PATCH 09/37] Style fixes --- accelerators/gluten_env_setup.sh | 2 +- accelerators/install_rust_if_needed.sh | 6 +++--- accelerators/setup_comet.sh | 2 +- run_sql_examples.sh | 2 +- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh index ca06eb8..ab9710d 100755 --- a/accelerators/gluten_env_setup.sh +++ b/accelerators/gluten_env_setup.sh @@ -6,7 +6,7 @@ NATIVE_LIB_DIR=$(pwd)/../native/src/ NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" GLUTEN_HOME=incubator-gluten source /etc/lsb-release -if [ "$SPARK_MAJOR" == "3.4" && "$DISTRIB_RELEASE" == "20.04" ]; then +if [ "$SPARK_MAJOR" == "3.4" ] && [ "$DISTRIB_RELEASE" == "20.04" ]; then GLUTEN_EXISTS="true" gluten_jvm_jar=$(ls accelerators/gluten-velox-bundle-spark3.4_2.12-1.1.0.jar) GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ diff --git a/accelerators/install_rust_if_needed.sh b/accelerators/install_rust_if_needed.sh index 4858508..76826e8 100644 --- a/accelerators/install_rust_if_needed.sh +++ b/accelerators/install_rust_if_needed.sh @@ -1,9 +1,9 @@ #!/bin/bash -if [ -f $HOME/.cargo/env ]; then - source $HOME/.cargo/env +if [ -f "$HOME/.cargo/env" ]; then + source "$HOME/.cargo/env" fi if ! command -v cargo; then curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y - source $HOME/.cargo/env + source "$HOME/.cargo/env" fi diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh index 5cffc34..08d7059 100755 --- a/accelerators/setup_comet.sh +++ b/accelerators/setup_comet.sh @@ -7,7 +7,7 @@ if [ ! 
-d arrow-datafusion-comet ]; then git clone https://github.com/apache/arrow-datafusion-comet.git fi -if [ -z $(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar) ]; then +if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then cd arrow-datafusion-comet make clean release PROFILES="-Pspark-${SPARK_MAJOR}" fi diff --git a/run_sql_examples.sh b/run_sql_examples.sh index d18a54f..9edd720 100755 --- a/run_sql_examples.sh +++ b/run_sql_examples.sh @@ -8,7 +8,7 @@ function run_example () { local sql_file="$1" local extra="$2" EXTENSIONS=org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions - if [ ! -z "$EXTRA_EXTENSIONS" ]; then + if [ -n "$EXTRA_EXTENSIONS" ]; then EXTENSIONS="$EXTENSIONS,$EXTRA_EXTENSIONS" fi # shellcheck disable=SC2046,SC2086 From d9d4a39a1d4b901bab54fa04cfeb6d73bf593e50 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:00:03 -0700 Subject: [PATCH 10/37] Use version 3.4.2 and also use setup rust action for speed --- .github/workflows/ci.yml | 5 +++++ accelerators/comet_ex.sh | 2 ++ 2 files changed, 7 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 1bbcb25..58bb17c 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -103,6 +103,11 @@ jobs: path: | data/fetched/* key: data-fetched + - name: Setup Rust + uses: actions-rs/toolchain@v1 + with: + toolchain: stable + override: true - name: Run comet run: cd accelerators; ./comet_ex.sh diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh index 403eef9..cd97d73 100755 --- a/accelerators/comet_ex.sh +++ b/accelerators/comet_ex.sh @@ -2,7 +2,9 @@ set -ex SPARK_MAJOR=3.4 +SPARK_VERSION=3.4.2 export SPARK_MAJOR +export SPARK_VERSION source setup_comet.sh source comet_env_setup.sh From c18210c7fb7b43d031e16da1aa79fe7ded8241ce Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:01:04 -0700 Subject: [PATCH 11/37] Seperate out setup comet so we can debug faster. --- .github/workflows/ci.yml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 58bb17c..f8d360f 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -108,6 +108,9 @@ jobs: with: toolchain: stable override: true + - name: Setup comet + run: + cd accelerators; ./setup_comet.sh - name: Run comet run: cd accelerators; ./comet_ex.sh From 9f790852bdaba22bf3b5acadf295c8bfba552b95 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:05:40 -0700 Subject: [PATCH 12/37] Setup jdk versions for happiness. 
--- .github/workflows/ci.yml | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index f8d360f..75767e4 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: with: distribution: temurin java-version: ${{ matrix.java }} - cache: sbt + cache: sbt, mvn - name: Scala Build and Test run: sbt clean package +test python-test: @@ -76,6 +76,12 @@ jobs: spark*.tgz iceberg*.jar key: spark-artifacts + - name: Setup JDK + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 17 + cache: sbt, mvn - name: Cache Data uses: actions/cache@v3 with: @@ -108,6 +114,12 @@ jobs: with: toolchain: stable override: true + - name: Setup JDK + uses: actions/setup-java@v3 + with: + distribution: temurin + java-version: 17 + cache: sbt, mvn - name: Setup comet run: cd accelerators; ./setup_comet.sh From f6e565b0697672b6c67b9102ccdda56db0ea6907 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:08:51 -0700 Subject: [PATCH 13/37] Change caching to make sense --- .github/workflows/ci.yml | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 75767e4..4ea8365 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,7 +26,7 @@ jobs: with: distribution: temurin java-version: ${{ matrix.java }} - cache: sbt, mvn + cache: sbt - name: Scala Build and Test run: sbt clean package +test python-test: @@ -81,7 +81,11 @@ jobs: with: distribution: temurin java-version: 17 - cache: sbt, mvn + - name: Cache Maven packages + uses: actions/cache@v2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-gluten - name: Cache Data uses: actions/cache@v3 with: @@ -109,6 +113,11 @@ jobs: path: | data/fetched/* key: data-fetched + - name: Cache Maven packages + uses: actions/cache@v2 + with: + path: ~/.m2 + key: ${{ runner.os }}-m2-comet - name: Setup Rust uses: actions-rs/toolchain@v1 with: @@ -119,7 +128,6 @@ jobs: with: distribution: temurin java-version: 17 - cache: sbt, mvn - name: Setup comet run: cd accelerators; ./setup_comet.sh From a11ec9668b0c5e4e53fe384a9518dd6d5f3c022c Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:37:17 -0700 Subject: [PATCH 14/37] Work around the classloader issue we found. --- accelerators/comet_env_setup.sh | 5 ++++- accelerators/comet_ex.sh | 5 ++++- env_setup.sh | 5 ++++- 3 files changed, 12 insertions(+), 3 deletions(-) diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh index db01605..b14b6ee 100644 --- a/accelerators/comet_env_setup.sh +++ b/accelerators/comet_env_setup.sh @@ -1,6 +1,9 @@ #!/bin/bash -SPARK_EXTRA="--jars ${COMET_JAR} \ +# Instead of using --jars ${COMET_JAR} we copy the comet JAR into the SPARK_HOME +# See https://github.com/apache/arrow-datafusion-comet/issues/221 for details +cp ${COMET_JAR} ${SPARK_HOME}/jars/ +SPARK_EXTRA=" --conf spark.comet.enabled=true \ --conf spark.comet.exec.enabled=true \ --conf spark.comet.exec.all.enabled=true \ diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh index cd97d73..c705371 100755 --- a/accelerators/comet_ex.sh +++ b/accelerators/comet_ex.sh @@ -7,6 +7,9 @@ export SPARK_MAJOR export SPARK_VERSION source setup_comet.sh +pushd .. +source ./env_setup.sh +popd source comet_env_setup.sh -cd .. +pushd .. 
USE_COMET="true" ./run_sql_examples.sh diff --git a/env_setup.sh b/env_setup.sh index 42722ac..42ed190 100755 --- a/env_setup.sh +++ b/env_setup.sh @@ -1,5 +1,6 @@ #!/bin/bash +set -ex # Download Spark and iceberg if not present SPARK_MAJOR=${SPARK_MAJOR:-"3.5"} @@ -24,7 +25,8 @@ if [ ! -d "${SPARK_PATH}" ]; then tar -xf "${SPARK_FILE}" fi -export SPARK_HOME="${SPARK_PATH}" +SPARK_HOME="${SPARK_PATH}" +export SPARK_HOME if [ ! -f "${SPARK_PATH}/jars/${ICEBERG_FILE}" ]; then # Delete the old JAR first. @@ -42,3 +44,4 @@ mkdir -p ./data/fetched/ if [ ! -f ./data/fetched/2021 ]; then wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021 fi + From 9f57e26eb2f94f29c99ef4bc56194247ed8c7972 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:40:44 -0700 Subject: [PATCH 15/37] shellcheck fix. --- accelerators/comet_env_setup.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh index b14b6ee..e930305 100644 --- a/accelerators/comet_env_setup.sh +++ b/accelerators/comet_env_setup.sh @@ -2,7 +2,7 @@ # Instead of using --jars ${COMET_JAR} we copy the comet JAR into the SPARK_HOME # See https://github.com/apache/arrow-datafusion-comet/issues/221 for details -cp ${COMET_JAR} ${SPARK_HOME}/jars/ +cp "${COMET_JAR}" "${SPARK_HOME}/jars/" SPARK_EXTRA=" --conf spark.comet.enabled=true \ --conf spark.comet.exec.enabled=true \ From 44e41b08a14fbe97cb7fe1a1bb4804c8b2e7ca05 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:56:04 -0700 Subject: [PATCH 16/37] Hmm why no version. --- accelerators/setup_comet.sh | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh index 08d7059..3153d10 100755 --- a/accelerators/setup_comet.sh +++ b/accelerators/setup_comet.sh @@ -3,6 +3,13 @@ set -ex source install_rust_if_needed.sh +if [ -z "${SPARK_MAJOR}" ]; then + echo "Need a spark major version specified." + exit 1 +else + echo "Building comet for Spark ${SPARK_MAJOR}" +fi + if [ ! -d arrow-datafusion-comet ]; then git clone https://github.com/apache/arrow-datafusion-comet.git fi From 4182d89f8f799eaafa6a9df73b9b84695bd38824 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 14:58:12 -0700 Subject: [PATCH 17/37] Fix version pass in for setup --- .github/workflows/ci.yml | 2 +- accelerators/comet_ex.sh | 3 ++- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 4ea8365..f7dbddb 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -130,7 +130,7 @@ jobs: java-version: 17 - name: Setup comet run: - cd accelerators; ./setup_comet.sh + cd accelerators; SPARK_MAJOR=3.4 ./setup_comet.sh - name: Run comet run: cd accelerators; ./comet_ex.sh diff --git a/accelerators/comet_ex.sh b/accelerators/comet_ex.sh index c705371..cd08177 100755 --- a/accelerators/comet_ex.sh +++ b/accelerators/comet_ex.sh @@ -1,7 +1,8 @@ #!/bin/bash set -ex -SPARK_MAJOR=3.4 +# If you change this update the workflow version too. 
+SPARK_MAJOR=${SPARK_MAJOR:-3.4} SPARK_VERSION=3.4.2 export SPARK_MAJOR export SPARK_VERSION From 905752239649f086d850cc91e1ded9f59c0b8745 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 15:48:36 -0700 Subject: [PATCH 18/37] Fix comet setup --- accelerators/setup_comet.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh index 3153d10..dbfadd5 100755 --- a/accelerators/setup_comet.sh +++ b/accelerators/setup_comet.sh @@ -17,6 +17,7 @@ fi if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then cd arrow-datafusion-comet make clean release PROFILES="-Pspark-${SPARK_MAJOR}" + cd .. fi COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)" export COMET_JAR From b7fc79727161babdffdf5e406dd34da871794de6 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 16:42:46 -0700 Subject: [PATCH 19/37] Try and fix gluten build --- accelerators/gluten_env_setup.sh | 17 ++--------------- accelerators/setup_gluten_spark34.sh | 13 ++++++++----- 2 files changed, 10 insertions(+), 20 deletions(-) diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh index ab9710d..9e21d43 100755 --- a/accelerators/gluten_env_setup.sh +++ b/accelerators/gluten_env_setup.sh @@ -6,26 +6,13 @@ NATIVE_LIB_DIR=$(pwd)/../native/src/ NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}" GLUTEN_HOME=incubator-gluten source /etc/lsb-release -if [ "$SPARK_MAJOR" == "3.4" ] && [ "$DISTRIB_RELEASE" == "20.04" ]; then +if [ -n "$GLUTEN_JAR_PATH" ]; then GLUTEN_EXISTS="true" - gluten_jvm_jar=$(ls accelerators/gluten-velox-bundle-spark3.4_2.12-1.1.0.jar) GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ --conf spark.memory.offHeap.enabled=true \ --conf spark.memory.offHeap.size=5g \ --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \ - --jars ${gluten_jvm_jar}" -else - if [ -d ${GLUTEN_HOME} ]; then - GLUTEN_EXISTS="true" - gluten_jvm_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-velox-bundle-spark3.5_2.12-ubuntu_*-*-SNAPSHOT.jar) #TBD - gluten_jvm_package_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-package*-*-SNAPSHOT.jar) - GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \ - --jars ${gluten_jvm_jar},${gluten_jvm_package_jar} \ - --conf spark.memory.offHeap.enabled=true \ - --conf spark.memory.offHeap.size=5g \ - --conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager" - # Enable UDF seperately. - fi + --jars ${GLUTEN_JAR_PATH}" fi if [ -f "${NATIVE_LIB_PATH}" ]; then if [ "$GLUTEN_EXISTS" == "true" ]; then diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh index b74ba81..d129f08 100755 --- a/accelerators/setup_gluten_spark34.sh +++ b/accelerators/setup_gluten_spark34.sh @@ -34,18 +34,21 @@ if [ "$DISTRIB_RELEASE" == "20.04" ]; then GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar" - export GLUTEN_JAR - if [ ! -f "${GLUTEN_JAR_PATH}" ]; then - wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" & + wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" || unset GLUTEN_JAR_PATH fi - wait -else +fi +# Rather than if/else we fall through to build if wget fails because major version is not supported. 
+if [ -z "$GLUTEN_JAR_PATH" ]; then if [ ! -d incubator-gluten ]; then git clone https://github.com/apache/incubator-gluten.git fi cd incubator-gluten sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests + GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" fi + +export GLUTEN_JAR_PATH + From dbff266d8943b70485a52fd1763de3ad1ed22481 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 16:46:47 -0700 Subject: [PATCH 20/37] Style fix and statically link --- accelerators/gluten_env_setup.sh | 2 +- accelerators/setup_gluten_spark34.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/accelerators/gluten_env_setup.sh b/accelerators/gluten_env_setup.sh index 9e21d43..6bda6ec 100755 --- a/accelerators/gluten_env_setup.sh +++ b/accelerators/gluten_env_setup.sh @@ -18,7 +18,7 @@ if [ -f "${NATIVE_LIB_PATH}" ]; then if [ "$GLUTEN_EXISTS" == "true" ]; then GLUTEN_UDF_EXISTS="true" GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \ - --conf spark.jars=${gluten_jvm_jar} \ + --conf spark.jars=${GLUTEN_JAR_PATH} \ --conf spark.gluten.loadLibFromJar=true \ --files ${NATIVE_LIB_PATH} \ --conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}" diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh index d129f08..826ca7f 100755 --- a/accelerators/setup_gluten_spark34.sh +++ b/accelerators/setup_gluten_spark34.sh @@ -45,7 +45,7 @@ if [ -z "$GLUTEN_JAR_PATH" ]; then git clone https://github.com/apache/incubator-gluten.git fi cd incubator-gluten - sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON + sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON --enable_vcpkg=ON mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar" fi From a9a01d39d64c2c41f4510d277c1264a76d6e50a2 Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 17:01:22 -0700 Subject: [PATCH 21/37] vcpkg --- accelerators/setup_gluten_spark34.sh | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh index 826ca7f..2de6ec3 100755 --- a/accelerators/setup_gluten_spark34.sh +++ b/accelerators/setup_gluten_spark34.sh @@ -41,6 +41,11 @@ if [ "$DISTRIB_RELEASE" == "20.04" ]; then fi # Rather than if/else we fall through to build if wget fails because major version is not supported. if [ -z "$GLUTEN_JAR_PATH" ]; then + if [ ! -d vcpkg ]; then + git clone https://github.com/microsoft/vcpkg + fi + cd vcpkg + ./vcpkg/bootstrap-vcpkg.sh if [ ! -d incubator-gluten ]; then git clone https://github.com/apache/incubator-gluten.git fi From 5ad5c3be116ee4ed039bdc3206f8adea9d83eb9b Mon Sep 17 00:00:00 2001 From: Holden Karau Date: Wed, 20 Mar 2024 18:04:51 -0700 Subject: [PATCH 22/37] Try and fix vcpkg --- accelerators/setup_gluten_spark34.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh index 2de6ec3..ec78c74 100755 --- a/accelerators/setup_gluten_spark34.sh +++ b/accelerators/setup_gluten_spark34.sh @@ -46,6 +46,7 @@ if [ -z "$GLUTEN_JAR_PATH" ]; then fi cd vcpkg ./vcpkg/bootstrap-vcpkg.sh + cd .. if [ ! 
        git clone https://github.com/apache/incubator-gluten.git
     fi

From 07229d6cdf9fdeafd007583c0a3cafcf8fcde2fd Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 17:09:09 -0700
Subject: [PATCH 23/37] meh vcpkg is kind of a pain, let's skip it.

---
 accelerators/setup_gluten_spark34.sh | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh
index ec78c74..d129f08 100755
--- a/accelerators/setup_gluten_spark34.sh
+++ b/accelerators/setup_gluten_spark34.sh
@@ -41,17 +41,11 @@ if [ "$DISTRIB_RELEASE" == "20.04" ]; then
 fi
 # Rather than if/else we fall through to build if wget fails because major version is not supported.
 if [ -z "$GLUTEN_JAR_PATH" ]; then
-    if [ ! -d vcpkg ]; then
-        git clone https://github.com/microsoft/vcpkg
-    fi
-    cd vcpkg
-    ./vcpkg/bootstrap-vcpkg.sh
-    cd ..
     if [ ! -d incubator-gluten ]; then
         git clone https://github.com/apache/incubator-gluten.git
     fi
     cd incubator-gluten
-    sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON --enable_vcpkg=ON
+    sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON
     mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests
     GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar"
 fi

From f716ef7a2dcaf7e1b73240a79b38832cde300f5b Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 17:45:35 -0700
Subject: [PATCH 24/37] Huzzah --driver-class-path does the trick.

---
 accelerators/comet_env_setup.sh | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/accelerators/comet_env_setup.sh b/accelerators/comet_env_setup.sh
index e930305..3563f0e 100644
--- a/accelerators/comet_env_setup.sh
+++ b/accelerators/comet_env_setup.sh
@@ -1,9 +1,8 @@
 #!/bin/bash

-# Instead of using --jars ${COMET_JAR} we copy the comet JAR into the SPARK_HOME
-# See https://github.com/apache/arrow-datafusion-comet/issues/221 for details
-cp "${COMET_JAR}" "${SPARK_HOME}/jars/"
 SPARK_EXTRA="
+--jars ${COMET_JAR} \
+--driver-class-path ${COMET_JAR} \
 --conf spark.comet.enabled=true \
 --conf spark.comet.exec.enabled=true \
 --conf spark.comet.exec.all.enabled=true \

From 8354fe752a1e9a446b19a8f03efd62190b5119d5 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 17:45:51 -0700
Subject: [PATCH 25/37] Make setup_gluten_deps better formatted for book inclusion

---
 accelerators/setup_gluten_deps.sh | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/accelerators/setup_gluten_deps.sh b/accelerators/setup_gluten_deps.sh
index b805fd1..6472390 100755
--- a/accelerators/setup_gluten_deps.sh
+++ b/accelerators/setup_gluten_deps.sh
@@ -2,7 +2,13 @@
 set -ex

 sudo apt-get update
-sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev libsodium-dev libsnappy-dev nasm && sudo apt install -y libunwind-dev
+#tag::gluten_deps[]
+sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential \
+    llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev \
+    libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev \
+    libsodium-dev libsnappy-dev nasm
+sudo apt install -y libunwind-dev
 sudo apt-get install -y libgoogle-glog-dev
 sudo apt-get -y install docker-compose
-sudo apt-get install -y libre2-9
+sudo apt-get install -y libre2-9 || sudo apt-get install -y libre2-10
+#end::gluten_deps[]

From 769ffebdf4fe8e425181d86c380b8135e95fb217 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 17:45:59 -0700
Subject: [PATCH 26/37] Tag the gluten setup

---
 accelerators/setup_gluten_spark34.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/accelerators/setup_gluten_spark34.sh b/accelerators/setup_gluten_spark34.sh
index d129f08..0cbfbc1 100755
--- a/accelerators/setup_gluten_spark34.sh
+++ b/accelerators/setup_gluten_spark34.sh
@@ -41,6 +41,7 @@ if [ "$DISTRIB_RELEASE" == "20.04" ]; then
 fi
 # Rather than if/else we fall through to build if wget fails because major version is not supported.
 if [ -z "$GLUTEN_JAR_PATH" ]; then
+    #tag::build_gluten[]
     if [ ! -d incubator-gluten ]; then
         git clone https://github.com/apache/incubator-gluten.git
     fi
@@ -48,6 +49,7 @@ if [ -z "$GLUTEN_JAR_PATH" ]; then
     sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON
     mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests
     GLUTEN_JAR_PATH="$(pwd)/package/target/gluten-package-*-SNAPSHOT-${SPARK_MAJOR_VERSION}.jar"
+    #end::build_gluten[]
 fi

 export GLUTEN_JAR_PATH

From 65f4a98e4dbbfa80d001c4209e87792e45bfc2e4 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 17:48:30 -0700
Subject: [PATCH 27/37] Disable gluten SQL

---
 .github/workflows/ci.yml | 62 ++++++++++++++++++++--------------------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index f7dbddb..5746690 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -64,37 +64,37 @@ jobs:
       - name: Run sql examples
         run: ./run_sql_examples.sh

-  run-gluten-sql-examples:
-    runs-on: ubuntu-latest
-    steps:
-      - name: Checkout
-        uses: actions/checkout@v2
-      - name: Cache Spark and friends
-        uses: actions/cache@v3
-        with:
-          path: |
-            spark*.tgz
-            iceberg*.jar
-          key: spark-artifacts
-      - name: Setup JDK
-        uses: actions/setup-java@v3
-        with:
-          distribution: temurin
-          java-version: 17
-      - name: Cache Maven packages
-        uses: actions/cache@v2
-        with:
-          path: ~/.m2
-          key: ${{ runner.os }}-m2-gluten
-      - name: Cache Data
-        uses: actions/cache@v3
-        with:
-          path: |
-            data/fetched/*
-          key: data-fetched
-      - name: Run gluten
-        run:
-          cd accelerators; ./gluten_spark_34_ex.sh
+  # run-gluten-sql-examples:
+  #   runs-on: ubuntu-latest
+  #   steps:
+  #     - name: Checkout
+  #       uses: actions/checkout@v2
+  #     - name: Cache Spark and friends
+  #       uses: actions/cache@v3
+  #       with:
+  #         path: |
+  #           spark*.tgz
+  #           iceberg*.jar
+  #         key: spark-artifacts
+  #     - name: Setup JDK
+  #       uses: actions/setup-java@v3
+  #       with:
+  #         distribution: temurin
+  #         java-version: 17
+  #     - name: Cache Maven packages
+  #       uses: actions/cache@v2
+  #       with:
+  #         path: ~/.m2
+  #         key: ${{ runner.os }}-m2-gluten
+  #     - name: Cache Data
+  #       uses: actions/cache@v3
+  #       with:
+  #         path: |
+  #           data/fetched/*
+  #         key: data-fetched
+  #     - name: Run gluten
+  #       run:
+  #         cd accelerators; ./gluten_spark_34_ex.sh
   run-comet-sql-examples:
     runs-on: ubuntu-latest
     steps:

From a0fa2bad58bcc87e5e8fa04462e6efaf9adc700e Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 18:18:33 -0700
Subject: [PATCH 28/37] Tag comet example for inclusion.
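
A rough sketch of how the tagged snippet is meant to be consumed together
with comet_env_setup.sh from the earlier commit (the spark-sql invocation
and the query itself are illustrative assumptions, not part of this change):

    source accelerators/setup_comet.sh      # exports COMET_JAR
    source accelerators/comet_env_setup.sh  # builds SPARK_EXTRA from COMET_JAR
    ${SPARK_HOME}/bin/spark-sql ${SPARK_EXTRA} -e "SELECT 1"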
---
 accelerators/setup_comet.sh | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/accelerators/setup_comet.sh b/accelerators/setup_comet.sh
index dbfadd5..a63f8eb 100755
--- a/accelerators/setup_comet.sh
+++ b/accelerators/setup_comet.sh
@@ -10,10 +10,13 @@ else
     echo "Building comet for Spark ${SPARK_MAJOR}"
 fi

+#tag::build[]
+# If we don't have arrow-datafusion-comet checked out, do it
 if [ ! -d arrow-datafusion-comet ]; then
     git clone https://github.com/apache/arrow-datafusion-comet.git
 fi

+# Build JAR if not present
 if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; then
     cd arrow-datafusion-comet
     make clean release PROFILES="-Pspark-${SPARK_MAJOR}"
@@ -21,3 +24,4 @@ if [ -z "$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)" ]; th
 fi
 COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*SNAPSHOT.jar)"
 export COMET_JAR
+#end::build[]

From a6bd178e15e21a38ed4a57f11fa9a62370428c3a Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 19:47:46 -0700
Subject: [PATCH 29/37] Add Python UDF/UDAF examples.

---
 python/examples/udf.py | 65 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 65 insertions(+)
 create mode 100644 python/examples/udf.py

diff --git a/python/examples/udf.py b/python/examples/udf.py
new file mode 100644
index 0000000..663406f
--- /dev/null
+++ b/python/examples/udf.py
@@ -0,0 +1,65 @@
+# This script triggers a number of different PySpark errors
+
+from pyspark.sql.session import SparkSession
+import sys
+
+global sc
+
+
+# tag::simple_udf[]
+@udf("long")
+def classic_add1(e: long) -> long:
+    return e + 1
+
+
+# end::simple_udf[]
+
+
+# tag::agg_new_udf[]
+@pandas_udf("long")
+def pandas_sum(s: pd.Series) -> pd.Series:
+    return s.sum()
+
+
+# end::agg_new_udf[]
+
+
+# tag::new_udf[]
+@pandas_udf("long")
+def pandas_add1(s: pd.Series) -> pd.Series:
+    # Vectorized operation on all of the elems in series at once
+    return s + 1
+
+
+# end::new_udf[]
+
+
+# tag::complex_udf[]
+@pandas_udf("long")
+def pandas_nested_add1(d: pd.pandas) -> pd.Series:
+    # Takes a struct and returns the age elem + 1, if we wanted
+    # to update (e.g. return struct) we could update d and return it instead.
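+    # Spark hands a struct column to a pandas UDF as a pd.DataFrame whose
+    # columns are the struct's fields, so d["age"] below selects the age
+    # field as a pd.Series and the + 1 is vectorized over it.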
+    return d["age"] + 1
+
+
+# end::complex_udf[]
+
+
+# tag::batches_of_batches_udf[]
+@pandas_udf("col1")
+def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]:
+    my_db_connection = None  # Expensive setup logic goes here
+    for s in t:
+        # Vectorized operation on all of the elems in series at once
+        yield s + 1
+
+
+# end::batches_of_batches_udf[]
+
+
+if __name__ == "__main__":
+    spark = SparkSession.builder.master("local[4]").getOrCreate()
+    # Make sure to make
+    # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021"
+    # available as ./data/2021
+    uk_df = spark.read.csv("data/fetched/2021", header=True, inferSchema=True)

From 50f804ccf946eded07caf98684f51e1d4ec27ffd Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 24 Mar 2024 22:30:28 -0700
Subject: [PATCH 30/37] Style fixes

---
 python/examples/udf.py | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/python/examples/udf.py b/python/examples/udf.py
index 663406f..33d8de6 100644
--- a/python/examples/udf.py
+++ b/python/examples/udf.py
@@ -1,14 +1,17 @@
 # This script triggers a number of different PySpark errors

 from pyspark.sql.session import SparkSession
+from pyspark.sql.functions import pandas_udf, udf
+from collections.abc import Iterator
 import sys
+import pandas as pd

 global sc


 # tag::simple_udf[]
 @udf("long")
-def classic_add1(e: long) -> long:
+def classic_add1(e: int) -> int:
     return e + 1

@@ -50,8 +53,10 @@ def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]:
     my_db_connection = None  # Expensive setup logic goes here
     for s in t:
-        # Vectorized operation on all of the elems in series at once
-        yield s + 1
+        # Do something with your setup logic
+        if my_db_connection is None:
+            # Vectorized operation on all of the elems in series at once
+            yield s + 1


 # end::batches_of_batches_udf[]

From e377e9e8256db237a5388726de78cfcc6c15d500 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 25 Mar 2024 21:45:24 -0700
Subject: [PATCH 31/37] Move SparkSession builder up

---
 python/examples/udf.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/python/examples/udf.py b/python/examples/udf.py
index 33d8de6..bb0758b 100644
--- a/python/examples/udf.py
+++ b/python/examples/udf.py
@@ -8,6 +8,8 @@
 global sc

+# We need the session before we can use @udf
+spark = SparkSession.builder.master("local[4]").getOrCreate()

 # tag::simple_udf[]
 @udf("long")
@@ -63,7 +65,6 @@ def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]:


 if __name__ == "__main__":
-    spark = SparkSession.builder.master("local[4]").getOrCreate()
     # Make sure to make
     # "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021"
     # available as ./data/2021

From 1b8c55c221a193c8e9682a5fffa771eecc6ea86d Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 25 Mar 2024 22:17:22 -0700
Subject: [PATCH 32/37] style fix

---
 python/examples/udf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/examples/udf.py b/python/examples/udf.py
index bb0758b..50c9a49 100644
--- a/python/examples/udf.py
+++ b/python/examples/udf.py
@@ -8,6 +8,7 @@
 global sc

+
 # We need the session before we can use @udf
 spark = SparkSession.builder.master("local[4]").getOrCreate()


From 362e17a3ad01442ec2787ea0e1a79de0976dd49d Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Mon, 25 Mar 2024 22:31:06 -0700
Subject: [PATCH 33/37] Fix typing import + pd.DF

---
 python/examples/udf.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/python/examples/udf.py b/python/examples/udf.py
index 50c9a49..f73e4db 100644
--- a/python/examples/udf.py
+++ b/python/examples/udf.py
@@ -2,7 +2,7 @@

 from pyspark.sql.session import SparkSession
 from pyspark.sql.functions import pandas_udf, udf
-from collections.abc import Iterator
+from typing import Iterator
 import sys
 import pandas as pd
@@ -23,7 +23,7 @@ def classic_add1(e: int) -> int:

 # tag::agg_new_udf[]
 @pandas_udf("long")
-def pandas_sum(s: pd.Series) -> pd.Series:
+def pandas_sum(s: pd.Series) -> int:
     return s.sum()

@@ -42,7 +42,7 @@ def pandas_add1(s: pd.Series) -> pd.Series:

 # tag::complex_udf[]
 @pandas_udf("long")
-def pandas_nested_add1(d: pd.pandas) -> pd.Series:
+def pandas_nested_add1(d: pd.DataFrame) -> pd.Series:
     # Takes a struct and returns the age elem + 1, if we wanted
     # to update (e.g. return struct) we could update d and return it instead.
     return d["age"] + 1
@@ -52,7 +52,7 @@ def pandas_nested_add1(d: pd.DataFrame) -> pd.Series:

 # tag::batches_of_batches_udf[]
-@pandas_udf("col1")
+@pandas_udf("long")
 def pandas_batches_of_batches(t: Iterator[pd.Series]) -> Iterator[pd.Series]:
     my_db_connection = None  # Expensive setup logic goes here
     for s in t:

From 4cd8622448c2ff71dde4f031c0086fd5ca3972ee Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 31 Mar 2024 21:37:55 -0700
Subject: [PATCH 34/37] Style fix

---
 python/examples/udf.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/python/examples/udf.py b/python/examples/udf.py
index f73e4db..f0d6a60 100644
--- a/python/examples/udf.py
+++ b/python/examples/udf.py
@@ -12,6 +12,7 @@
 # We need the session before we can use @udf
 spark = SparkSession.builder.master("local[4]").getOrCreate()

+
 # tag::simple_udf[]
 @udf("long")
 def classic_add1(e: int) -> int:

From 4d23e5eeebf531f619cdd2ad0db862dfb32aea2c Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 31 Mar 2024 21:47:32 -0700
Subject: [PATCH 35/37] Use axel if present

---
 env_setup.sh | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/env_setup.sh b/env_setup.sh
index 42ed190..a3f7599 100755
--- a/env_setup.sh
+++ b/env_setup.sh
@@ -11,7 +11,13 @@ SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
 SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz"
 ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"}
 if [ ! -f "${SPARK_FILE}" ]; then
-    wget "https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}" &
+    SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}"
+    if ! command -v axel &> /dev/null
+    then
+        axel "$SPARK_DIST_URL" &
+    else
+        wget "$SPARK_DIST_URL" &
+    fi
 fi
 # Download Iceberg if not present
 ICEBERG_FILE="iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}-${ICEBERG_VERSION}.jar"

From 4d3373b91e40669bb2aba923dc6ef4b459267184 Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 31 Mar 2024 21:47:50 -0700
Subject: [PATCH 36/37] Add mypy to tox.ini so we don't depend on it being in the system setup.
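
With mypy pinned in the tox deps, the type checks ride along with the test
envs instead of relying on a system-wide install; as a sketch (assuming tox
itself is already available):

    cd python
    tox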
---
 python/tox.ini | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/python/tox.ini b/python/tox.ini
index e661b21..330cd58 100644
--- a/python/tox.ini
+++ b/python/tox.ini
@@ -32,6 +32,7 @@ deps =
    pyspark==3.5.0
    flake8
    spark-testing-base>=0.11.1
+   mypy
    -rrequirements.txt
 commands =
    pytest examples \
@@ -64,6 +65,7 @@ extras = tests
 passenv = *
 deps =
    pytest
+   mypy
    -rrequirements.txt
 setenv =
    {[testenv]setenv}

From 2419a8c641eddff8c6484e68ca38765c4368196c Mon Sep 17 00:00:00 2001
From: Holden Karau
Date: Sun, 31 Mar 2024 21:50:08 -0700
Subject: [PATCH 37/37] Fix axel command.

---
 env_setup.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/env_setup.sh b/env_setup.sh
index a3f7599..50ff073 100755
--- a/env_setup.sh
+++ b/env_setup.sh
@@ -12,7 +12,7 @@ SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz"
 ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"}
 if [ ! -f "${SPARK_FILE}" ]; then
     SPARK_DIST_URL="https://dlcdn.apache.org/spark/spark-${SPARK_VERSION}/${SPARK_FILE}"
-    if ! command -v axel &> /dev/null
+    if command -v axel &> /dev/null
     then
         axel "$SPARK_DIST_URL" &
     else
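         # Fall back to plain wget when axel is absent; axel merely
         # parallelizes the download, and either tool fetches the same
         # ${SPARK_FILE}.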