Skip to content

Commit

Permalink
Get Gluten + Spark3.4 to party (note: this fails because of Gluten se…
Browse files Browse the repository at this point in the history
…gfault)

re-enable gluten-sql-ex

Add a cache for accelerator jars.

Let's go for 3.5.1

Update shell for style

Fix gluten jar dl

Add full path for SPARK_PATH

Only use pre-built for 20.04

Build deps with sudo

Ignore incubator gluten

More work getting gluten and comet
  • Loading branch information
holdenk committed Mar 20, 2024
1 parent a14aa7b commit 8c90b72
Show file tree
Hide file tree
Showing 16 changed files with 225 additions and 100 deletions.
74 changes: 49 additions & 25 deletions .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,30 +64,48 @@ jobs:
- name: Run sql examples
run:
./run_sql_examples.sh
# run-gluten-sql-examples:
# runs-on: ubuntu-latest
# steps:
# - name: Checkout
# uses: actions/checkout@v2
# - name: Cache Spark and friends
# uses: actions/cache@v3
# with:
# path: |
# spark*.tgz
# iceberg*.jar
# key: spark-artifacts
# - name: Cache Data
# uses: actions/cache@v3
# with:
# path: |
# data/fetched/*
# key: data-fetched
# - name: Setup gluten
# run:
# ./setup_gluten.sh
# - name: Run sql examples w/ gluten
# run:
# ./run_sql_examples.sh
run-gluten-sql-examples:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Cache Spark and friends
uses: actions/cache@v3
with:
path: |
spark*.tgz
iceberg*.jar
key: spark-artifacts
- name: Cache Data
uses: actions/cache@v3
with:
path: |
data/fetched/*
key: data-fetched
- name: Run gluten
run:
cd accelerators; ./gluten_spark_34_ex.sh
run-comet-sql-examples:
runs-on: ubuntu-latest
steps:
- name: Checkout
uses: actions/checkout@v2
- name: Cache Spark and friends
uses: actions/cache@v3
with:
path: |
spark*.tgz
iceberg*.jar
key: spark-artifacts
- name: Cache Data
uses: actions/cache@v3
with:
path: |
data/fetched/*
key: data-fetched
- name: Run comet
run:
cd accelerators; ./comet_ex.sh
run-target-examples:
runs-on: ubuntu-latest
steps:
Expand All @@ -100,6 +118,12 @@ jobs:
spark*.tgz
iceberg*.jar
key: spark-artifacts
- name: Cache Accel
uses: actions/cache@v3
with:
path: |
accelerators/*.jar
key: accelerators-artifacts
- name: Cache Data
uses: actions/cache@v3
with:
Expand Down Expand Up @@ -138,7 +162,7 @@ jobs:
- name: Shellcheck
run: |
sudo apt-get install -y shellcheck
shellcheck $(find -name "*.sh")
shellcheck -e SC2317,SC1091,SC2034,SC2164 $(find -name "*.sh")
- name: Setup JDK
uses: actions/setup-java@v3
with:
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -94,3 +94,4 @@ gluten
gluten*.jar
spark-3*hadoop*/
spark-3*hadoop*.tgz
accelerators/incubator-gluten
8 changes: 8 additions & 0 deletions accelerators/comet_env_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash

# Spark submit arguments that attach the Comet bundle jar and turn on the
# Comet session extension plus native execution. COMET_JAR must already be
# set in the environment (see setup_comet.sh).
SPARK_EXTRA="--jars ${COMET_JAR}"
SPARK_EXTRA+=" --conf spark.sql.extensions=org.apache.comet.CometSparkSessionExtensions"
SPARK_EXTRA+=" --conf spark.comet.enabled=true"
SPARK_EXTRA+=" --conf spark.comet.exec.enabled=true"
SPARK_EXTRA+=" --conf spark.comet.exec.all.enabled=true"
export SPARK_EXTRA
6 changes: 6 additions & 0 deletions accelerators/comet_ex.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
#!/bin/bash
# Run the SQL examples with the Comet accelerator enabled.
# NOTE(review): sourced scripts are resolved relative to the current
# working directory, so this must be invoked from accelerators/.

set -ex
# Clones/builds Comet as needed and exports COMET_JAR.
source setup_comet.sh
# Exports SPARK_EXTRA with the Comet --jars/--conf flags.
source comet_env_setup.sh
# USE_COMET is presumably read by run_sql_examples.sh -- confirm there.
USE_COMET="true" ../run_sql_examples.sh
44 changes: 44 additions & 0 deletions accelerators/gluten_env_setup.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#!/bin/bash

# Detect a usable Gluten install and export the spark-submit flags that
# enable it. On Spark 3.4 + Ubuntu 20.04 the pre-built bundle jar under
# accelerators/ is used; otherwise a source build under incubator-gluten/
# is looked for.
#
# Exports:
#   SPARK_EXTRA       - extra spark-submit/spark-sql arguments
#   GLUTEN_EXISTS     - "true" when a Gluten bundle jar was found
#   GLUTEN_UDF_EXISTS - "true" when the native UDF library was also found

# Check whether gluten and the gluten native UDF library are present.
GLUTEN_NATIVE_LIB_NAME=libhigh-performance-spark-gluten-0.so
NATIVE_LIB_DIR=$(pwd)/../native/src/
NATIVE_LIB_PATH="${NATIVE_LIB_DIR}${GLUTEN_NATIVE_LIB_NAME}"
GLUTEN_HOME=incubator-gluten
# Provides DISTRIB_RELEASE (Ubuntu release number).
source /etc/lsb-release
# BUG FIX: the original used `[ a && b ]`, which is invalid test syntax
# (the test builtin has no &&) and aborts with "missing ]"; use [[ ... ]].
if [[ "$SPARK_MAJOR" == "3.4" && "$DISTRIB_RELEASE" == "20.04" ]]; then
  GLUTEN_EXISTS="true"
  gluten_jvm_jar=$(ls accelerators/gluten-velox-bundle-spark3.4_2.12-1.1.0.jar)
  GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=5g \
--conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager \
--jars ${gluten_jvm_jar}"
else
  if [ -d "${GLUTEN_HOME}" ]; then
    GLUTEN_EXISTS="true"
    gluten_jvm_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-velox-bundle-spark3.5_2.12-ubuntu_*-*-SNAPSHOT.jar) #TBD
    gluten_jvm_package_jar=$(ls "${GLUTEN_HOME}"/package/target/gluten-package*-*-SNAPSHOT.jar)
    GLUTEN_SPARK_EXTRA="--conf spark.plugins=io.glutenproject.GlutenPlugin \
--jars ${gluten_jvm_jar},${gluten_jvm_package_jar} \
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=5g \
--conf spark.shuffle.manager=org.apache.spark.shuffle.sort.ColumnarShuffleManager"
    # Enable UDF separately.
  fi
fi
if [ -f "${NATIVE_LIB_PATH}" ]; then
  if [ "$GLUTEN_EXISTS" == "true" ]; then
    GLUTEN_UDF_EXISTS="true"
    GLUTEN_SPARK_EXTRA="$GLUTEN_SPARK_EXTRA \
--conf spark.jars=${gluten_jvm_jar} \
--conf spark.gluten.loadLibFromJar=true \
--files ${NATIVE_LIB_PATH} \
--conf spark.gluten.sql.columnar.backend.velox.udfLibraryPaths=${GLUTEN_NATIVE_LIB_NAME}"
  fi
fi
# BUG FIX: the original assigned the literal string "GLUTEN_SPARK_EXTRA"
# (missing $), so SPARK_EXTRA never contained the flags built above.
SPARK_EXTRA="$GLUTEN_SPARK_EXTRA"

export SPARK_EXTRA
export GLUTEN_UDF_EXISTS
export GLUTEN_EXISTS
19 changes: 14 additions & 5 deletions accelerators/gluten_spark_34_ex.sh
Original file line number Diff line number Diff line change
@@ -1,13 +1,22 @@
#!/bin/bash

source setup.sh
set -ex

SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
cd "${SCRIPT_DIR}"
source "${SCRIPT_DIR}/setup_gluten_spark34.sh"

SPARK_HOME=${SPARK_DIR}
export SPARK_HOME
PATH=$(pwd)/${SPARK_DIR}/bin:$PATH
spark-sql --master local[5] \
PATH="$(pwd)/${SPARK_DIR}/bin:$PATH"
export PATH
"${SPARK_HOME}/bin/spark-sql" --master local[5] \
--conf spark.plugins=io.glutenproject.GlutenPlugin \
--conf spark.memory.offHeap.enabled=true \
--conf spark.memory.offHeap.size=5g \
--jars ${GLUTEN_JAR} \
--jars "${GLUTEN_JAR}" \
--conf spark.eventLog.enabled=true \
-e "SELECT 1"

source gluten_env_setup.sh
cd ..
./run_sql_examples.sh || echo "Expected to fail"
9 changes: 9 additions & 0 deletions accelerators/install_rust_if_needed.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
#!/bin/bash
# Install the Rust toolchain (cargo) via rustup if it is not already on
# PATH. Safe to source repeatedly.

# Pick up an existing rustup install; quote $HOME in case the path has
# spaces (the original left it unquoted).
if [ -f "$HOME/.cargo/env" ]; then
  source "$HOME/.cargo/env"
fi

# `command -v` prints the resolved path on success; discard it so sourcing
# this script stays quiet (the original leaked it to stdout).
if ! command -v cargo > /dev/null; then
  # Official rustup bootstrap; -y accepts the defaults non-interactively.
  curl --proto '=https' --tlsv1.2 -sSf https://sh.rustup.rs | sh -s -- -y
  source "$HOME/.cargo/env"
fi
4 changes: 3 additions & 1 deletion accelerators/run_gluten.sh
100644 → 100755
Original file line number Diff line number Diff line change
@@ -1 +1,3 @@
${SPARK_HOME}/bin/spark-shell --master local --jars ${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar --spark-properties=gluten_config.properties
#!/bin/bash

"${SPARK_HOME}/bin/spark-shell" --master local --jars "${ACCEL_JARS}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_2.12-1.1.1.jar" --spark-properties=gluten_config.properties
34 changes: 0 additions & 34 deletions accelerators/setup.sh

This file was deleted.

16 changes: 16 additions & 0 deletions accelerators/setup_comet.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
#!/bin/bash
# Clone and build Apache Comet (arrow-datafusion-comet) for Spark 3.4 and
# export COMET_JAR pointing at the built bundle jar.

set -ex

# Comet's native code is Rust; make sure cargo is available.
source install_rust_if_needed.sh

if [ ! -d arrow-datafusion-comet ]; then
  git clone https://github.com/apache/arrow-datafusion-comet.git
fi

# Build only when no bundle jar exists yet. `compgen -G` tests the glob
# without the word-splitting failure of the original unquoted
# `[ -z $(ls ...) ]` (which errors out when ls matches several files).
if ! compgen -G "arrow-datafusion-comet/spark/target/comet-spark-spark*.jar" > /dev/null; then
  # Run the build in a subshell so the working directory is unchanged
  # afterwards; the original cd'd in and never returned, leaving $(pwd)
  # and the final ls pointing at the wrong directory.
  (cd arrow-datafusion-comet && make clean release PROFILES="-Pspark-3.4")
fi
# BUG FIX: the original listed incubator-comet/..., but the repository is
# cloned as arrow-datafusion-comet/, so COMET_JAR was never found.
COMET_JAR="$(pwd)/$(ls arrow-datafusion-comet/spark/target/comet-spark-spark*.jar)"
export COMET_JAR
8 changes: 8 additions & 0 deletions accelerators/setup_gluten_deps.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
#!/bin/bash
# Install the system packages needed to build Gluten (Velox backend) on
# Ubuntu. Requires sudo privileges.
set -ex

sudo apt-get update
# Build toolchain (cmake/ninja/clang/maven) plus the native libraries
# Velox links against (boost, re2, glog below, snappy, ...).
sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev libsodium-dev libsnappy-dev nasm && sudo apt install -y libunwind-dev
sudo apt-get install -y libgoogle-glog-dev
sudo apt-get -y install docker-compose
# Runtime re2 shared library (the -dev package above is build-time only).
sudo apt-get install -y libre2-9
2 changes: 1 addition & 1 deletion accelerators/setup_gluten_from_src.sh
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@
set -ex

# Setup deps
sudo apt-get update && sudo apt-get install -y locales wget tar tzdata git ccache cmake ninja-build build-essential llvm-dev clang libiberty-dev libdwarf-dev libre2-dev libz-dev libssl-dev libboost-all-dev libcurl4-openssl-dev maven rapidjson-dev libdouble-conversion-dev libgflags-dev libsodium-dev libsnappy-dev nasm && sudo apt install -y libunwind-dev && sudo apt-get install -y libgoogle-glog-dev && sudo apt-get -y install docker-compose
source setup_gluten_deps.sh

# Try gluten w/clickhouse
#if [ ! -d gluten ]; then
Expand Down
51 changes: 51 additions & 0 deletions accelerators/setup_gluten_spark34.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
#!/bin/bash
# Set up Gluten (Velox backend) for Spark 3.4. On Ubuntu 20.04 the
# pre-built bundle jar is downloaded from the oap-project release;
# on other releases Gluten is built from source (apache/incubator-gluten).

# Event-log directory for Spark (used when spark.eventLog.enabled is set).
mkdir -p /tmp/spark-events
# Absolute directory of this script, so it can be run from anywhere.
SCRIPT_DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
# NOTE(review): ACCEL_JARS is not referenced below -- presumably consumed
# by scripts sourcing this file (e.g. run_gluten.sh); confirm.
ACCEL_JARS=${SCRIPT_DIR}
SPARK_MAJOR_VERSION=3.4
SCALA_VERSION=${SCALA_VERSION:-"2.12"}

set -ex

# Note: this does not work on Ubuntu 23, only on 22
# You might get something like:
# # C [libgluten.so+0x30c753] gluten::Runtime::registerFactory(std::string const&, std::function<gluten::Runtime* (std::unordered_map<std::string, std::string, std::hash<std::string>, std::equal_to<std::string>, std::allocator<std::pair<std::string const, std::string> > > const&)>)+0x23


SPARK_VERSION=3.4.2
SPARK_MAJOR=3.4
HADOOP_VERSION=3
SPARK_DIR="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
# NOTE(review): SPARK_FILE is not used below in this script -- confirm
# whether a sourcing script depends on it.
SPARK_FILE="${SPARK_DIR}.tgz"

# Exported so env_setup.sh (sourced below) fetches the matching Spark.
export SPARK_MAJOR
export SPARK_VERSION

# Install the system build/runtime dependencies.
source setup_gluten_deps.sh

# env_setup.sh lives in the repository root, one level up.
cd ..
# Provides DISTRIB_RELEASE for the Ubuntu version check below.
source /etc/lsb-release
# Pre-baked only
if [ "$DISTRIB_RELEASE" == "20.04" ]; then
# Download Spark/iceberg artifacts, then return to this script's dir.
source ./env_setup.sh
cd "${SCRIPT_DIR}"

GLUTEN_JAR="gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar"
GLUTEN_JAR_PATH="${SCRIPT_DIR}/gluten-velox-bundle-spark${SPARK_MAJOR_VERSION}_${SCALA_VERSION}-1.1.0.jar"

export GLUTEN_JAR

# Fetch the pre-built bundle in the background; `wait` below joins it.
if [ ! -f "${GLUTEN_JAR_PATH}" ]; then
wget "https://github.com/oap-project/gluten/releases/download/v1.1.0/${GLUTEN_JAR}" &
fi

wait
else
# No pre-built jar for this release: build from source instead.
if [ ! -d incubator-gluten ]; then
git clone https://github.com/apache/incubator-gluten.git
fi
cd incubator-gluten
sudo ./dev/builddeps-veloxbe.sh --enable_s3=ON
mvn clean package -Pbackends-velox -Pspark-3.4 -DskipTests
fi
5 changes: 3 additions & 2 deletions env_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,10 @@

# Download Spark and iceberg if not present
SPARK_MAJOR=${SPARK_MAJOR:-"3.5"}
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.0"}
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"}
SCALA_VERSION=${SCALA_VERSION:-"2.12"}
HADOOP_VERSION="3"
SPARK_PATH="spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
SPARK_FILE="spark-${SPARK_VERSION}-bin-hadoop3.tgz"
ICEBERG_VERSION=${ICEBERG_VERSION:-"1.4.0"}
if [ ! -f "${SPARK_FILE}" ]; then
Expand All @@ -18,6 +18,7 @@ if [ ! -f "${ICEBERG_FILE}" ]; then
wget "https://search.maven.org/remotecontent?filepath=org/apache/iceberg/iceberg-spark-runtime-${SPARK_MAJOR}_${SCALA_VERSION}/${ICEBERG_VERSION}/${ICEBERG_FILE}" -O "${ICEBERG_FILE}" &
fi
wait
sleep 1
# Setup the env
if [ ! -d "${SPARK_PATH}" ]; then
tar -xf "${SPARK_FILE}"
Expand Down
Loading

0 comments on commit 8c90b72

Please sign in to comment.