-
Notifications
You must be signed in to change notification settings - Fork 6
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Iceberg class demo container (high-performance-spark#132)
* Start working on making docker container * Upgrade to latest Iceberg, ignore coursier file, install Scala Jupyter kernel * Install correct jupyter * We want the scala kernel to point to Spark & include the class path for the examples * More progress on the container * Ok we need to use 2.13.8 since using .14 gives us some cats issues, roll back kernel to match .8, cross-mount iceberg-workshop, forward 8877, do some tricks for faster container builds * Make directories for cross mount if not present, add jupyter-lab launch to bash history for folks that want to launch bash and then they can easily up arrow (and by folks I mean me) * Shellcheck fixes for build container script. * Use axel quietly so I can find things * Match scala version of Spark * More shellcheck fixes * Make the wgets quiet too. * oops missed one. * Fix scala version * Update for Spark 4 / Scala 2.13 * Bump sbt version (note see the spark-400 branch for the cherry picked parts) * Use 2.13 target * Match comet to regular build
- Loading branch information
Showing
23 changed files
with
169 additions
and
45 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
# Open JDK11, Spark 3.X and the latest JDKs get a little spicy | ||
FROM azul/zulu-openjdk:11-latest | ||
|
||
# Install OS tooling, editors, Python and sbt in a single layer.
# - The package list is the de-duplicated, sorted union of the packages that
#   were previously spread across two install invocations.
# - DEBIAN_FRONTEND is exported only inside this RUN so that packages such as
#   tzdata cannot stop the build on a configuration prompt, without leaking the
#   setting into the runtime environment.
# - sbt comes from the scala-sbt apt repository; its signing key is imported
#   into a dedicated keyring under /etc/apt/trusted.gpg.d.
# - The apt lists are removed in the same layer so they do not bloat the image.
RUN export DEBIAN_FRONTEND=noninteractive && \
    apt-get -qq update && \
    apt-get -qq -y upgrade && \
    apt-get -qq -y install \
      apt-transport-https \
      axel \
      curl \
      emacs \
      git-core \
      gnupg \
      htop \
      locales \
      nano \
      net-tools \
      psmisc \
      python3 \
      python3-pip \
      software-properties-common \
      sudo \
      tzdata \
      vim \
      wget && \
    locale-gen en_US.UTF-8 && \
    echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
    curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
    chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
    apt-get update && \
    apt-get -qq -y install sbt && \
    rm -rf /var/lib/apt/lists/*
|
||
# Fetch the coursier launcher. -f makes curl exit non-zero on an HTTP error
# instead of saving the error page as the "coursier" executable.
RUN curl -fLo coursier https://git.io/coursier-cli && \
    chmod +x coursier
# ensure the JAR of the CLI is in the coursier cache, in the image
RUN ./coursier --help
# --no-cache-dir keeps pip's download cache out of the image layer.
RUN pip install --no-cache-dir jupyter
# Build a standalone almond (Scala Jupyter kernel) launcher for Scala 2.13.8
# and register it under the kernel id "scala2.13".
RUN ./coursier bootstrap \
      -r jitpack \
      -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
      sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
      --default=true --sources \
      -o almond && \
    ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
# Make both launchers readable/executable by every user (they are re-run as
# a non-root user later in the build).
RUN chmod a+xr almond coursier
RUN ./coursier launch almond --scala 2.13.8 -- --install
# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
# Install Scala 2.13.8 from the .deb instead. axel is tried first and wget is
# the fallback; -O pins the output name so a partial axel download cannot make
# wget write to scala-2.13.8.deb.1 and leave dpkg with a truncated file.
RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget -O scala-2.13.8.deb https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && \
    dpkg --install scala-2.13.8.deb && \
    rm scala-2.13.8.deb
|
||
# Create the "dev" user for the demo container, add it to the sudo group,
# and give it the password "dev".
# --disabled-password / --gecos "" make adduser fully non-interactive, so the
# build does not depend on how adduser reacts to a missing terminal; the
# password is then set explicitly via chpasswd.
RUN adduser --disabled-password --gecos "" dev && \
    adduser dev sudo && \
    echo 'dev:dev' | chpasswd
# Seed the user's home: a copy of the coursier launcher plus shell settings.
# NOTE(review): SPARK_HOME now points at the "-scala2.13" directory to agree
# with the jars path used in misc/kernel.json and with SCALA_VERSION=2.13
# passed to env_setup.sh; env_setup.sh is not visible here - confirm the
# directory name it actually creates.
RUN mkdir -p ~dev && \
    cp ./coursier ~dev/ && \
    echo "color_prompt=yes" >> ~dev/.bashrc && \
    echo "export force_color_prompt=yes" >> ~dev/.bashrc && \
    echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.1-bin-hadoop3-scala2.13" >> ~dev/.bashrc && \
    chown -R dev ~dev
USER dev
# Kernels are installed in user so we need to run as the user
RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
RUN ./coursier launch almond --scala 2.13.8 -- --install
# Back to root for the remaining setup steps.
USER root
|
||
# Project directory, owned by the dev user.
RUN mkdir /high-performance-spark-examples
RUN chown -R dev /high-performance-spark-examples
WORKDIR /high-performance-spark-examples
# Increase the chance of caching by copying just the env setup file first.
COPY --chown=dev:dev env_setup.sh ./
# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
# Also downloads some test data
# NOTE(review): the kernel display name in misc/kernel.json says Iceberg 1.6 -
# confirm which version env_setup.sh installs.
RUN SCALA_VERSION=2.13 ./env_setup.sh
# Keep the generated kernel spec as a backup, then replace it with the
# project's own kernel.json.
RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
# Note: We need to use /home in the COPY otherwise no happy pandas
COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
RUN git clone https://github.com/holdenk/spark-upgrade.git
# The clone above ran as root, so hand the tree back to dev.
RUN chown -R dev /high-performance-spark-examples
# myapp.tar is a local tar archive of the repository; ADD unpacks it in place.
ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
RUN chown -R dev /high-performance-spark-examples
USER dev
# Pre-seed shell history so the launch command is one up-arrow away.
RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
RUN sbt clean compile
# Documents the port jupyter-lab listens on (publishing still needs -p).
EXPOSE 8877
CMD ["jupyter-lab", "--ip", "0.0.0.0", "--port", "8877"]
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
#!/bin/bash
# Build and push the multi-arch demo image.
#
# A tree hash of the tracked files (including uncommitted modifications) is
# compared against the hash stored in ./oldhash; myapp.tar is only regenerated
# when the hash differs or the tarball is missing. Note that the tarball itself
# is always an archive of HEAD, so uncommitted changes influence the hash but
# are not included in the archive.

set -ex

# Work on a private copy of the index so the real staging area is untouched.
# mktemp avoids clobbering a fixed /tmp path when two builds run at once, and
# the trap removes the copy on exit. --git-path also resolves the index
# correctly inside linked worktrees.
tmp_index=$(mktemp)
trap 'rm -f "$tmp_index"' EXIT
cp "$(git rev-parse --git-path index)" "$tmp_index"
GIT_INDEX_FILE="$tmp_index" git add -u
hash=$(GIT_INDEX_FILE="$tmp_index" git write-tree)

# A missing oldhash file (first run) simply means "no previous hash".
oldhash=""
if [ -f oldhash ]; then
  oldhash=$(cat oldhash)
fi

if [ "$hash" = "$oldhash" ] && [ -f myapp.tar ]; then
  echo "Skipping making tar since we match."
else
  echo "Making tar since no match"
  git archive -o myapp.tar --format=tar HEAD
  echo "$hash" > oldhash
fi
IMAGE=holdenk/hps:0.1
docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push
#docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 0 additions & 3 deletions
3
core/src/main/scala/com/high-performance-spark-examples/ml/CustomPipeline.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 0 additions & 3 deletions
3
core/src/main/scala/com/high-performance-spark-examples/ml/SimpleNaiveBayes.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
5 changes: 0 additions & 5 deletions
5
core/src/main/scala/com/high-performance-spark-examples/ml/SimplePipeline.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
3 changes: 0 additions & 3 deletions
3
core/src/main/scala/com/high-performance-spark-examples/mllib/GoldilocksMLlib.scala
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,19 @@ | ||
{
  "argv": [
    "java",
    "-cp",
    "/home/dev/.local/share/jupyter/kernels/scala2.13/launcher.jar:.:/high-performance-spark-examples/:/high-performance-spark-examples/target/scala-2.13:/home/dev/.local/share/jupyter/kernels/scala/launcher.jar:/high-performance-spark-examples/spark-3.5.1-bin-hadoop3-scala2.13/jars/*",
    "coursier.bootstrap.launcher.Launcher",
    "--log",
    "info",
    "--metabrowse",
    "--id",
    "scala2.13",
    "--display-name",
    "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)",
    "--connection-file",
    "{connection_file}"
  ],
  "display_name": "Scala 2.13 (w/ Spark 3.5 & Iceberg 1.6)",
  "language": "scala"
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1 @@ | ||
sbt.version=1.9.6 | ||
sbt.version=1.9.9 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
#!/bin/bash
# Build the image, pull it, and start an interactive shell in the container
# with the local warehouse/ and iceberg-workshop/ directories bind-mounted and
# port 8877 published.
set -ex

image=holdenk/hps:0.1
host_dir="$(pwd)"

./build_container.sh
docker image pull "${image}"

# The bind mounts below need their source directories to exist.
mkdir -p warehouse
mkdir -p iceberg-workshop

docker container run \
  --mount type=bind,source="${host_dir}/warehouse",target=/warehouse \
  --mount type=bind,source="${host_dir}/iceberg-workshop",target=/high-performance-spark-examples/iceberg-workshop \
  -p 8877:8877 \
  -it "${image}" /bin/bash
Oops, something went wrong.