Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
* Start working on sol nb

* Finish update to Spark 3.5.2, download 2022 & 2023 data, install Python-is-Python3 so we can run the example.

* Add solution

* Ignore incubator-gluten

* Update workshop NB

* Install pyarrow & pyiceberg for folks who want to poke around at the parquet files.

* Update solutions

* More update

* Forward the Spark UI for ze query plans.

* Update solution

* Update solutions

* Re-enable cross build, use launch script

* Make exec

* Let's make a slimmed down container for folks who need it.

* Fix spark home slim down mini some more

eh wait we don't need a root user install of scala.

oops we do need it

* Tag mini image separately

* Stack them

* Avoid pip cache dir

* Don't keep the spark tarball in the image

* Separate out build from run

* shell check fixes
  • Loading branch information
holdenk authored Aug 15, 2024
1 parent b3db591 commit e2fecda
Show file tree
Hide file tree
Showing 10 changed files with 1,231 additions and 74 deletions.
7 changes: 6 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -103,4 +103,9 @@ incubator-glutten/*
project/build.sbt
coursier
# Magic file we use for build tracking
oldhash
oldhash
# ignore ipynb checkpoints
.ipynb_checkpoints/

# ignore accel
incubator-gluten/
69 changes: 3 additions & 66 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,70 +1,7 @@
# Open JDK11, Spark 3.X and the latest JDKs get a little spicy
FROM azul/zulu-openjdk:11-latest
ARG base
FROM $base

RUN apt-get -qq update && \
apt-get -qq -y upgrade && \
apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop && \
locale-gen en_US.UTF-8 && \
apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \
echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
apt-get update && \
apt-get -qq -y install sbt && \
rm -rf /var/lib/apt/lists/*

RUN curl -Lo coursier https://git.io/coursier-cli
RUN chmod +x coursier
# ensure the JAR of the CLI is in the coursier cache, in the image
RUN ./coursier --help
RUN pip install jupyter
RUN ./coursier bootstrap \
-r jitpack \
-i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
--default=true --sources \
-o almond && \
./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
RUN chmod a+xr almond coursier
RUN ./coursier launch almond --scala 2.13.8 -- --install
# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb

RUN adduser dev
RUN adduser dev sudo
RUN echo 'dev:dev' | chpasswd
RUN mkdir -p ~dev
RUN cp ./coursier ~dev/
RUN echo "color_prompt=yes" >> ~dev/.bashrc
RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc
RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.1-bin-hadoop3" >> ~dev/.bashrc
RUN chown -R dev ~dev
USER dev
# Kernels are installed in user so we need to run as the user
RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
RUN ./coursier launch almond --scala 2.13.8 -- --install
USER root

RUN mkdir /high-performance-spark-examples
RUN chown -R dev /high-performance-spark-examples
WORKDIR /high-performance-spark-examples
# Increase the chance of caching by copying just the env setup file first.
COPY --chown=dev:dev env_setup.sh ./
# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
# Also downloads some test data
RUN SCALA_VERSION=2.13 ./env_setup.sh
RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
# Note: We need to use /home in the COPY otherwise no happy pandas
COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
RUN git clone https://github.com/holdenk/spark-upgrade.git
RUN chown -R dev /high-performance-spark-examples
ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
RUN chown -R dev /high-performance-spark-examples
RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro
USER dev
RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
RUN sbt clean compile
CMD ["jupyter-lab", "--ip", "0.0.0.0", "--port", "8877"]

69 changes: 69 additions & 0 deletions Dockerfile-mini
Original file line number Diff line number Diff line change
@@ -0,0 +1,69 @@
# Open JDK11, Spark 3.X and the latest JDKs get a little spicy
FROM azul/zulu-openjdk:11-latest

# OS tooling, editors, Python and sbt, all in one layer so the apt lists that
# are removed at the end never get baked into the image.
#  * Each package is listed exactly once, one per line and sorted, so diffs stay
#    readable (the two install passes are kept so locale-gen still runs right
#    after `locales` is installed).
#  * The sbt signing key is downloaded to a file with `curl -f` instead of being
#    piped into gpg: with a pipe the exit status of curl is discarded, so a
#    failed download would only surface later as an apt signature error.
RUN apt-get -qq update && \
    apt-get -qq -y upgrade && \
    apt-get -qq -y install \
      apt-transport-https \
      curl \
      gnupg \
      htop \
      locales \
      net-tools \
      psmisc \
      python-is-python3 \
      software-properties-common \
      sudo \
      tzdata && \
    locale-gen en_US.UTF-8 && \
    apt-get -qq -y install \
      axel \
      emacs \
      git-core \
      nano \
      python3 \
      python3-pip \
      vim \
      wget && \
    echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
    curl -fsSL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" -o /tmp/scalasbt-release.asc && \
    gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import /tmp/scalasbt-release.asc && \
    rm /tmp/scalasbt-release.asc && \
    chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
    apt-get update && \
    apt-get -qq -y install sbt && \
    rm -rf /var/lib/apt/lists/*

# Fetch the coursier launcher, mark it executable and run it once, all in a
# single layer (a chmod in a later layer would store the launcher a second time).
# `-f` makes curl fail on an HTTP error instead of saving the error page as the
# launcher.
# Running --help ensures the JAR of the CLI is in the coursier cache, in the image.
RUN curl -fLo coursier https://git.io/coursier-cli && \
    chmod +x coursier && \
    ./coursier --help
RUN pip install --no-cache-dir jupyter
# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
# Install Scala 2.13.8 from the upstream .deb; axel is tried first and wget is
# the fallback. wget writes to an explicit -O target so that a partial file left
# behind by a failed axel run is overwritten rather than side-stepped with a
# ".1" suffix. The package (and any axel state file) is removed in the same layer.
RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || \
     wget -O scala-2.13.8.deb https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && \
    dpkg --install scala-2.13.8.deb && \
    rm -f scala-2.13.8.deb scala-2.13.8.deb.st

# Use coursier to bootstrap a launcher named `almond` (written via -o) for the
# sh.almond Scala 2.13.8 kernel, 0.14.0-RC4, then run that launcher with
# --install to register a Jupyter kernel with id "scala2.13" and display name
# "Scala 2.13". This step runs before any USER instruction in this file.
RUN ./coursier bootstrap \
      -r jitpack \
      -i user \
      -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
      sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
      --default=true \
      --sources \
      -o almond \
 && ./almond \
      --install \
      --log info \
      --metabrowse \
      --id scala2.13 \
      --display-name "Scala 2.13"


# Create the `dev` account, add it to the sudo group, give it a copy of the
# coursier launcher and seed its .bashrc. Everything happens in one layer so the
# files copied into the home directory are not stored again by a later chown.
# NOTE(review): the account gets the fixed password "dev" plus sudo membership;
# presumably intentional for a throwaway training container, confirm before
# reusing this image anywhere exposed.
# SPARK_HOME is hard-coded and has to track the Spark version that env_setup.sh
# unpacks (spark-<version>-bin-hadoop3 under /high-performance-spark-examples).
RUN adduser dev && \
    adduser dev sudo && \
    echo 'dev:dev' | chpasswd && \
    mkdir -p ~dev && \
    cp ./coursier ~dev/ && \
    echo "color_prompt=yes" >> ~dev/.bashrc && \
    echo "export force_color_prompt=yes" >> ~dev/.bashrc && \
    echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc && \
    chown -R dev ~dev
USER dev
# Kernels are installed in user so we need to run as the user
RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
# Back to root for the remaining system-level setup steps.
USER root

# Create the project tree (including the warehouse directory) and hand it to
# dev in the same layer.
RUN mkdir -p /high-performance-spark-examples/warehouse && \
    chown -R dev /high-performance-spark-examples
WORKDIR /high-performance-spark-examples
# Increase the chance of caching by copying just the env setup file first.
COPY --chown=dev:dev env_setup.sh ./
# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
# Also downloads some test data
# The tarball removal and the ownership change live in the same RUN as the
# download: a recursive chown in a later layer would re-touch every file written
# here and can store the whole Spark install a second time.
RUN SCALA_VERSION=2.13 ./env_setup.sh && \
    rm ./*.tgz && \
    chown -R dev /high-performance-spark-examples
# Keep the generated kernel spec as a backup, then swap in the project's own.
RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
# Note: We need to use /home in the COPY otherwise no happy pandas
COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
# ADD (not COPY) is deliberate: it unpacks the local tar archive in place, and
# --chown makes the unpacked files belong to dev.
ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
# The clone runs as root, so fix ownership of just the cloned tree in the same layer.
RUN git clone https://github.com/holdenk/spark-upgrade.git && \
    chown -R dev /high-performance-spark-examples/spark-upgrade
USER dev
# Pre-seed shell history with the jupyter-lab launch command.
RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
CMD ["/high-performance-spark-examples/misc/container_launch.sh"]

7 changes: 5 additions & 2 deletions build_container.sh
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,9 @@ else
git archive -o myapp.tar --format=tar HEAD
echo "$hash" > oldhash
fi
IMAGE=holdenk/hps:0.1
docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push
VERSION=${VERSION:-0.4}
IMAGE=${IMAGE:-holdenk/hps:$VERSION}
MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION}
docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini . --push
docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" . --push --build-arg base="${MINI_IMAGE}"
#docker buildx build --platform=linux/amd64 -t "${IMAGE}" . --push
8 changes: 7 additions & 1 deletion env_setup.sh
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@ set -ex

# Download Spark and iceberg if not present
SPARK_MAJOR=${SPARK_MAJOR:-"3.5"}
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"}
SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.2"}
SCALA_VERSION=${SCALA_VERSION:-"2.13"}
HADOOP_VERSION="3"
SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
Expand Down Expand Up @@ -55,4 +55,10 @@ mkdir -p ./data/fetched/
if [ ! -f ./data/fetched/2021 ]; then
wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021
fi
if [ ! -f ./data/fetched/2022 ]; then
wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022
fi
if [ ! -f ./data/fetched/2023 ]; then
wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023
fi

Loading

0 comments on commit e2fecda

Please sign in to comment.