Add solution (high-performance-spark#133)

* Start working on sol nb * Finish update to Spark 3.5.2, download 2022 & 2023 data, install Python-is-Python3 so we can run the example. * Add solution * Ignore incubator-gluten * Update workshop NB * Install pyarrow & pyiceberg for folks who want to poke around at the parquet files. * Update solutions * More update * Forward the Spark UI for ze query plans. * Update solution * Update solutions * Re-enable cross build, use launch script * Make exec * Lets make a slimmed down container for folks who need it. * Fix spark home slim down mini some more eh wait we don't need a root user install of scala. oops we do need it * Tag mini image seperately * Stack them * Avoid pip cache dir * Don't keep the spark tarball in the image * Seperate out build from run * shell check fixes
holdenk · Aug 15, 2024 · e2fecda · e2fecda
1 parent b3db591
commit e2fecda
Show file tree

Hide file tree

Showing 10 changed files with 1,231 additions and 74 deletions.
diff --git a/.gitignore b/.gitignore
@@ -103,4 +103,9 @@ incubator-glutten/*
 project/build.sbt
 coursier
 # Magic file we use for build tracking
-oldhash
+oldhash
+# ignore ipynb checkpoints
+.ipynb_checkpoints/
+
+# ignore accel
+incubator-gluten/
diff --git a/Dockerfile b/Dockerfile
@@ -1,70 +1,7 @@
-# Open JDK11, Spark 3.X and the latest JDKs get a little spicy
-FROM azul/zulu-openjdk:11-latest
+ARG base
+FROM $base
 
-RUN apt-get -qq update && \
-    apt-get -qq -y upgrade && \
-    apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop && \
-    locale-gen en_US.UTF-8 && \
-    apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \
-    echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
-    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
-    curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
-    chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
-    apt-get update && \
-    apt-get -qq -y install sbt && \
-    rm -rf /var/lib/apt/lists/*
-
-RUN curl -Lo coursier https://git.io/coursier-cli
-RUN chmod +x coursier
-# ensure the JAR of the CLI is in the coursier cache, in the image
-RUN ./coursier --help
-RUN pip install jupyter
-RUN ./coursier bootstrap \
-      -r jitpack \
-      -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
-      sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
-      --default=true --sources \
-      -o almond && \
-    ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
-RUN chmod a+xr almond coursier
-RUN ./coursier launch almond --scala 2.13.8 -- --install
-# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
-#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
-RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb
-
-RUN adduser dev
-RUN adduser dev sudo
-RUN echo 'dev:dev' | chpasswd 
-RUN mkdir -p ~dev
-RUN cp ./coursier ~dev/
-RUN echo "color_prompt=yes" >> ~dev/.bashrc
-RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc
-RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.1-bin-hadoop3" >> ~dev/.bashrc
-RUN chown -R dev ~dev
-USER dev
-# Kernels are installed in user so we need to run as the user
-RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
-RUN ./coursier launch almond --scala 2.13.8 -- --install
 USER root
-
-RUN mkdir /high-performance-spark-examples
-RUN chown -R dev /high-performance-spark-examples
-WORKDIR /high-performance-spark-examples
-# Increase the chance of caching by copying just the env setup file first.
-COPY --chown=dev:dev env_setup.sh ./
-# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
-# Also downloads some test data
-RUN SCALA_VERSION=2.13 ./env_setup.sh
-RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
-# Note: We need to use /home in the COPY otherwise no happy pandas
-COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
-RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
-RUN git clone https://github.com/holdenk/spark-upgrade.git
-RUN chown -R dev /high-performance-spark-examples
-ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
-RUN chown -R dev /high-performance-spark-examples
+RUN pip install --no-cache-dir pyarrow pyiceberg[pandas,snappy,daft,s3fs] avro
 USER dev
-RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
 RUN sbt clean compile
-CMD ["jupyter-lab", "--ip", "0.0.0.0", "--port", "8877"]
-
diff --git a/Dockerfile-mini b/Dockerfile-mini
@@ -0,0 +1,69 @@
+# Open JDK11, Spark 3.X and the latest JDKs get a little spicy
+FROM azul/zulu-openjdk:11-latest
+
+RUN apt-get -qq update && \
+    apt-get -qq -y upgrade && \
+    apt-get -qq -y install gnupg software-properties-common locales curl tzdata apt-transport-https curl gnupg sudo net-tools psmisc htop python-is-python3 && \
+    locale-gen en_US.UTF-8 && \
+    apt-get -qq -y install gnupg software-properties-common curl git-core wget axel python3 python3-pip nano emacs vim && \
+    echo "deb https://repo.scala-sbt.org/scalasbt/debian all main" | tee /etc/apt/sources.list.d/sbt.list && \
+    echo "deb https://repo.scala-sbt.org/scalasbt/debian /" | tee /etc/apt/sources.list.d/sbt_old.list && \
+    curl -sL "https://keyserver.ubuntu.com/pks/lookup?op=get&search=0x2EE0EA64E40A89B84B2DF73499E82A75642AC823" | gpg --no-default-keyring --keyring gnupg-ring:/etc/apt/trusted.gpg.d/scalasbt-release.gpg --import && \
+    chmod 644 /etc/apt/trusted.gpg.d/scalasbt-release.gpg && \
+    apt-get update && \
+    apt-get -qq -y install sbt && \
+    rm -rf /var/lib/apt/lists/*
+
+RUN curl -Lo coursier https://git.io/coursier-cli
+RUN chmod +x coursier
+# ensure the JAR of the CLI is in the coursier cache, in the image
+RUN ./coursier --help
+RUN pip install --no-cache-dir  jupyter
+# Fun story: this does not work (Aug 8 2024) because it tries to download Scala 2 from Scala 3
+#RUN ./coursier install scala:2.13.8 && ./coursier install scalac:2.13.8
+RUN (axel --quiet https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb || wget https://downloads.lightbend.com/scala/2.13.8/scala-2.13.8.deb) && dpkg --install scala-2.13.8.deb && rm scala-2.13.8.deb
+
+RUN ./coursier bootstrap \
+      -r jitpack \
+      -i user -I user:sh.almond:scala-kernel-api_2.13.8:0.14.0-RC4 \
+      sh.almond:scala-kernel_2.13.8:0.14.0-RC4 \
+      --default=true --sources \
+      -o almond && \
+    ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
+
+
+RUN adduser dev
+RUN adduser dev sudo
+RUN echo 'dev:dev' | chpasswd 
+RUN mkdir -p ~dev
+RUN cp ./coursier ~dev/
+RUN echo "color_prompt=yes" >> ~dev/.bashrc
+RUN echo "export force_color_prompt=yes" >> ~dev/.bashrc
+RUN echo "export SPARK_HOME=/high-performance-spark-examples/spark-3.5.2-bin-hadoop3" >> ~dev/.bashrc
+RUN chown -R dev ~dev
+USER dev
+# Kernels are installed in user so we need to run as the user
+RUN ./almond --install --log info --metabrowse --id scala2.13 --display-name "Scala 2.13"
+USER root
+
+RUN mkdir -p /high-performance-spark-examples
+RUN mkdir -p /high-performance-spark-examples/warehouse
+RUN chown -R dev /high-performance-spark-examples
+WORKDIR /high-performance-spark-examples
+# Increase the chance of caching by copying just the env setup file first.
+COPY --chown=dev:dev env_setup.sh ./
+# Downloads and installs Spark ~3.5 & Iceberg 1.4 and slipstreams the JAR in-place
+# Also downloads some test data
+RUN SCALA_VERSION=2.13 ./env_setup.sh && rm *.tgz
+RUN mv ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json_back
+# Note: We need to use /home in the COPY otherwise no happy pandas
+COPY --chown=dev:dev misc/kernel.json /home/dev/kernel.json_new
+RUN mv ~dev/kernel.json_new ~dev/.local/share/jupyter/kernels/scala2.13/kernel.json
+RUN chown -R dev /high-performance-spark-examples
+ADD --chown=dev:dev myapp.tar /high-performance-spark-examples/
+RUN git clone https://github.com/holdenk/spark-upgrade.git
+RUN chown -R dev /high-performance-spark-examples
+USER dev
+RUN echo "jupyter-lab --ip 0.0.0.0 --port 8877" >> ~/.bash_history
+CMD ["/high-performance-spark-examples/misc/container_launch.sh"]
+
diff --git a/build_container.sh b/build_container.sh
@@ -15,6 +15,9 @@ else
   git archive -o myapp.tar --format=tar HEAD
   echo "$hash" > oldhash
 fi
-IMAGE=holdenk/hps:0.1
-docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" .  --push
+VERSION=${VERSION:-0.4}
+IMAGE=${IMAGE:-holdenk/hps:$VERSION}
+MINI_IMAGE=${MINI_IMAGE:-holdenk/hps-mini:$VERSION}
+docker buildx build --platform=linux/amd64,linux/arm64 -t "${MINI_IMAGE}" -f Dockerfile-mini .  --push
+docker buildx build --platform=linux/amd64,linux/arm64 -t "${IMAGE}" .  --push --build-arg base="${MINI_IMAGE}"
 #docker buildx build --platform=linux/amd64 -t "${IMAGE}" .  --push
diff --git a/env_setup.sh b/env_setup.sh
@@ -4,7 +4,7 @@ set -ex
 
 # Download Spark and iceberg if not present
 SPARK_MAJOR=${SPARK_MAJOR:-"3.5"}
-SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.1"}
+SPARK_VERSION=${SPARK_VERSION:-"${SPARK_MAJOR}.2"}
 SCALA_VERSION=${SCALA_VERSION:-"2.13"}
 HADOOP_VERSION="3"
 SPARK_PATH="$(pwd)/spark-${SPARK_VERSION}-bin-hadoop${HADOOP_VERSION}"
@@ -55,4 +55,10 @@ mkdir -p ./data/fetched/
 if [ ! -f ./data/fetched/2021 ]; then
   wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2021" -O ./data/fetched/2021
 fi
+if [ ! -f ./data/fetched/2022 ]; then
+  wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2022" -O ./data/fetched/2022
+fi
+if [ ! -f ./data/fetched/2023 ]; then
+  wget "https://gender-pay-gap.service.gov.uk/viewing/download-data/2023" -O ./data/fetched/2023
+fi