From 5e84f4258f6501dac1e4653bad110ac618e27492 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 5 Jun 2023 16:00:47 -0700 Subject: [PATCH 1/7] Disable cache compression Disable cache compression to allow large images, like images depending on `tensorflow` or `torch`. For more information, see: https://github.com/GoogleContainerTools/kaniko/issues/1669 --- .../apache_beam/runners/portability/sdk_container_builder.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py index f81e015ea591..cf57c0fa9208 100644 --- a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py +++ b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py @@ -252,7 +252,9 @@ def _invoke_docker_build_and_push(self, container_image_name): build.steps = [] step = cloudbuild.BuildStep() step.name = 'gcr.io/kaniko-project/executor:latest' - step.args = ['--destination=' + container_image_name, '--cache=true'] + # Disable compression caching to allow for large images to be cached. + # See: https://github.com/GoogleContainerTools/kaniko/issues/1669 + step.args = ['--destination=' + container_image_name, '--cache=true', '--compressed-caching=false'] step.dir = SOURCE_FOLDER build.steps.append(step) From d7b24264366a2d7459a93fd358993c93d27b5cfa Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 5 Jun 2023 16:04:32 -0700 Subject: [PATCH 2/7] Update CHANGES.md --- CHANGES.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGES.md b/CHANGES.md index 6ab0852f2b21..b7182d618a19 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -105,6 +105,7 @@ * Dead letter queue support added to RunInference in Python ([#24209](https://github.com/apache/beam/issues/24209)). * Support added for defining pre/postprocessing operations on the RunInference transform ([#26308](https://github.com/apache/beam/issues/26308)) * Adds a Docker Compose based transform service that can be used to discover and use portable Beam transforms ([#26023](https://github.com/apache/beam/pull/26023)). +* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch` ([#27023](https://github.com/apache/beam/pull/27023)) ## Breaking Changes From c236b49294d7682a3a68a5042fd4116364ab1dff Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Mon, 5 Jun 2023 16:15:04 -0700 Subject: [PATCH 3/7] Fix line too long linter issue --- .../runners/portability/sdk_container_builder.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py index cf57c0fa9208..19becd3e123f 100644 --- a/sdks/python/apache_beam/runners/portability/sdk_container_builder.py +++ b/sdks/python/apache_beam/runners/portability/sdk_container_builder.py @@ -254,7 +254,11 @@ def _invoke_docker_build_and_push(self, container_image_name): step.name = 'gcr.io/kaniko-project/executor:latest' # Disable compression caching to allow for large images to be cached. # See: https://github.com/GoogleContainerTools/kaniko/issues/1669 - step.args = ['--destination=' + container_image_name, '--cache=true', '--compressed-caching=false'] + step.args = [ + '--destination=' + container_image_name, + '--cache=true', + '--compressed-caching=false', + ] step.dir = SOURCE_FOLDER build.steps.append(step) From 8e84ece03cd3709d03e4a34095e2db1100cf42fa Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 6 Jun 2023 08:09:52 -0700 Subject: [PATCH 4/7] Update CHANGES.md --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index b7182d618a19..5c7733950aeb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -62,6 +62,7 @@ ## New Features / Improvements +* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch` ([#27023](https://github.com/apache/beam/pull/27023)) * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Breaking Changes @@ -105,7 +106,6 @@ * Dead letter queue support added to RunInference in Python ([#24209](https://github.com/apache/beam/issues/24209)). * Support added for defining pre/postprocessing operations on the RunInference transform ([#26308](https://github.com/apache/beam/issues/26308)) * Adds a Docker Compose based transform service that can be used to discover and use portable Beam transforms ([#26023](https://github.com/apache/beam/pull/26023)). -* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch` ([#27023](https://github.com/apache/beam/pull/27023)) ## Breaking Changes From eb357fe7090fcbc2c5c326ad66b4d1b822bcd970 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 6 Jun 2023 08:27:38 -0700 Subject: [PATCH 5/7] Disable pip cache directory --- sdks/python/container/piputil.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/sdks/python/container/piputil.go b/sdks/python/container/piputil.go index 03ac8325d6d0..a00e017445e3 100644 --- a/sdks/python/container/piputil.go +++ b/sdks/python/container/piputil.go @@ -37,14 +37,14 @@ func pipInstallRequirements(files []string, dir, name string) error { // as possible PyPI downloads. In the first round the --find-links // option will make sure that only things staged in the worker will be // used without following their dependencies. - args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} + args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} if err := execx.Execute("python", args...); err != nil { fmt.Println("Some packages could not be installed solely from the requirements cache. Installing packages from PyPI.") } // The second install round opens up the search for packages on PyPI and // also installs dependencies. The key is that if all the packages have // been installed in the first round then this command will be a no-op. - args = []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--disable-pip-version-check", "--find-links", dir} + args = []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--find-links", dir} return execx.Execute("python", args...) } } @@ -76,18 +76,18 @@ func pipInstallPackage(files []string, dir, name string, force, optional bool, e // installed version will match the package specified, the package itself // will not be reinstalled, but its dependencies will now be resolved and // installed if necessary. This achieves our goal outlined above. - args := []string{"-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "--force-reinstall", "--no-deps", + args := []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", "--upgrade", "--force-reinstall", "--no-deps", filepath.Join(dir, packageSpec)} err := execx.Execute("python", args...) if err != nil { return err } - args = []string{"-m", "pip", "install", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} + args = []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} return execx.Execute("python", args...) } // Case when we do not perform a forced reinstall. - args := []string{"-m", "pip", "install", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} + args := []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} return execx.Execute("python", args...) } } From 3170c25fe43dc41272c3c5fdc9f64581101afc83 Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 6 Jun 2023 08:29:07 -0700 Subject: [PATCH 6/7] Update CHANGES.md --- CHANGES.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGES.md b/CHANGES.md index 5c7733950aeb..ed2ea71e7ea0 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -62,7 +62,7 @@ ## New Features / Improvements -* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch` ([#27023](https://github.com/apache/beam/pull/27023)) +* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch`, and reduce prebuilt Python container size ([#27023](https://github.com/apache/beam/pull/27023)) * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Breaking Changes From 5fe72f1b771081e5b3a7bcbf4b1b9097b538f92b Mon Sep 17 00:00:00 2001 From: David Cavazos Date: Tue, 6 Jun 2023 11:44:16 -0700 Subject: [PATCH 7/7] revert reducing image size --- CHANGES.md | 2 +- sdks/python/container/Dockerfile | 69 +++++++++++++++----------------- sdks/python/container/piputil.go | 10 ++--- 3 files changed, 39 insertions(+), 42 deletions(-) diff --git a/CHANGES.md b/CHANGES.md index ed2ea71e7ea0..5c7733950aeb 100644 --- a/CHANGES.md +++ b/CHANGES.md @@ -62,7 +62,7 @@ ## New Features / Improvements -* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch`, and reduce prebuilt Python container size ([#27023](https://github.com/apache/beam/pull/27023)) +* Allow prebuilding large images when using `--prebuild_sdk_container_engine=cloud_build`, like images depending on `tensorflow` or `torch` ([#27023](https://github.com/apache/beam/pull/27023)) * X feature added (Java/Python) ([#X](https://github.com/apache/beam/issues/X)). ## Breaking Changes diff --git a/sdks/python/container/Dockerfile b/sdks/python/container/Dockerfile index 6d64fff98833..83340643cf48 100644 --- a/sdks/python/container/Dockerfile +++ b/sdks/python/container/Dockerfile @@ -22,8 +22,17 @@ LABEL Author "Apache Beam " ARG TARGETOS ARG TARGETARCH -# Install native bindings required for dependencies. -RUN apt-get update && \ +COPY target/base_image_requirements.txt /tmp/base_image_requirements.txt +COPY target/apache-beam.tar.gz /opt/apache/beam/tars/ +COPY target/launcher/${TARGETOS}_${TARGETARCH}/boot target/LICENSE target/NOTICE target/LICENSE.python /opt/apache/beam/ + +ENV CLOUDSDK_CORE_DISABLE_PROMPTS yes +ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin + +# Use one RUN command to reduce the number of layers. +RUN \ + # Install native bindings required for dependencies. + apt-get update && \ apt-get install -y \ # Required by python-snappy libsnappy-dev \ @@ -32,56 +41,44 @@ RUN apt-get update && \ # This is used to speed up the re-installation of the sdk. ccache \ && \ - rm -rf /var/lib/apt/lists/* + rm -rf /var/lib/apt/lists/* && \ -#### -# Install required packages for Beam Python SDK and common dependencies used by users. -#### + pip install --upgrade setuptools && \ -COPY target/base_image_requirements.txt /tmp/base_image_requirements.txt -RUN \ + # Install required packages for Beam Python SDK and common dependencies used by users. # use --no-deps to ensure the list includes all transitive dependencies. pip install --no-deps -r /tmp/base_image_requirements.txt && \ + rm -rf /tmp/base_image_requirements.txt && \ python -c "import nltk; nltk.download('stopwords')" && \ rm /root/nltk_data/corpora/stopwords.zip && \ + # Check that the protobuf upb(also called micro protobuf) is used. python -c "from google.protobuf.internal import api_implementation; assert api_implementation._implementation_type == 'upb'; print ('Verified fast protobuf used.')" && \ - # Remove pip cache. - rm -rf /root/.cache/pip && \ - rm -rf /tmp/base_image_requirements.txt - -RUN pip install --upgrade pip setuptools -# Install Google Cloud SDK. -ENV CLOUDSDK_CORE_DISABLE_PROMPTS yes -ENV PATH $PATH:/usr/local/gcloud/google-cloud-sdk/bin -RUN mkdir -p /usr/local/gcloud && \ + # Install Google Cloud SDK. + mkdir -p /usr/local/gcloud && \ cd /usr/local/gcloud && \ curl -s -O https://dl.google.com/dl/cloudsdk/release/google-cloud-sdk.tar.gz && \ tar -xf google-cloud-sdk.tar.gz && \ /usr/local/gcloud/google-cloud-sdk/install.sh && \ rm -rf /usr/local/gcloud/google-cloud-sdk/.install/.backup && \ - rm google-cloud-sdk.tar.gz + rm google-cloud-sdk.tar.gz && \ -# Configure ccache prior to installing Beam SDK. -RUN ln -s /usr/bin/ccache /usr/local/bin/gcc -# These parameters are needed as pip compiles artifacts in random temporary directories. -RUN ccache --set-config=sloppiness=file_macro && ccache --set-config=hash_dir=false + # Configure ccache prior to installing Beam SDK. This speeds up wheels compilation when installing the SDK from sources. + ln -s /usr/bin/ccache /usr/local/bin/gcc && \ + # These parameters are needed as pip compiles artifacts in random temporary directories. + ccache --set-config=sloppiness=file_macro && ccache --set-config=hash_dir=false && \ -#### -# Install Apache Beam SDK. Use --no-deps and pip check to verify that all -# necessary dependencies are specified in base_image_requirements.txt. -#### -COPY target/apache-beam.tar.gz /opt/apache/beam/tars/ -RUN pip install --no-deps -v /opt/apache/beam/tars/apache-beam.tar.gz[gcp] -RUN pip check || (echo "Container does not include required Beam dependencies or has conflicting dependencies. If Beam dependencies have changed, you need to regenerate base_image_requirements.txt files. See: https://s.apache.org/beam-python-requirements-generate" && exit 1) -# Log complete list of what exact packages and versions are installed. -RUN pip freeze --all - -COPY target/LICENSE /opt/apache/beam/ -COPY target/LICENSE.python /opt/apache/beam/ -COPY target/NOTICE /opt/apache/beam/ -COPY target/launcher/${TARGETOS}_${TARGETARCH}/boot /opt/apache/beam/ + # Install Apache Beam SDK. Use --no-deps and pip check to verify that all + # necessary dependencies are specified in base_image_requirements.txt. + pip install --no-deps -v /opt/apache/beam/tars/apache-beam.tar.gz[gcp] && \ + pip check || (echo "Container does not include required Beam dependencies or has conflicting dependencies. If Beam dependencies have changed, you need to regenerate base_image_requirements.txt files. See: https://s.apache.org/beam-python-requirements-generate" && exit 1) && \ + + # Log complete list of what exact packages and versions are installed. + pip freeze --all && \ + + # Remove pip cache. + rm -rf /root/.cache/pip ENTRYPOINT ["/opt/apache/beam/boot"] diff --git a/sdks/python/container/piputil.go b/sdks/python/container/piputil.go index a00e017445e3..03ac8325d6d0 100644 --- a/sdks/python/container/piputil.go +++ b/sdks/python/container/piputil.go @@ -37,14 +37,14 @@ func pipInstallRequirements(files []string, dir, name string) error { // as possible PyPI downloads. In the first round the --find-links // option will make sure that only things staged in the worker will be // used without following their dependencies. - args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} + args := []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--disable-pip-version-check", "--no-index", "--no-deps", "--find-links", dir} if err := execx.Execute("python", args...); err != nil { fmt.Println("Some packages could not be installed solely from the requirements cache. Installing packages from PyPI.") } // The second install round opens up the search for packages on PyPI and // also installs dependencies. The key is that if all the packages have // been installed in the first round then this command will be a no-op. - args = []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--no-cache-dir", "--disable-pip-version-check", "--find-links", dir} + args = []string{"-m", "pip", "install", "-r", filepath.Join(dir, name), "--disable-pip-version-check", "--find-links", dir} return execx.Execute("python", args...) } } @@ -76,18 +76,18 @@ func pipInstallPackage(files []string, dir, name string, force, optional bool, e // installed version will match the package specified, the package itself // will not be reinstalled, but its dependencies will now be resolved and // installed if necessary. This achieves our goal outlined above. - args := []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", "--upgrade", "--force-reinstall", "--no-deps", + args := []string{"-m", "pip", "install", "--disable-pip-version-check", "--upgrade", "--force-reinstall", "--no-deps", filepath.Join(dir, packageSpec)} err := execx.Execute("python", args...) if err != nil { return err } - args = []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} + args = []string{"-m", "pip", "install", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} return execx.Execute("python", args...) } // Case when we do not perform a forced reinstall. - args := []string{"-m", "pip", "install", "--no-cache-dir", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} + args := []string{"-m", "pip", "install", "--disable-pip-version-check", filepath.Join(dir, packageSpec)} return execx.Execute("python", args...) } }