From 88c8d1ae59e48f49adec9c94dcb9e617fee70b13 Mon Sep 17 00:00:00 2001 From: Philip Hyunsu Cho Date: Sat, 8 Feb 2025 01:54:27 -0800 Subject: [PATCH] [CI] Unpin Dask, adopt CUDA 12.8 and RAPIDS 24.12 (#11194) --- doc/contrib/ci.rst | 26 ++++++++++++------- ops/pipeline/build-cpu-arm64.sh | 3 ++- ops/pipeline/build-cpu.sh | 3 ++- ops/pipeline/build-cuda-impl.sh | 2 +- ops/pipeline/build-cuda.sh | 12 +++------ ops/pipeline/build-gpu-rpkg.sh | 3 ++- ops/pipeline/build-jvm-doc.sh | 3 ++- ops/pipeline/build-jvm-gpu.sh | 8 +++--- ops/pipeline/build-jvm-manylinux2014.sh | 3 ++- ops/pipeline/build-manylinux2014.sh | 3 ++- ops/pipeline/build-test-jvm-packages.sh | 3 ++- ops/pipeline/deploy-jvm-packages.sh | 3 ++- ops/pipeline/get-image-tag.sh | 4 +++ ops/pipeline/run-clang-tidy.sh | 3 ++- ops/pipeline/test-cpp-gpu.sh | 7 +++-- ops/pipeline/test-cpp-i386.sh | 3 ++- ops/pipeline/test-jvm-gpu.sh | 3 ++- ops/pipeline/test-python-wheel.sh | 4 ++- .../test_gpu_with_dask/test_gpu_with_dask.py | 12 ++++++++- 19 files changed, 69 insertions(+), 39 deletions(-) create mode 100755 ops/pipeline/get-image-tag.sh diff --git a/doc/contrib/ci.rst b/doc/contrib/ci.rst index 3b3542dcd9cc..7721ae083381 100644 --- a/doc/contrib/ci.rst +++ b/doc/contrib/ci.rst @@ -43,18 +43,26 @@ To make changes to the CI container, carry out the following steps: Consult :ref:`build_run_docker_locally` for this step. 4. Submit a pull request to `dmlc/xgboost-devops `_ with the proposed changes to the Dockerfile. Make note of the pull request number. Example: ``#204`` -5. Clone `dmlc/xgboost `_ and update all references to the - old container to point to the new container. More specifically, all container image URIs of form - ``492475357299.dkr.ecr.us-west-2.amazonaws.com/[image_repo]:main`` should have its image tag - (last component) replaced with ``PR-#``, where ``#`` is the pull request number. 
- For the example above, - we'd replace ``492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:main`` with - ``492475357299.dkr.ecr.us-west-2.amazonaws.com/xgb-ci.gpu:PR-204``. +5. Clone `dmlc/xgboost `_. Locate the file + ``ops/pipeline/get-image-tag.sh``, which should have a single line + + .. code-block:: bash + + IMAGE_TAG=main + + To use the new container, revise the file as follows: + + .. code-block:: bash + + IMAGE_TAG=PR-XX + + where ``XX`` is the pull request number. + 6. Now submit a pull request to `dmlc/xgboost `_. The CI will run tests using the new container. Verify that all tests pass. 7. Merge the pull request in ``dmlc/xgboost-devops``. Wait until the CI completes on the ``main`` branch. -8. Go back to the the pull request for ``dmlc/xgboost`` and revise all the container references to use - the old tag ``:main``. +8. Go back to the pull request for ``dmlc/xgboost`` and change ``ops/pipeline/get-image-tag.sh`` + back to ``IMAGE_TAG=main``. 9. Merge the pull request in ``dmlc/xgboost``. .. 
_build_run_docker_locally: diff --git a/ops/pipeline/build-cpu-arm64.sh b/ops/pipeline/build-cpu-arm64.sh index e3e0cebb22c8..7630996cebf0 100755 --- a/ops/pipeline/build-cpu-arm64.sh +++ b/ops/pipeline/build-cpu-arm64.sh @@ -11,9 +11,10 @@ fi source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh WHEEL_TAG=manylinux_2_28_aarch64 -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.aarch64:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.aarch64:${IMAGE_TAG} echo "--- Build CPU code targeting ARM64" set -x diff --git a/ops/pipeline/build-cpu.sh b/ops/pipeline/build-cpu.sh index 2443006bd7c2..a46c141b33ed 100755 --- a/ops/pipeline/build-cpu.sh +++ b/ops/pipeline/build-cpu.sh @@ -5,8 +5,9 @@ set -euo pipefail source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.cpu:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.cpu:${IMAGE_TAG} echo "--- Build CPU code" set -x diff --git a/ops/pipeline/build-cuda-impl.sh b/ops/pipeline/build-cuda-impl.sh index 198936852948..75cbaae03afe 100755 --- a/ops/pipeline/build-cuda-impl.sh +++ b/ops/pipeline/build-cuda-impl.sh @@ -16,7 +16,7 @@ then cmake_prefix_path='/opt/grpc;/opt/rmm;/opt/rmm/lib64/rapids/cmake' cmake_args="${cmake_args} -DPLUGIN_RMM=ON" else - cmake_prefix_path='/opt/grpc;/workspace/cccl' + cmake_prefix_path='/opt/grpc' fi # Disable CMAKE_COMPILE_WARNING_AS_ERROR option temporarily until diff --git a/ops/pipeline/build-cuda.sh b/ops/pipeline/build-cuda.sh index a34a0dcc7c28..02b8d7ecd9ea 100755 --- a/ops/pipeline/build-cuda.sh +++ b/ops/pipeline/build-cuda.sh @@ -33,10 +33,11 @@ esac source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh WHEEL_TAG=manylinux_2_28_x86_64 -BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:main" 
-MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:main" +BUILD_IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}" +MANYLINUX_IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.${WHEEL_TAG}:${IMAGE_TAG}" echo "--- Build with CUDA" @@ -47,13 +48,6 @@ else export BUILD_ONLY_SM75=0 fi -if [[ ${USE_RMM} == 0 ]] -then - # Work around https://github.com/NVIDIA/cccl/issues/1956 - # TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ - git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet -fi - set -x python3 ops/docker_run.py \ diff --git a/ops/pipeline/build-gpu-rpkg.sh b/ops/pipeline/build-gpu-rpkg.sh index ced68245d4bf..6ea2424c05b8 100755 --- a/ops/pipeline/build-gpu-rpkg.sh +++ b/ops/pipeline/build-gpu-rpkg.sh @@ -10,8 +10,9 @@ fi source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.gpu_build_r_rockylinux8:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.gpu_build_r_rockylinux8:${IMAGE_TAG} echo "--- Build XGBoost R package with CUDA" set -x diff --git a/ops/pipeline/build-jvm-doc.sh b/ops/pipeline/build-jvm-doc.sh index 3f9fad3bc2d0..fa2a6f410380 100755 --- a/ops/pipeline/build-jvm-doc.sh +++ b/ops/pipeline/build-jvm-doc.sh @@ -18,8 +18,9 @@ then fi source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:${IMAGE_TAG} echo "--- Build JVM packages doc" set -x diff --git a/ops/pipeline/build-jvm-gpu.sh b/ops/pipeline/build-jvm-gpu.sh index 2f71849efb3b..cc766e93c336 100755 --- a/ops/pipeline/build-jvm-gpu.sh +++ b/ops/pipeline/build-jvm-gpu.sh @@ -5,8 +5,9 @@ set -euo pipefail source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:main 
+IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:${IMAGE_TAG} echo "--- Build libxgboost4j.so with CUDA" @@ -20,7 +21,7 @@ fi COMMAND=$( cat <<-EOF cd build-gpu/ && \ -cmake .. -DCMAKE_PREFIX_PATH=/workspace/cccl -GNinja -DUSE_CUDA=ON -DUSE_NCCL=ON \ +cmake .. -GNinja -DUSE_CUDA=ON -DUSE_NCCL=ON \ -DJVM_BINDINGS=ON -DCMAKE_EXPORT_COMPILE_COMMANDS=ON ${arch_flag} && \ ninja EOF @@ -28,9 +29,6 @@ EOF set -x mkdir -p build-gpu/ -# Work around https://github.com/NVIDIA/cccl/issues/1956 -# TODO(hcho3): Remove this once new CUDA version ships with CCCL 2.6.0+ -git clone https://github.com/NVIDIA/cccl.git -b v2.6.1 --quiet --depth 1 python3 ops/docker_run.py \ --image-uri ${IMAGE_URI} \ -- bash -c "${COMMAND}" diff --git a/ops/pipeline/build-jvm-manylinux2014.sh b/ops/pipeline/build-jvm-manylinux2014.sh index 5b3942fb67ef..91333875716f 100755 --- a/ops/pipeline/build-jvm-manylinux2014.sh +++ b/ops/pipeline/build-jvm-manylinux2014.sh @@ -14,8 +14,9 @@ image_repo="xgb-ci.manylinux2014_${arch}" source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:main" +IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}" # Build XGBoost4J binary echo "--- Build libxgboost4j.so (targeting glibc 2.17)" diff --git a/ops/pipeline/build-manylinux2014.sh b/ops/pipeline/build-manylinux2014.sh index cf8ca7a94da3..809e00310f15 100755 --- a/ops/pipeline/build-manylinux2014.sh +++ b/ops/pipeline/build-manylinux2014.sh @@ -18,10 +18,11 @@ arch="$1" source ops/pipeline/classify-git-branch.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh WHEEL_TAG="manylinux2014_${arch}" IMAGE_REPO="xgb-ci.${WHEEL_TAG}" -IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:main" +IMAGE_URI="${DOCKER_REGISTRY_URL}/${IMAGE_REPO}:${IMAGE_TAG}" PYTHON_BIN="/opt/python/cp310-cp310/bin/python" echo "--- Build binary wheel for ${WHEEL_TAG}" diff 
--git a/ops/pipeline/build-test-jvm-packages.sh b/ops/pipeline/build-test-jvm-packages.sh index 5c1f5e8b0d4a..2c855e5d642f 100755 --- a/ops/pipeline/build-test-jvm-packages.sh +++ b/ops/pipeline/build-test-jvm-packages.sh @@ -13,6 +13,7 @@ EOF set -euo pipefail source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh for arg in "SCALA_VERSION" do @@ -23,7 +24,7 @@ do fi done -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm:${IMAGE_TAG} set -x diff --git a/ops/pipeline/deploy-jvm-packages.sh b/ops/pipeline/deploy-jvm-packages.sh index d73b289b165d..a0a1fc62d2f7 100755 --- a/ops/pipeline/deploy-jvm-packages.sh +++ b/ops/pipeline/deploy-jvm-packages.sh @@ -5,6 +5,7 @@ set -euo pipefail source ops/pipeline/enforce-ci.sh source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh if [[ "$#" -lt 3 ]] then @@ -16,7 +17,7 @@ variant="$1" image_repo="$2" scala_version="$3" -IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:main" +IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}" set -x diff --git a/ops/pipeline/get-image-tag.sh b/ops/pipeline/get-image-tag.sh new file mode 100755 index 000000000000..74458229bbce --- /dev/null +++ b/ops/pipeline/get-image-tag.sh @@ -0,0 +1,4 @@ +## Update the following line to test changes to CI images +## See https://xgboost.readthedocs.io/en/latest/contrib/ci.html#making-changes-to-ci-containers + +IMAGE_TAG=main diff --git a/ops/pipeline/run-clang-tidy.sh b/ops/pipeline/run-clang-tidy.sh index c6166e99cb33..5daade9f2fd8 100755 --- a/ops/pipeline/run-clang-tidy.sh +++ b/ops/pipeline/run-clang-tidy.sh @@ -3,8 +3,9 @@ set -euo pipefail source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.clang_tidy:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.clang_tidy:${IMAGE_TAG} echo "--- Run clang-tidy" set -x diff --git a/ops/pipeline/test-cpp-gpu.sh 
b/ops/pipeline/test-cpp-gpu.sh index 4262ebda7268..3f3992828cef 100755 --- a/ops/pipeline/test-cpp-gpu.sh +++ b/ops/pipeline/test-cpp-gpu.sh @@ -10,26 +10,29 @@ fi suite=$1 source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.gpu:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.gpu:${IMAGE_TAG} case "${suite}" in gpu) echo "--- Run Google Tests, using a single GPU" python3 ops/docker_run.py --image-uri ${IMAGE_URI} --use-gpus \ + --run-args='--privileged' \ -- build/testxgboost ;; gpu-rmm) echo "--- Run Google Tests, using a single GPU, RMM enabled" python3 ops/docker_run.py --image-uri ${IMAGE_URI} --use-gpus \ + --run-args='--privileged' \ -- build/testxgboost --use-rmm-pool ;; mgpu) echo "--- Run Google Tests, using multiple GPUs" python3 ops/docker_run.py --image-uri ${IMAGE_URI} --use-gpus \ - --run-args='--shm-size=4g' \ + --run-args='--shm-size=4g --privileged' \ -- build/testxgboost --gtest_filter=*MGPU* ;; diff --git a/ops/pipeline/test-cpp-i386.sh b/ops/pipeline/test-cpp-i386.sh index dd7827ff2de9..82f7fc1f0ec2 100755 --- a/ops/pipeline/test-cpp-i386.sh +++ b/ops/pipeline/test-cpp-i386.sh @@ -4,8 +4,9 @@ set -euo pipefail source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.i386:main" +IMAGE_URI="${DOCKER_REGISTRY_URL}/xgb-ci.i386:${IMAGE_TAG}" set -x python3 ops/docker_run.py \ diff --git a/ops/pipeline/test-jvm-gpu.sh b/ops/pipeline/test-jvm-gpu.sh index c7dfdd05b43f..564e46f0b357 100755 --- a/ops/pipeline/test-jvm-gpu.sh +++ b/ops/pipeline/test-jvm-gpu.sh @@ -24,8 +24,9 @@ do done source ops/pipeline/get-docker-registry-details.sh +source ops/pipeline/get-image-tag.sh -IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:main +IMAGE_URI=${DOCKER_REGISTRY_URL}/xgb-ci.jvm_gpu_build:${IMAGE_TAG} set -x diff --git a/ops/pipeline/test-python-wheel.sh b/ops/pipeline/test-python-wheel.sh index 
5551870e0b77..9ccdc42042d5 100755 --- a/ops/pipeline/test-python-wheel.sh +++ b/ops/pipeline/test-python-wheel.sh @@ -20,7 +20,9 @@ else fi source ops/pipeline/get-docker-registry-details.sh -IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:main" +source ops/pipeline/get-image-tag.sh + +IMAGE_URI="${DOCKER_REGISTRY_URL}/${image_repo}:${IMAGE_TAG}" set -x python3 ops/docker_run.py --image-uri "${IMAGE_URI}" ${gpu_option} \ diff --git a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py index 0aa3958e5868..38732c1853e3 100644 --- a/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py +++ b/tests/test_distributed/test_gpu_with_dask/test_gpu_with_dask.py @@ -10,6 +10,7 @@ import pytest from hypothesis import given, note, settings, strategies from hypothesis._settings import duration +from packaging.version import parse as parse_version import xgboost as xgb from xgboost import testing as tm @@ -44,6 +45,7 @@ try: import cudf import dask.dataframe as dd + from dask import __version__ as dask_version from dask import array as da from dask.distributed import Client from dask_cuda import LocalCUDACluster @@ -51,7 +53,12 @@ from xgboost import dask as dxgb from xgboost.testing.dask import check_init_estimation, check_uneven_nan except ImportError: - pass + dask_version = None + + +dask_version_ge110 = dask_version and parse_version(dask_version) >= parse_version( + "2024.11.0" +) def run_with_dask_dataframe(DMatrixT: Type, client: Client) -> None: @@ -375,6 +382,9 @@ def test_early_stopping(self, local_cuda_client: Client) -> None: dump = booster.get_dump(dump_format="json") assert len(dump) - booster.best_iteration == early_stopping_rounds + 1 + @pytest.mark.xfail( + dask_version_ge110, reason="Test cannot pass with Dask 2024.11.0+" + ) @pytest.mark.skipif(**tm.no_cudf()) @pytest.mark.parametrize("model", ["boosting"]) def test_dask_classifier(self, model: str, local_cuda_client: 
Client) -> None: