Skip to content

Commit

Permalink
Increase Nanny close timeout in LocalCUDACluster tests (#5636)
Browse files Browse the repository at this point in the history
Tests in CI may occasionally fail, possibly only under high load, because of the `Nanny` close timeout: the internal mechanism that sets the timeout for killing processes may leave too little time for a process to shut down properly.

Dask-CUDA introduced a new `IncreasedCloseTimeoutNanny` class, intended to be used with `LocalCUDACluster` in tests, to reduce the chance that such timeouts occur. This new class is now used in the tests to improve the situation in CI.

Authors:
  - Peter Andreas Entschev (https://github.com/pentschev)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Simon Adorf (https://github.com/csadorf)

URL: #5636
  • Loading branch information
pentschev authored Oct 30, 2023
1 parent 79aa490 commit 04348ed
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 2 deletions.
8 changes: 7 additions & 1 deletion python/cuml/benchmark/automated/dask/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@

from dask_cuda import initialize
from dask_cuda import LocalCUDACluster
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
from dask.distributed import Client

enable_tcp_over_ucx = True
Expand All @@ -28,7 +29,11 @@
@pytest.fixture(scope="module")
def cluster():

cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
cluster = LocalCUDACluster(
protocol="tcp",
scheduler_port=0,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()

Expand All @@ -54,6 +59,7 @@ def ucx_cluster():
enable_tcp_over_ucx=enable_tcp_over_ucx,
enable_nvlink=enable_nvlink,
enable_infiniband=enable_infiniband,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()
Expand Down
8 changes: 7 additions & 1 deletion python/cuml/tests/dask/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

from dask_cuda import initialize
from dask_cuda import LocalCUDACluster
from dask_cuda.utils_test import IncreasedCloseTimeoutNanny
from dask.distributed import Client

enable_tcp_over_ucx = True
Expand All @@ -14,7 +15,11 @@
@pytest.fixture(scope="module")
def cluster():

cluster = LocalCUDACluster(protocol="tcp", scheduler_port=0)
cluster = LocalCUDACluster(
protocol="tcp",
scheduler_port=0,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()

Expand All @@ -40,6 +45,7 @@ def ucx_cluster():
enable_tcp_over_ucx=enable_tcp_over_ucx,
enable_nvlink=enable_nvlink,
enable_infiniband=enable_infiniband,
worker_class=IncreasedCloseTimeoutNanny,
)
yield cluster
cluster.close()
Expand Down

0 comments on commit 04348ed

Please sign in to comment.