From 0984e97a5c799c7db961ffb2d427ee923eccb607 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Wed, 12 Jun 2024 18:44:51 -0500 Subject: [PATCH] [Bugfix][NCCL] Release NCCL thread_local resources in destructor (#17078) Prior to this commit, allocations performed by `ncclCommInitRank` had no corresponding call to `ncclCommDestroy`. While `ncclCommDestroy` does occur in the `CCLThreadLocalContext::Clear` method, there are no calls into this method. On worker processes, the failure to call `ncclCommDestroy` typically had little effect. Any destruction would occur shortly before the process closes, and so resources would be reclaimed by the OS when the process terminates. However, worker0 of a Disco session is a separate thread, rather than a separate process. While this allows it to easily receive data from the controller thread, resources allocated by worker0 are not reclaimed by the OS until the entire process terminates. As a result, the `CCLThreadLocalContext` leaked GPU memory, as the `ncclCommInitRank` call at the start of each `tvm.runtime.disco.ProcessSession` was never de-allocated. The increase in GPU memory usage was about 1 gigabyte for each `ProcessSession`. This commit updates `CCLThreadLocalContext` to have a destructor that calls the `Clear` method. For worker0, this is called when the thread is joined to the main thread. --- src/runtime/disco/nccl/nccl.cc | 12 ++++++++++++ src/runtime/disco/nccl/nccl_context.h | 15 +++++++++++---- 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/src/runtime/disco/nccl/nccl.cc b/src/runtime/disco/nccl/nccl.cc index 7b943cf83f1f..bba42ed3bdfe 100644 --- a/src/runtime/disco/nccl/nccl.cc +++ b/src/runtime/disco/nccl/nccl.cc @@ -67,9 +67,21 @@ void InitCCLPerWorker(IntTuple device_ids, std::string unique_id_bytes) { CCLThreadLocalContext* ctx = CCLThreadLocalContext::Get(); DiscoWorker* worker = DiscoWorker::ThreadLocal(); ICHECK(worker != nullptr); + CHECK_EQ(unique_id_bytes.size(), NCCL_UNIQUE_ID_BYTES) << "ValueError: The length of unique_id must be " << NCCL_UNIQUE_ID_BYTES << ", but got " << unique_id_bytes.size() << "."; + + CHECK(!ctx->comm) << "Cannot initialize CCL, " + << "the previous thread-global comm still exists, " + << "and has not been destructed"; + CHECK(!ctx->default_stream) << "Cannot initialize CCL, " + << "the previous thread-global stream still exists, " + << "and has not been destructed"; + CHECK(!ctx->worker) << "Cannot initialize CCL, " + << "the previous thread-global worker still exists, " + << "and has not been destructed"; + // Step up local context of NCCL int device_id = device_ids[worker->worker_id]; SetDevice(device_id); diff --git a/src/runtime/disco/nccl/nccl_context.h b/src/runtime/disco/nccl/nccl_context.h index 9d1b8b933a83..3fb281f2cb7c 100644 --- a/src/runtime/disco/nccl/nccl_context.h +++ b/src/runtime/disco/nccl/nccl_context.h @@ -118,16 +118,23 @@ inline ncclDataType_t AsNCCLDataType(runtime::DataType dtype) { } struct CCLThreadLocalContext { - DiscoWorker* worker; + DiscoWorker* worker = nullptr; int device_id; deviceStream_t default_stream = nullptr; - ncclComm_t comm; + ncclComm_t comm = nullptr; + + ~CCLThreadLocalContext() { Clear(); } void Clear() { - NCCL_CALL(ncclCommDestroy(comm)); - if (default_stream != nullptr) { + if (comm) { + NCCL_CALL(ncclCommDestroy(comm)); + comm = nullptr; + } + if (default_stream) { StreamDestroy(default_stream); + default_stream = nullptr; } + worker = nullptr; } deviceStream_t GetDefaultStream() {