[Runtime] CUDA IPC Memory support and custom allreduce kernels
This PR introduces CUDA IPC memory support in the TVM runtime. IPC memory allows multiple distributed workers to access each other's GPU memory directly, which is helpful for implementing customized communication primitives across distributed workers. In this PR, we bring the customized all-reduce implementation from TensorRT-LLM into 3rdparty. This all-reduce implementation makes use of CUDA IPC memory. We expose the all-reduce function as a global function under the namespace `tvm::runtime::disco::cuda_ipc`. A unit test for the customized all-reduce kernel over two workers is added.

---

Co-authored-by: Hongyi Jin <[email protected]>
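For illustration, here is a minimal C++ sketch of how a worker could invoke the exposed all-reduce through TVM's global-function registry. The registration string and the argument convention below are assumptions (only the namespace is stated above); the unit test added in this PR is the authoritative usage.

```cpp
#include <tvm/runtime/logging.h>
#include <tvm/runtime/ndarray.h>
#include <tvm/runtime/registry.h>

// Runs on each distributed worker over its local shard.
void AllReduceSketch(tvm::runtime::NDArray local, tvm::runtime::NDArray out) {
  // Registration name is an assumption based on the namespace mentioned above.
  const tvm::runtime::PackedFunc* f =
      tvm::runtime::Registry::Get("runtime.disco.cuda_ipc.custom_allreduce");
  ICHECK(f != nullptr) << "Custom all-reduce is not registered in this build";
  // The (input, output) argument convention is an assumption; check the PR sources.
  (*f)(local, out);
}
```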
1 parent 48cedc7 · commit 1a37248
Showing 18 changed files with 1,366 additions and 134 deletions.
@@ -0,0 +1,48 @@
/*
 * Copyright (c) 2022-2024, NVIDIA CORPORATION. All rights reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

#include <cuda_fp16.h>
#include <cuda_runtime.h>   // cudaStream_t
#include <dlpack/dlpack.h>  // DLDataType
#include <stdint.h>

namespace tensorrt_llm {

constexpr size_t WARP_SIZE = 32;
constexpr size_t MAX_ALL_REDUCE_BLOCKS = 24;
constexpr size_t MAX_RANKS_PER_NODE = 8;
constexpr size_t DEFAULT_BLOCK_SIZE = 1024;

enum class AllReduceStrategyType : int8_t {
  ONESHOT = 1,
  TWOSHOT = 2,
};

struct AllReduceParams {
  size_t elts_total;
  size_t elts_per_rank;
  size_t elts_per_block;
  size_t rank_offset;
  size_t ranks_per_node, rank, local_rank;
  uint32_t barrier_flag;
  uint32_t* peer_barrier_ptrs_in[MAX_RANKS_PER_NODE];
  uint32_t* peer_barrier_ptrs_out[MAX_RANKS_PER_NODE];
  void* peer_comm_buffer_ptrs[MAX_RANKS_PER_NODE];
  void* local_output_buffer_ptr;
};

void customAllReduce(AllReduceParams& params, void* data, size_t elts, DLDataType dataType,
                     AllReduceStrategyType strat, cudaStream_t stream);

}  // namespace tensorrt_llm
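To make the partitioning fields concrete: in the TWOSHOT strategy each rank reduces only its own slice of the buffer and then gathers the rest, whereas ONESHOT reduces all peers' data locally. Below is a hedged sketch of how the fields might be filled for a single-node TWOSHOT launch; the divisibility assumptions and the hypothetical helper name are illustrative only, and the authoritative setup lives in the accompanying all-reduce sources of this PR.

```cpp
// Hypothetical helper: fill the partitioning fields for a single-node setup.
// Assumes `total` divides evenly across ranks and blocks; the peer pointer
// arrays and the output pointer would be filled from CUDA IPC memory objects.
tensorrt_llm::AllReduceParams MakeParams(size_t total, size_t ranks, size_t rank) {
  tensorrt_llm::AllReduceParams p{};
  p.elts_total = total;
  p.ranks_per_node = ranks;
  p.rank = p.local_rank = rank;            // single node: global rank == local rank
  p.elts_per_rank = total / ranks;         // TWOSHOT: the slice each rank reduces
  p.rank_offset = rank * p.elts_per_rank;  // start of this rank's slice
  p.elts_per_block =                       // per-CTA share of the slice
      p.elts_per_rank / tensorrt_llm::MAX_ALL_REDUCE_BLOCKS;
  p.barrier_flag = 1;  // bumped on every launch so rounds are distinguishable
  return p;
}
```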
@@ -0,0 +1,102 @@
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */

#ifndef TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_
#define TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_

#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/memory/memory_manager.h>
#include <tvm/runtime/object.h>

#include <vector>

namespace tvm {
namespace runtime {
namespace cuda_ipc {

/*!
 * \brief The CUDA IPC (interprocess communication) memory object,
 * which internally contains data pointers to CUDA IPC memory.
 * It is useful for an efficient all-reduce implementation.
 * \note Right now the class members are closely tied to the customized
 * all-reduce kernel. They may also be extended for other uses in
 * the future.
 */
class CUDAIPCMemoryObj : public Object {
 public:
  /*! \brief The number of GPU workers. */
  int num_workers;
  /*! \brief The worker id corresponding to this IPC memory object. */
  int worker_id;
  /*!
   * \brief The data pointers of all all-reduce inputs.
   * It has "num_workers" pointers. The i-th pointer is the data pointer on worker i.
   * If "i != worker_id", the pointer is an IPC data pointer.
   * Otherwise, the pointer is a local CUDA data pointer.
   */
  std::vector<void*> remote_data;

  // We introduce the barrier helper data below per CUDAIPCMemory object
  // so that they can be used by custom collective operations and allow
  // fine-grained synchronization on each buffer. These barriers have
  // low overhead, and can potentially enable concurrent execution of
  // kernels in the future.
  /*!
   * \brief The pointers to input barrier signals of all workers for all-reduce.
   * It has "num_workers" pointers, and the pointer arrangement is the same as "remote_data".
   */
  std::vector<void*> barrier_in;
  /*!
   * \brief The pointers to output barrier signals of all workers for all-reduce.
   * It has "num_workers" pointers, and the pointer arrangement is the same as "remote_data".
   */
  std::vector<void*> barrier_out;
  /*! \brief The integer buffer flag for all-reduce. */
  int barrier_flag;

  static constexpr const char* _type_key = "tvm.runtime.disco.cuda_ipc_memory";
  static constexpr const bool _type_has_method_sequal_reduce = false;
  static constexpr const bool _type_has_method_shash_reduce = false;
  TVM_DECLARE_BASE_OBJECT_INFO(CUDAIPCMemoryObj, Object);
};

/*!
 * \brief Managed reference to CUDAIPCMemoryObj.
 * \sa CUDAIPCMemoryObj
 */
class CUDAIPCMemory : public ObjectRef {
 public:
  /*! \brief Get the global singleton CUDAIPCMemory allocator. */
  TVM_DLL static memory::Allocator* GlobalAllocator();
  /*!
   * \brief Given a local CUDA data pointer, return the CUDAIPCMemory object of the pointer.
   * \note The pointer's CUDAIPCMemory is expected to have been allocated
   * through the global function "cuda_ipc.alloc_storage". Otherwise this
   * function will raise an exception.
   */
  TVM_DLL static CUDAIPCMemory GetIPCMemoryFromDevicePtr(void* ptr);

  TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(CUDAIPCMemory, ObjectRef, CUDAIPCMemoryObj);
};

}  // namespace cuda_ipc
}  // namespace runtime
}  // namespace tvm

#endif  // TVM_RUNTIME_DISCO_CUDA_IPC_MEMORY_H_
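A minimal usage sketch of the two classes above, as they might be exercised inside a disco worker. The include path and the exact `Allocator::Alloc` signature are assumptions taken from TVM's memory manager header; the intended allocation path is the global function "cuda_ipc.alloc_storage" mentioned in the docs.

```cpp
#include <tvm/runtime/memory/memory_manager.h>

#include "cuda_ipc_memory.h"  // the header above; the actual path may differ

using namespace tvm::runtime;

void IPCMemorySketch() {
  // Process-wide CUDA IPC allocator introduced by this PR.
  memory::Allocator* alloc = cuda_ipc::CUDAIPCMemory::GlobalAllocator();
  // Allocate 1024 fp16 elements; Alloc signature assumed from
  // tvm/runtime/memory/memory_manager.h.
  Device dev{kDLCUDA, 0};
  DLDataType f16{kDLFloat, 16, 1};
  memory::Buffer buf = alloc->Alloc(dev, /*nbytes=*/1024 * 2, /*alignment=*/256, f16);
  // Map the local device pointer back to its CUDAIPCMemory record, which
  // carries the peer data pointers and barrier signals for all workers.
  cuda_ipc::CUDAIPCMemory ipc =
      cuda_ipc::CUDAIPCMemory::GetIPCMemoryFromDevicePtr(buf.data);
  void* peer0 = ipc->remote_data[0];  // worker 0's buffer (IPC-mapped if remote)
  (void)peer0;
}
```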