From 07fa60bd79acb881af7fa45417ec3ced10414f5e Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:19:48 +0800
Subject: [PATCH 1/7] optimize moe_align_block_size

Signed-off-by: Jinzhen Lin
---
 csrc/moe/moe_align_sum_kernels.cu | 88 ++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 36 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 24341d63fb1f8..ef69d21db1aa4 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
   }
 }  // namespace

-template <typename scalar_t>
+template <typename scalar_t, typename token_cnts_t>
 __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                             int32_t* sorted_token_ids,
                                             int32_t* expert_ids,
@@ -32,12 +32,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
   const size_t start_idx = threadIdx.x * tokens_per_thread;

   extern __shared__ int32_t shared_mem[];
-
-  int32_t* tokens_cnts =
-      shared_mem;  // 2d tensor with shape (blockDim.x + 1, num_experts)
-  int32_t* cumsum =
-      shared_mem +
-      (blockDim.x + 1) * num_experts;  // 1d tensor with shape (num_experts + 1)
+  int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
+  token_cnts_t* tokens_cnts = (token_cnts_t *) (shared_mem + blockDim.x + 1);

   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -74,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
               block_size) *
           block_size;
     }
-    *total_tokens_post_pad = cumsum[num_experts];
+    *total_tokens_post_pad = (int32_t) cumsum[num_experts];
   }

   __syncthreads();
@@ -224,26 +220,41 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  // If we have very large number of experts, we can no longer use shared
-  // memory.
-  // TODO(simon): the right solution should be calculating the exact right
-  // amount of shared memory and use that. The num_experts >= 256 is just a
-  // temporary solution to unblock Deepseek V3.
-  if (num_experts >= 256) {
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+
+  const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
+  const int32_t shared_mem_i32 =
+      ((num_thread + 1) * num_experts + (num_experts + 1)) *
+      sizeof(int32_t);
+  const int32_t shared_mem_i16 =
+      ((num_thread + 1) * num_experts) * sizeof(uint16_t) + (num_experts + 1) *
+      sizeof(int32_t);
+
+  bool use_global_memory = false, use_i16 = false;
+  if (shared_mem_i16 > device_max_shared_mem) {
+    use_global_memory = true;
+  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids::numel() <= 65535) {
+    // when nelements of topk_ids is smaller than 65535 (max value of uint16),
+    // element value of token_cnts would also smaller than 65535,
+    // so we can use uint16 as dtype of token_cnts
+    use_i16 = true;
+  }
+
+  if (use_global_memory) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t mem_tokens_cnts =
-              ((num_experts + 1) * num_experts) * sizeof(int32_t);
-          const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t);
-          // allocate global memory
-          int32_t* tokens_cnts;
-          int32_t* cumsum;
-          cudaMalloc(&tokens_cnts, mem_tokens_cnts);
-          cudaMalloc(&cumsum, mem_cumsum);
+          auto options_int =
+              torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+          torch::Tensor token_cnts_buffer =
+              torch::empty({(num_experts + 1) * num_experts}, options_int);
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);

           auto kernel =
               vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
@@ -252,25 +263,30 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), tokens_cnts, cumsum);
-          cudaFree(tokens_cnts);
-          cudaFree(cumsum);
+              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>());
         });
-  } else {
+  } else if (use_i16) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-          // tensors
-          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t shared_mem =
-              ((num_thread + 1) * num_experts + (num_experts + 1)) *
-              sizeof(int32_t);
-
           // set dynamic shared mem
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
+          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
               (void*)kernel, shared_mem_i16));
           kernel<<<1, num_thread, shared_mem_i16, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel());
+        });
+  } else {
+    VLLM_DISPATCH_INTEGRAL_TYPES(
+        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-              (void*)kernel, shared_mem));
-          kernel<<<1, num_thread, shared_mem, stream>>>(
+              (void*)kernel, shared_mem_i32));
+          kernel<<<1, num_thread, shared_mem_i32, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
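Reviewer note, not part of the patches: PATCH 1 replaces the hard num_experts >= 256 cutoff with an explicit size check. It computes the dynamic shared memory needed for the tokens_cnts table ((num_thread + 1) x num_experts counters) plus the cumsum array (num_experts + 1 entries), compares it against the device's opt-in shared memory limit, and then picks one of three kernel variants: int32 counters in shared memory, uint16 counters in shared memory, or buffers in global memory. The standalone sketch below restates that selection logic; choose_path and MoePath are illustrative names, not symbols from the patch.

    #include <cstddef>
    #include <cstdint>

    enum class MoePath { SharedI32, SharedI16, GlobalMem };

    // Mirror of the selection logic added by PATCH 1 (names are illustrative).
    MoePath choose_path(int64_t num_experts, int64_t topk_ids_numel,
                        int64_t device_max_shared_mem, int64_t num_thread) {
      // tokens_cnts: (num_thread + 1) x num_experts counters; cumsum: num_experts + 1 entries.
      const size_t shared_mem_i32 =
          ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
      const size_t shared_mem_i16 =
          (num_thread + 1) * num_experts * sizeof(uint16_t) +
          (num_experts + 1) * sizeof(int32_t);

      if (shared_mem_i16 > (size_t)device_max_shared_mem) {
        // Even the smallest layout does not fit: fall back to global-memory buffers.
        return MoePath::GlobalMem;
      }
      if (shared_mem_i32 > (size_t)device_max_shared_mem && topk_ids_numel <= 65535) {
        // Counts are bounded by topk_ids_numel, so uint16_t counters are safe.
        return MoePath::SharedI16;
      }
      return MoePath::SharedI32;
    }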
From 3201616f5449b47c0b40cd19335eb0f97183c245 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:35:22 +0800
Subject: [PATCH 2/7] update config

Signed-off-by: Jinzhen Lin
---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 4698a05020332..b0a92b2e21343 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,7 +607,7 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)

-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
                 and not self.enforce_eager):
             logger.warning(

From 57abfd448589ed6eeb4e289feed76d7e22a3e9f1 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:54:52 +0800
Subject: [PATCH 3/7] fix error

Signed-off-by: Jinzhen Lin
---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index ef69d21db1aa4..d47ec22230237 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -235,7 +235,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   bool use_global_memory = false, use_i16 = false;
   if (shared_mem_i16 > device_max_shared_mem) {
     use_global_memory = true;
-  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids::numel() <= 65535) {
+  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids.numel() <= 65535) {
    // when nelements of topk_ids is smaller than 65535 (max value of uint16),
    // element value of token_cnts would also smaller than 65535,
    // so we can use uint16 as dtype of token_cnts
    use_i16 = true;
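Reviewer note, not part of the patches: PATCH 3 only repairs a typo from PATCH 1 (topk_ids::numel() is scope resolution and does not compile; topk_ids.numel() is the intended member call). The surrounding condition is what makes the uint16 path safe: every tokens_cnts entry counts how many topk_ids elements map to one expert, so no counter can exceed topk_ids.numel(), and 65535 is the largest value a uint16_t can hold. A minimal restatement with a hypothetical helper name:

    #include <cstdint>
    #include <limits>

    // Hypothetical helper, not in the patch: uint16_t counters cannot overflow
    // as long as the total number of top-k ids stays within the uint16_t range.
    inline bool u16_counters_are_safe(int64_t topk_ids_numel) {
      return topk_ids_numel <= std::numeric_limits<uint16_t>::max();  // 65535
    }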
From 4d263c087fde7055284d60a8d6a710e19f8a9e22 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:55:41 +0800
Subject: [PATCH 4/7] fix format error

Signed-off-by: Jinzhen Lin
---
 csrc/moe/moe_align_sum_kernels.cu | 32 +++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index d47ec22230237..84a8fa2e54641 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -8,7 +8,7 @@
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"

-#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
+#define CEILDIV(x, y) (((x) + (y)-1) / (y))

 namespace vllm {
 namespace moe {
@@ -33,7 +33,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,

   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
-  token_cnts_t* tokens_cnts = (token_cnts_t *) (shared_mem + blockDim.x + 1);
+  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1);

   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -70,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
               block_size) *
           block_size;
     }
-    *total_tokens_post_pad = (int32_t) cumsum[num_experts];
+    *total_tokens_post_pad = (int32_t)cumsum[num_experts];
   }

   __syncthreads();
@@ -222,20 +222,21 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,

   int device_max_shared_mem;
   auto dev = topk_ids.get_device();
-  cudaDeviceGetAttribute(&device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

   const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
   const int32_t shared_mem_i32 =
-      ((num_thread + 1) * num_experts + (num_experts + 1)) *
-      sizeof(int32_t);
+      ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
   const int32_t shared_mem_i16 =
-      ((num_thread + 1) * num_experts) * sizeof(uint16_t) + (num_experts + 1) *
-      sizeof(int32_t);
+      ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
+      (num_experts + 1) * sizeof(int32_t);

   bool use_global_memory = false, use_i16 = false;
   if (shared_mem_i16 > device_max_shared_mem) {
     use_global_memory = true;
-  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids.numel() <= 65535) {
+  } else if (shared_mem_i32 > device_max_shared_mem &&
+             topk_ids.numel() <= 65535) {
     // when nelements of topk_ids is smaller than 65535 (max value of uint16),
     // element value of token_cnts would also smaller than 65535,
     // so we can use uint16 as dtype of token_cnts
@@ -249,8 +250,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);

-          auto options_int =
-              torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+          auto options_int = torch::TensorOptions()
+                                 .dtype(torch::kInt)
+                                 .device(topk_ids.device());
           torch::Tensor token_cnts_buffer =
               torch::empty({(num_experts + 1) * num_experts}, options_int);
           torch::Tensor cumsum_buffer =
               torch::empty({num_experts + 1}, options_int);
@@ -270,7 +272,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           // set dynamic shared mem
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
               (void*)kernel, shared_mem_i16));
           kernel<<<1, num_thread, shared_mem_i16, stream>>>(
@@ -282,8 +285,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
         });
   } else {
     VLLM_DISPATCH_INTEGRAL_TYPES(
-        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
+        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
               (void*)kernel, shared_mem_i32));
           kernel<<<1, num_thread, shared_mem_i32, stream>>>(

From 0ea4debbc10028f1241236835dc69725fcb3dc64 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 20 Jan 2025 11:07:21 -0500
Subject: [PATCH 5/7] Format

---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 84a8fa2e54641..c13eabf260b1e 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -8,7 +8,7 @@
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"

-#define CEILDIV(x, y) (((x) + (y)-1) / (y))
+#define CEILDIV(x, y) (((x) + (y) - 1) / (y))

 namespace vllm {
 namespace moe {
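Reviewer note, not part of the patches: PATCH 4 and PATCH 5 are formatting churn around the CEILDIV macro (clang-format first glues (y)-1 together, then the spacing is restored). The macro itself is ordinary ceiling division, which the kernel relies on when splitting the topk_ids elements across threads (the tokens_per_thread value seen in PATCH 1). A small self-contained check with illustrative numbers:

    #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

    // e.g. 7 top-k ids over 4 threads -> each thread scans CEILDIV(7, 4) = 2 ids.
    static_assert(CEILDIV(7, 4) == 2, "rounds up when there is a remainder");
    static_assert(CEILDIV(8, 4) == 2, "exact division is unchanged");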
From e72c81f1486508f8ad16459abe6657c405efa737 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 20 Jan 2025 12:11:50 -0500
Subject: [PATCH 6/7] Update csrc/moe/moe_align_sum_kernels.cu

Co-authored-by: Tyler Michael Smith
---
 csrc/moe/moe_align_sum_kernels.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index c13eabf260b1e..4276e1fdefbe8 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -232,7 +232,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
       ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
       (num_experts + 1) * sizeof(int32_t);

-  bool use_global_memory = false, use_i16 = false;
+  bool use_global_memory = false;
+  bool use_i16 = false;  // Use uint16_t for shared memory token counts
   if (shared_mem_i16 > device_max_shared_mem) {
     use_global_memory = true;
   } else if (shared_mem_i32 > device_max_shared_mem &&

From 49fb02355433e2167f1ab88ec27b84f8bf287204 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 20 Jan 2025 12:11:59 -0500
Subject: [PATCH 7/7] Update csrc/moe/moe_align_sum_kernels.cu

Co-authored-by: Tyler Michael Smith
---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 4276e1fdefbe8..715a1b42841f2 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -70,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
               block_size) *
           block_size;
     }
-    *total_tokens_post_pad = (int32_t)cumsum[num_experts];
+    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
   }

   __syncthreads();
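Reviewer note, not part of the patches: the series leans on the standard CUDA opt-in shared-memory mechanism that VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize wraps: query cudaDevAttrMaxSharedMemoryPerBlockOptin, raise the kernel's cudaFuncAttributeMaxDynamicSharedMemorySize, then launch with that many dynamic shared bytes. A minimal standalone sketch under those assumptions; dummy_kernel and the 96 KiB figure are illustrative, not code from the series.

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void dummy_kernel(int* out) {
      extern __shared__ int smem[];   // dynamic shared memory sized at launch
      smem[threadIdx.x] = threadIdx.x;
      __syncthreads();
      if (threadIdx.x == 0) *out = smem[0];
    }

    int main() {
      int dev = 0, max_optin = 0;
      cudaDeviceGetAttribute(&max_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

      int shared_bytes = 96 * 1024;   // illustrative request, clamped to the device limit
      if (shared_bytes > max_optin) shared_bytes = max_optin;

      // Without this attribute, launches above the default (typically 48 KiB) fail.
      cudaFuncSetAttribute(dummy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                           shared_bytes);

      int* out;
      cudaMalloc(reinterpret_cast<void**>(&out), sizeof(int));
      dummy_kernel<<<1, 32, shared_bytes>>>(out);
      cudaDeviceSynchronize();
      cudaFree(out);

      printf("opt-in shared memory limit: %d bytes\n", max_optin);
      return 0;
    }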