From 07fa60bd79acb881af7fa45417ec3ced10414f5e Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:19:48 +0800
Subject: [PATCH 1/7] optimize moe_align_block_size

Signed-off-by: Jinzhen Lin
---
 csrc/moe/moe_align_sum_kernels.cu | 88 ++++++++++++++++++-------------
 1 file changed, 52 insertions(+), 36 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 24341d63fb1f8..ef69d21db1aa4 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -21,7 +21,7 @@ __device__ __forceinline__ int32_t index(int32_t total_col, int32_t row,
   }
 }  // namespace

-template <typename scalar_t>
+template <typename scalar_t, typename token_cnts_t>
 __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
                                             int32_t* sorted_token_ids,
                                             int32_t* expert_ids,
@@ -32,12 +32,8 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
   const size_t start_idx = threadIdx.x * tokens_per_thread;

   extern __shared__ int32_t shared_mem[];
-
-  int32_t* tokens_cnts =
-      shared_mem;  // 2d tensor with shape (blockDim.x + 1, num_experts)
-  int32_t* cumsum =
-      shared_mem +
-      (blockDim.x + 1) * num_experts;  // 1d tensor with shape (num_experts + 1)
+  int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
+  token_cnts_t* tokens_cnts = (token_cnts_t *) (shared_mem + blockDim.x + 1);

   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -74,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
               block_size) *
           block_size;
     }
-    *total_tokens_post_pad = cumsum[num_experts];
+    *total_tokens_post_pad = (int32_t) cumsum[num_experts];
   }

   __syncthreads();
@@ -224,26 +220,41 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
                           torch::Tensor num_tokens_post_pad) {
   const cudaStream_t stream = at::cuda::getCurrentCUDAStream();

-  // If we have very large number of experts, we can no longer use shared
-  // memory.
-  // TODO(simon): the right solution should be calculating the exact right
-  // amount of shared memory and use that. The num_experts >= 256 is just a
-  // temporary solution to unblock Deepseek V3.
-  if (num_experts >= 256) {
+  int device_max_shared_mem;
+  auto dev = topk_ids.get_device();
+  cudaDeviceGetAttribute(&device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+
+  const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
+  const int32_t shared_mem_i32 =
+      ((num_thread + 1) * num_experts + (num_experts + 1)) *
+      sizeof(int32_t);
+  const int32_t shared_mem_i16 =
+      ((num_thread + 1) * num_experts) * sizeof(uint16_t) + (num_experts + 1) *
+      sizeof(int32_t);
+
+  bool use_global_memory = false, use_i16 = false;
+  if (shared_mem_i16 > device_max_shared_mem) {
+    use_global_memory = true;
+  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids::numel() <= 65535) {
+    // when nelements of topk_ids is smaller than 65535 (max value of uint16),
+    // element value of token_cnts would also smaller than 65535,
+    // so we can use uint16 as dtype of token_cnts
+    use_i16 = true;
+  }
+
+  if (use_global_memory) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_global_mem_kernel", [&] {
           // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t mem_tokens_cnts =
-              ((num_experts + 1) * num_experts) * sizeof(int32_t);
-          const int32_t mem_cumsum = (num_experts + 1) * sizeof(int32_t);
-          // allocate global memory
-          int32_t* tokens_cnts;
-          int32_t* cumsum;
-          cudaMalloc(&tokens_cnts, mem_tokens_cnts);
-          cudaMalloc(&cumsum, mem_cumsum);
+          auto options_int =
+              torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+          torch::Tensor token_cnts_buffer =
+              torch::empty({(num_experts + 1) * num_experts}, options_int);
+          torch::Tensor cumsum_buffer =
+              torch::empty({num_experts + 1}, options_int);

           auto kernel =
               vllm::moe::moe_align_block_size_global_mem_kernel<scalar_t>;
@@ -252,25 +263,30 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
               num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
-              topk_ids.numel(), tokens_cnts, cumsum);
-          cudaFree(tokens_cnts);
-          cudaFree(cumsum);
+              topk_ids.numel(), token_cnts_buffer.data_ptr<int32_t>(),
+              cumsum_buffer.data_ptr<int32_t>());
         });
-  } else {
+  } else if (use_i16) {
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          // calc needed amount of shared mem for `tokens_cnts` and `cumsum`
-          // tensors
-          const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
-          const int32_t shared_mem =
-              ((num_thread + 1) * num_experts + (num_experts + 1)) *
-              sizeof(int32_t);
-
           // set dynamic shared mem
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t>;
+          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
               (void*)kernel, shared_mem_i16));
           kernel<<<1, num_thread, shared_mem_i16, stream>>>(
+              topk_ids.data_ptr<scalar_t>(),
+              sorted_token_ids.data_ptr<int32_t>(),
+              experts_ids.data_ptr<int32_t>(),
+              num_tokens_post_pad.data_ptr<int32_t>(), num_experts, block_size,
+              topk_ids.numel());
+        });
+  } else {
+    VLLM_DISPATCH_INTEGRAL_TYPES(
+        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
-              (void*)kernel, shared_mem));
-          kernel<<<1, num_thread, shared_mem, stream>>>(
+              (void*)kernel, shared_mem_i32));
+          kernel<<<1, num_thread, shared_mem_i32, stream>>>(
               topk_ids.data_ptr<scalar_t>(),
               sorted_token_ids.data_ptr<int32_t>(),
               experts_ids.data_ptr<int32_t>(),
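Reviewer note, not part of the patches: PATCH 1 replaces the hard num_experts >= 256 cutoff with an explicit size check. It computes the dynamic shared memory needed for the tokens_cnts table ((num_thread + 1) x num_experts counters) plus the cumsum array (num_experts + 1 entries), compares it against the device's opt-in shared memory limit, and then picks one of three kernel variants: int32 counters in shared memory, uint16 counters in shared memory, or buffers in global memory. The standalone sketch below restates that selection logic; choose_path and MoePath are illustrative names, not symbols from the patch.

    #include <cstddef>
    #include <cstdint>

    enum class MoePath { SharedI32, SharedI16, GlobalMem };

    // Mirror of the selection logic added by PATCH 1 (names are illustrative).
    MoePath choose_path(int64_t num_experts, int64_t topk_ids_numel,
                        int64_t device_max_shared_mem, int64_t num_thread) {
      // tokens_cnts: (num_thread + 1) x num_experts counters; cumsum: num_experts + 1 entries.
      const size_t shared_mem_i32 =
          ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
      const size_t shared_mem_i16 =
          (num_thread + 1) * num_experts * sizeof(uint16_t) +
          (num_experts + 1) * sizeof(int32_t);

      if (shared_mem_i16 > (size_t)device_max_shared_mem) {
        // Even the smallest layout does not fit: fall back to global-memory buffers.
        return MoePath::GlobalMem;
      }
      if (shared_mem_i32 > (size_t)device_max_shared_mem && topk_ids_numel <= 65535) {
        // Counts are bounded by topk_ids_numel, so uint16_t counters are safe.
        return MoePath::SharedI16;
      }
      return MoePath::SharedI32;
    }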
From 3201616f5449b47c0b40cd19335eb0f97183c245 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:35:22 +0800
Subject: [PATCH 2/7] update config

Signed-off-by: Jinzhen Lin
---
 vllm/config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/config.py b/vllm/config.py
index 4698a05020332..b0a92b2e21343 100644
--- a/vllm/config.py
+++ b/vllm/config.py
@@ -607,7 +607,7 @@ def _verify_cuda_graph(self) -> None:
         self.max_seq_len_to_capture = min(self.max_seq_len_to_capture,
                                           self.max_model_len)

-        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['deepseek_v3', 'mllama']
+        MODEL_NOT_SUPPORT_CUDA_GRAPH = ['mllama']
         if (self.hf_config.model_type in MODEL_NOT_SUPPORT_CUDA_GRAPH
                 and not self.enforce_eager):
             logger.warning(

From 57abfd448589ed6eeb4e289feed76d7e22a3e9f1 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:54:52 +0800
Subject: [PATCH 3/7] fix error

Signed-off-by: Jinzhen Lin
---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index ef69d21db1aa4..d47ec22230237 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -235,7 +235,7 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
   bool use_global_memory = false, use_i16 = false;
   if (shared_mem_i16 > device_max_shared_mem) {
     use_global_memory = true;
-  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids::numel() <= 65535) {
+  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids.numel() <= 65535) {
    // when nelements of topk_ids is smaller than 65535 (max value of uint16),
    // element value of token_cnts would also smaller than 65535,
    // so we can use uint16 as dtype of token_cnts
    use_i16 = true;
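Reviewer note, not part of the patches: PATCH 3 only repairs a typo from PATCH 1 (topk_ids::numel() is scope resolution and does not compile; topk_ids.numel() is the intended member call). The surrounding condition is what makes the uint16 path safe: every tokens_cnts entry counts how many topk_ids elements map to one expert, so no counter can exceed topk_ids.numel(), and 65535 is the largest value a uint16_t can hold. A minimal restatement with a hypothetical helper name:

    #include <cstdint>
    #include <limits>

    // Hypothetical helper, not in the patch: uint16_t counters cannot overflow
    // as long as the total number of top-k ids stays within the uint16_t range.
    inline bool u16_counters_are_safe(int64_t topk_ids_numel) {
      return topk_ids_numel <= std::numeric_limits<uint16_t>::max();  // 65535
    }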
From 4d263c087fde7055284d60a8d6a710e19f8a9e22 Mon Sep 17 00:00:00 2001
From: Jinzhen Lin
Date: Mon, 20 Jan 2025 20:55:41 +0800
Subject: [PATCH 4/7] fix format error

Signed-off-by: Jinzhen Lin
---
 csrc/moe/moe_align_sum_kernels.cu | 32 +++++++++++++++++--------------
 1 file changed, 18 insertions(+), 14 deletions(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index d47ec22230237..84a8fa2e54641 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -8,7 +8,7 @@
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"

-#define CEILDIV(x, y) (((x) + (y) - 1) / (y))
+#define CEILDIV(x, y) (((x) + (y)-1) / (y))

 namespace vllm {
 namespace moe {
@@ -33,7 +33,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,

   extern __shared__ int32_t shared_mem[];
   int32_t* cumsum = shared_mem;  // 1d tensor with shape (num_experts + 1)
-  token_cnts_t* tokens_cnts = (token_cnts_t *) (shared_mem + blockDim.x + 1);
+  token_cnts_t* tokens_cnts = (token_cnts_t*)(shared_mem + blockDim.x + 1);

   for (int i = 0; i < num_experts; ++i) {
     tokens_cnts[index(num_experts, threadIdx.x + 1, i)] = 0;
@@ -70,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
               block_size) *
           block_size;
     }
-    *total_tokens_post_pad = (int32_t) cumsum[num_experts];
+    *total_tokens_post_pad = (int32_t)cumsum[num_experts];
   }

   __syncthreads();
@@ -222,20 +222,21 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,

   int device_max_shared_mem;
   auto dev = topk_ids.get_device();
-  cudaDeviceGetAttribute(&device_max_shared_mem, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);
+  cudaDeviceGetAttribute(&device_max_shared_mem,
+                         cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

   const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);
   const int32_t shared_mem_i32 =
-      ((num_thread + 1) * num_experts + (num_experts + 1)) *
-      sizeof(int32_t);
+      ((num_thread + 1) * num_experts + (num_experts + 1)) * sizeof(int32_t);
   const int32_t shared_mem_i16 =
-      ((num_thread + 1) * num_experts) * sizeof(uint16_t) + (num_experts + 1) *
-      sizeof(int32_t);
+      ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
+      (num_experts + 1) * sizeof(int32_t);

   bool use_global_memory = false, use_i16 = false;
   if (shared_mem_i16 > device_max_shared_mem) {
     use_global_memory = true;
-  } else if (shared_mem_i32 > device_max_shared_mem && topk_ids.numel() <= 65535) {
+  } else if (shared_mem_i32 > device_max_shared_mem &&
+             topk_ids.numel() <= 65535) {
     // when nelements of topk_ids is smaller than 65535 (max value of uint16),
     // element value of token_cnts would also smaller than 65535,
     // so we can use uint16 as dtype of token_cnts
@@ -249,8 +250,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
           // tensors
           const int32_t num_thread = max((int32_t)num_experts, WARP_SIZE);

-          auto options_int =
-              torch::TensorOptions().dtype(torch::kInt).device(topk_ids.device());
+          auto options_int = torch::TensorOptions()
+                                 .dtype(torch::kInt)
+                                 .device(topk_ids.device());
           torch::Tensor token_cnts_buffer =
               torch::empty({(num_experts + 1) * num_experts}, options_int);
           torch::Tensor cumsum_buffer =
               torch::empty({num_experts + 1}, options_int);
@@ -270,7 +272,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
     VLLM_DISPATCH_INTEGRAL_TYPES(
         topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
           // set dynamic shared mem
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, uint16_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
               (void*)kernel, shared_mem_i16));
           kernel<<<1, num_thread, shared_mem_i16, stream>>>(
@@ -282,8 +285,9 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
         });
   } else {
     VLLM_DISPATCH_INTEGRAL_TYPES(
-        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
-          auto kernel = vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
+        topk_ids.scalar_type(), "moe_align_block_size_kernel", [&] {
+          auto kernel =
+              vllm::moe::moe_align_block_size_kernel<scalar_t, int32_t>;
           AT_CUDA_CHECK(VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize(
               (void*)kernel, shared_mem_i32));
           kernel<<<1, num_thread, shared_mem_i32, stream>>>(

From 0ea4debbc10028f1241236835dc69725fcb3dc64 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 20 Jan 2025 11:07:21 -0500
Subject: [PATCH 5/7] Format

---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 84a8fa2e54641..c13eabf260b1e 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -8,7 +8,7 @@
 #include "../cuda_compat.h"
 #include "../dispatch_utils.h"

-#define CEILDIV(x, y) (((x) + (y)-1) / (y))
+#define CEILDIV(x, y) (((x) + (y) - 1) / (y))

 namespace vllm {
 namespace moe {
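Reviewer note, not part of the patches: PATCH 4 and PATCH 5 are formatting churn around the CEILDIV macro (clang-format first glues (y)-1 together, then the spacing is restored). The macro itself is ordinary ceiling division, which the kernel relies on when splitting the topk_ids elements across threads (the tokens_per_thread value seen in PATCH 1). A small self-contained check with illustrative numbers:

    #define CEILDIV(x, y) (((x) + (y) - 1) / (y))

    // e.g. 7 top-k ids over 4 threads -> each thread scans CEILDIV(7, 4) = 2 ids.
    static_assert(CEILDIV(7, 4) == 2, "rounds up when there is a remainder");
    static_assert(CEILDIV(8, 4) == 2, "exact division is unchanged");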
From e72c81f1486508f8ad16459abe6657c405efa737 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 20 Jan 2025 12:11:50 -0500
Subject: [PATCH 6/7] Update csrc/moe/moe_align_sum_kernels.cu

Co-authored-by: Tyler Michael Smith
---
 csrc/moe/moe_align_sum_kernels.cu | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index c13eabf260b1e..4276e1fdefbe8 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -232,7 +232,8 @@ void moe_align_block_size(torch::Tensor topk_ids, int64_t num_experts,
       ((num_thread + 1) * num_experts) * sizeof(uint16_t) +
       (num_experts + 1) * sizeof(int32_t);

-  bool use_global_memory = false, use_i16 = false;
+  bool use_global_memory = false;
+  bool use_i16 = false;  // Use uint16_t for shared memory token counts
   if (shared_mem_i16 > device_max_shared_mem) {
     use_global_memory = true;
   } else if (shared_mem_i32 > device_max_shared_mem &&

From 49fb02355433e2167f1ab88ec27b84f8bf287204 Mon Sep 17 00:00:00 2001
From: Michael Goin
Date: Mon, 20 Jan 2025 12:11:59 -0500
Subject: [PATCH 7/7] Update csrc/moe/moe_align_sum_kernels.cu

Co-authored-by: Tyler Michael Smith
---
 csrc/moe/moe_align_sum_kernels.cu | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/csrc/moe/moe_align_sum_kernels.cu b/csrc/moe/moe_align_sum_kernels.cu
index 4276e1fdefbe8..715a1b42841f2 100644
--- a/csrc/moe/moe_align_sum_kernels.cu
+++ b/csrc/moe/moe_align_sum_kernels.cu
@@ -70,7 +70,7 @@ __global__ void moe_align_block_size_kernel(scalar_t* __restrict__ topk_ids,
               block_size) *
           block_size;
     }
-    *total_tokens_post_pad = (int32_t)cumsum[num_experts];
+    *total_tokens_post_pad = static_cast<int32_t>(cumsum[num_experts]);
   }

   __syncthreads();
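Reviewer note, not part of the patches: the series leans on the standard CUDA opt-in shared-memory mechanism that VLLM_DevFuncAttribute_SET_MaxDynamicSharedMemorySize wraps: query cudaDevAttrMaxSharedMemoryPerBlockOptin, raise the kernel's cudaFuncAttributeMaxDynamicSharedMemorySize, then launch with that many dynamic shared bytes. A minimal standalone sketch under those assumptions; dummy_kernel and the 96 KiB figure are illustrative, not code from the series.

    #include <cuda_runtime.h>
    #include <cstdio>

    __global__ void dummy_kernel(int* out) {
      extern __shared__ int smem[];   // dynamic shared memory sized at launch
      smem[threadIdx.x] = threadIdx.x;
      __syncthreads();
      if (threadIdx.x == 0) *out = smem[0];
    }

    int main() {
      int dev = 0, max_optin = 0;
      cudaDeviceGetAttribute(&max_optin, cudaDevAttrMaxSharedMemoryPerBlockOptin, dev);

      int shared_bytes = 96 * 1024;   // illustrative request, clamped to the device limit
      if (shared_bytes > max_optin) shared_bytes = max_optin;

      // Without this attribute, launches above the default (typically 48 KiB) fail.
      cudaFuncSetAttribute(dummy_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize,
                           shared_bytes);

      int* out;
      cudaMalloc(reinterpret_cast<void**>(&out), sizeof(int));
      dummy_kernel<<<1, 32, shared_bytes>>>(out);
      cudaDeviceSynchronize();
      cudaFree(out);

      printf("opt-in shared memory limit: %d bytes\n", max_optin);
      return 0;
    }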