From 44ed280f3cd3cf9fd490e0b7fd81c53987c36e7f Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Tue, 13 Aug 2024 11:15:06 +0000
Subject: [PATCH 1/5] Use a single workspace for flashinfer

---
 python/sglang/global_config.py                |  2 +-
 .../srt/model_executor/cuda_graph_runner.py   | 10 ++++----
 .../sglang/srt/model_executor/model_runner.py | 24 +++++++------------
 python/sglang/srt/server.py                   |  2 +-
 4 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py
index b02ce9f81ea..9b0f0afd4e1 100644
--- a/python/sglang/global_config.py
+++ b/python/sglang/global_config.py
@@ -27,7 +27,7 @@ def __init__(self):
         # Runtime constants: others
         self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = 192 * 1024 * 1024
+        self.flashinfer_workspace_size = 256 * 1024 * 1024
 
         # Output tokenization configs
         self.skip_special_tokens_in_output = True
diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index ed26322c346..e636a6442e5 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -120,12 +120,12 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile):
         )
         if model_runner.sliding_window_size is None:
             self.flashinfer_workspace_buffer = (
-                self.model_runner.flashinfer_workspace_buffers[0]
+                self.model_runner.flashinfer_workspace_buffer
             )
         else:
-            self.flashinfer_workspace_buffers = [
-                self.model_runner.flashinfer_workspace_buffers[0],
-                self.model_runner.flashinfer_workspace_buffers[2],
+            self.flashinfer_workspace_buffer = [
+                self.model_runner.flashinfer_workspace_buffer
+                self.model_runner.flashinfer_workspace_buffer
             ]
             self.flashinfer_kv_indptr = [
                 self.flashinfer_kv_indptr,
@@ -200,7 +200,7 @@ def capture_one_batch_size(self, bs, forward):
             for i in range(2):
                 flashinfer_decode_wrapper.append(
                     BatchDecodeWithPagedKVCacheWrapper(
-                        self.flashinfer_workspace_buffers[i],
+                        self.flashinfer_workspace_buffer[i],
                         "NHD",
                         use_cuda_graph=True,
                         use_tensor_cores=use_tensor_cores,
diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 0a748342370..0a0a2a8d421 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -318,28 +318,22 @@ def init_flashinfer(self):
             use_tensor_cores = False
 
         if self.sliding_window_size is None:
-            self.flashinfer_workspace_buffers = torch.empty(
-                2,
-                global_config.flashinfer_workspace_size,
-                dtype=torch.uint8,
-                device="cuda",
+            self.flashinfer_workspace_buffer = torch.empty(
+                global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda"
             )
-            self.flashinfer_prefill_wrapper_ragged = (
-                BatchPrefillWithRaggedKVCacheWrapper(
-                    self.flashinfer_workspace_buffers[0], "NHD"
-                )
+            self.flashinfer_prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
+                self.flashinfer_workspace_buffer, "NHD"
             )
             self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper(
-                self.flashinfer_workspace_buffers[1], "NHD"
+                self.flashinfer_workspace_buffer, "NHD"
             )
             self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper(
-                self.flashinfer_workspace_buffers[0],
+                self.flashinfer_workspace_buffer,
                 "NHD",
                 use_tensor_cores=use_tensor_cores,
             )
         else:
-            self.flashinfer_workspace_buffers = torch.empty(
-                4,
+            self.flashinfer_workspace_buffer = torch.empty(
                 global_config.flashinfer_workspace_size,
                 dtype=torch.uint8,
                 device="cuda",
@@ -350,17 +344,17 @@ def init_flashinfer(self):
             for i in range(2):
                 self.flashinfer_prefill_wrapper_ragged.append(
                     BatchPrefillWithRaggedKVCacheWrapper(
-                        self.flashinfer_workspace_buffers[2 * i + 0], "NHD"
+                        self.flashinfer_workspace_buffer, "NHD"
                     )
                 )
                 self.flashinfer_prefill_wrapper_paged.append(
                     BatchPrefillWithPagedKVCacheWrapper(
-                        self.flashinfer_workspace_buffers[2 * i + 1], "NHD"
+                        self.flashinfer_workspace_buffer, "NHD"
                     )
                 )
                 self.flashinfer_decode_wrapper.append(
                     BatchDecodeWithPagedKVCacheWrapper(
-                        self.flashinfer_workspace_buffers[2 * i + 0],
+                        self.flashinfer_workspace_buffer,
                         "NHD",
                         use_tensor_cores=use_tensor_cores,
                     )
diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py
index 973f9c8e120..ae886796c59 100644
--- a/python/sglang/srt/server.py
+++ b/python/sglang/srt/server.py
@@ -381,7 +381,7 @@ def _set_envs_and_config(server_args: ServerArgs):
     if not server_args.disable_flashinfer:
         assert_pkg_version(
             "flashinfer",
-            "0.1.4",
+            "0.1.5",
             "Please uninstall the old version and "
             "reinstall the latest version by following the instructions "
             "at https://docs.flashinfer.ai/installation.html.",

From d49cfaac1fe65b655f616e113355f63c4b7b5445 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Wed, 14 Aug 2024 18:06:06 -0700
Subject: [PATCH 2/5] Increase flashinfer workspace size to 384 MB

---
 python/sglang/global_config.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py
index 9b0f0afd4e1..d5f16e2ae54 100644
--- a/python/sglang/global_config.py
+++ b/python/sglang/global_config.py
@@ -27,7 +27,7 @@ def __init__(self):
         # Runtime constants: others
         self.num_continue_decode_steps = 10
         self.retract_decode_steps = 20
-        self.flashinfer_workspace_size = 256 * 1024 * 1024
+        self.flashinfer_workspace_size = 384 * 1024 * 1024
 
         # Output tokenization configs
         self.skip_special_tokens_in_output = True

From 055971173b0ad573a1d49e0da59ed57b4330f5ec Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Wed, 14 Aug 2024 18:08:15 -0700
Subject: [PATCH 3/5] Fix missing commas in flashinfer workspace buffer list

---
 python/sglang/srt/model_executor/cuda_graph_runner.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index e636a6442e5..7699b073e2e 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -124,8 +124,8 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile):
             )
         else:
             self.flashinfer_workspace_buffer = [
-                self.model_runner.flashinfer_workspace_buffer
-                self.model_runner.flashinfer_workspace_buffer
+                self.model_runner.flashinfer_workspace_buffer,
+                self.model_runner.flashinfer_workspace_buffer,
             ]
             self.flashinfer_kv_indptr = [
                 self.flashinfer_kv_indptr,

From 2609e0169b84d4d3a17e111265f726facf8557d0 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Wed, 14 Aug 2024 18:10:21 -0700
Subject: [PATCH 4/5] fix lint

---
 python/sglang/srt/model_executor/model_runner.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py
index 0a0a2a8d421..6826bf1a4e9 100644
--- a/python/sglang/srt/model_executor/model_runner.py
+++ b/python/sglang/srt/model_executor/model_runner.py
@@ -319,10 +319,14 @@ def init_flashinfer(self):
 
         if self.sliding_window_size is None:
             self.flashinfer_workspace_buffer = torch.empty(
-                global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda"
+                global_config.flashinfer_workspace_size,
+                dtype=torch.uint8,
+                device="cuda",
             )
-            self.flashinfer_prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper(
-                self.flashinfer_workspace_buffer, "NHD"
+            self.flashinfer_prefill_wrapper_ragged = (
+                BatchPrefillWithRaggedKVCacheWrapper(
+                    self.flashinfer_workspace_buffer, "NHD"
+                )
             )
             self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper(
                 self.flashinfer_workspace_buffer, "NHD"

From 62fb110f3db2e90365ed31dd9a5805e6ef86b5a0 Mon Sep 17 00:00:00 2001
From: Lianmin Zheng <lianminzheng@gmail.com>
Date: Wed, 14 Aug 2024 18:42:45 -0700
Subject: [PATCH 5/5] Share one workspace buffer in CUDA graph runner; add gsm8k stop string

---
 benchmark/gsm8k/bench_sglang.py                       |  2 +-
 python/sglang/srt/model_executor/cuda_graph_runner.py | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py
index 652086f913b..d9d4b0ab20f 100644
--- a/benchmark/gsm8k/bench_sglang.py
+++ b/benchmark/gsm8k/bench_sglang.py
@@ -64,7 +64,7 @@ def main(args):
     @sgl.function
     def few_shot_gsm8k(s, question):
         s += few_shot_examples + question
-        s += sgl.gen("answer", max_tokens=512, stop="Question")
+        s += sgl.gen("answer", max_tokens=512, stop=["Question", "Assistant:"])
 
     #####################################
     ########## SGL Program End ##########
diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
index 7699b073e2e..3d4e5d4c6a9 100644
--- a/python/sglang/srt/model_executor/cuda_graph_runner.py
+++ b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -123,10 +123,10 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile):
                 self.model_runner.flashinfer_workspace_buffer
             )
         else:
-            self.flashinfer_workspace_buffer = [
-                self.model_runner.flashinfer_workspace_buffer,
-                self.model_runner.flashinfer_workspace_buffer,
-            ]
+            self.flashinfer_workspace_buffer = (
+                self.model_runner.flashinfer_workspace_buffer
+            )
+
             self.flashinfer_kv_indptr = [
                 self.flashinfer_kv_indptr,
                 self.flashinfer_kv_indptr.clone(),
@@ -200,7 +200,7 @@ def capture_one_batch_size(self, bs, forward):
             for i in range(2):
                 flashinfer_decode_wrapper.append(
                     BatchDecodeWithPagedKVCacheWrapper(
-                        self.flashinfer_workspace_buffer[i],
+                        self.flashinfer_workspace_buffer,
                         "NHD",
                         use_cuda_graph=True,
                         use_tensor_cores=use_tensor_cores,