From 44ed280f3cd3cf9fd490e0b7fd81c53987c36e7f Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Tue, 13 Aug 2024 11:15:06 +0000 Subject: [PATCH 1/5] Use a single workspace for flashinfer --- python/sglang/global_config.py | 2 +- .../srt/model_executor/cuda_graph_runner.py | 10 ++++---- .../sglang/srt/model_executor/model_runner.py | 24 +++++++------------ python/sglang/srt/server.py | 2 +- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py index b02ce9f81ea..9b0f0afd4e1 100644 --- a/python/sglang/global_config.py +++ b/python/sglang/global_config.py @@ -27,7 +27,7 @@ def __init__(self): # Runtime constants: others self.num_continue_decode_steps = 10 self.retract_decode_steps = 20 - self.flashinfer_workspace_size = 192 * 1024 * 1024 + self.flashinfer_workspace_size = 256 * 1024 * 1024 # Output tokenization configs self.skip_special_tokens_in_output = True diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index ed26322c346..e636a6442e5 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -120,12 +120,12 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): ) if model_runner.sliding_window_size is None: self.flashinfer_workspace_buffer = ( - self.model_runner.flashinfer_workspace_buffers[0] + self.model_runner.flashinfer_workspace_buffer ) else: - self.flashinfer_workspace_buffers = [ - self.model_runner.flashinfer_workspace_buffers[0], - self.model_runner.flashinfer_workspace_buffers[2], + self.flashinfer_workspace_buffer = [ + self.model_runner.flashinfer_workspace_buffer + self.model_runner.flashinfer_workspace_buffer ] self.flashinfer_kv_indptr = [ self.flashinfer_kv_indptr, @@ -200,7 +200,7 @@ def capture_one_batch_size(self, bs, forward): for i in range(2): flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[i], + self.flashinfer_workspace_buffer[i], "NHD", use_cuda_graph=True, use_tensor_cores=use_tensor_cores, diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0a748342370..0a0a2a8d421 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -318,28 +318,22 @@ def init_flashinfer(self): use_tensor_cores = False if self.sliding_window_size is None: - self.flashinfer_workspace_buffers = torch.empty( - 2, - global_config.flashinfer_workspace_size, - dtype=torch.uint8, - device="cuda", + self.flashinfer_workspace_buffer = torch.empty( + global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda" ) - self.flashinfer_prefill_wrapper_ragged = ( - BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], "NHD" - ) + self.flashinfer_prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper( + self.flashinfer_workspace_buffer, "NHD" ) self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[1], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) self.flashinfer_decode_wrapper = BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[0], + self.flashinfer_workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores, ) else: self.flashinfer_workspace_buffers = torch.empty( - 4, global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda", @@ -350,17 +344,17 @@ def init_flashinfer(self): for i in range(2): self.flashinfer_prefill_wrapper_ragged.append( BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 0], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_prefill_wrapper_paged.append( BatchPrefillWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 1], "NHD" + self.flashinfer_workspace_buffer, "NHD" ) ) self.flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffers[2 * i + 0], + self.flashinfer_workspace_buffer, "NHD", use_tensor_cores=use_tensor_cores, ) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index 973f9c8e120..ae886796c59 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -381,7 +381,7 @@ def _set_envs_and_config(server_args: ServerArgs): if not server_args.disable_flashinfer: assert_pkg_version( "flashinfer", - "0.1.4", + "0.1.5", "Please uninstall the old version and " "reinstall the latest version by following the instructions " "at https://docs.flashinfer.ai/installation.html.", From d49cfaac1fe65b655f616e113355f63c4b7b5445 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 18:06:06 -0700 Subject: [PATCH 2/5] update --- python/sglang/global_config.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/global_config.py b/python/sglang/global_config.py index 9b0f0afd4e1..d5f16e2ae54 100644 --- a/python/sglang/global_config.py +++ b/python/sglang/global_config.py @@ -27,7 +27,7 @@ def __init__(self): # Runtime constants: others self.num_continue_decode_steps = 10 self.retract_decode_steps = 20 - self.flashinfer_workspace_size = 256 * 1024 * 1024 + self.flashinfer_workspace_size = 384 * 1024 * 1024 # Output tokenization configs self.skip_special_tokens_in_output = True From 055971173b0ad573a1d49e0da59ed57b4330f5ec Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 18:08:15 -0700 Subject: [PATCH 3/5] fix --- python/sglang/srt/model_executor/cuda_graph_runner.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index e636a6442e5..7699b073e2e 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -124,8 +124,8 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): ) else: self.flashinfer_workspace_buffer = [ - self.model_runner.flashinfer_workspace_buffer - self.model_runner.flashinfer_workspace_buffer + self.model_runner.flashinfer_workspace_buffer, + self.model_runner.flashinfer_workspace_buffer, ] self.flashinfer_kv_indptr = [ self.flashinfer_kv_indptr, From 2609e0169b84d4d3a17e111265f726facf8557d0 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 18:10:21 -0700 Subject: [PATCH 4/5] fix lint --- python/sglang/srt/model_executor/model_runner.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/python/sglang/srt/model_executor/model_runner.py b/python/sglang/srt/model_executor/model_runner.py index 0a0a2a8d421..6826bf1a4e9 100644 --- a/python/sglang/srt/model_executor/model_runner.py +++ b/python/sglang/srt/model_executor/model_runner.py @@ -319,10 +319,14 @@ def init_flashinfer(self): if self.sliding_window_size is None: self.flashinfer_workspace_buffer = torch.empty( - global_config.flashinfer_workspace_size, dtype=torch.uint8, device="cuda" + global_config.flashinfer_workspace_size, + dtype=torch.uint8, + device="cuda", ) - self.flashinfer_prefill_wrapper_ragged = BatchPrefillWithRaggedKVCacheWrapper( - self.flashinfer_workspace_buffer, "NHD" + self.flashinfer_prefill_wrapper_ragged = ( + BatchPrefillWithRaggedKVCacheWrapper( + self.flashinfer_workspace_buffer, "NHD" + ) ) self.flashinfer_prefill_wrapper_paged = BatchPrefillWithPagedKVCacheWrapper( self.flashinfer_workspace_buffer, "NHD" From 62fb110f3db2e90365ed31dd9a5805e6ef86b5a0 Mon Sep 17 00:00:00 2001 From: Lianmin Zheng Date: Wed, 14 Aug 2024 18:42:45 -0700 Subject: [PATCH 5/5] update --- benchmark/gsm8k/bench_sglang.py | 2 +- python/sglang/srt/model_executor/cuda_graph_runner.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py index 652086f913b..d9d4b0ab20f 100644 --- a/benchmark/gsm8k/bench_sglang.py +++ b/benchmark/gsm8k/bench_sglang.py @@ -64,7 +64,7 @@ def main(args): @sgl.function def few_shot_gsm8k(s, question): s += few_shot_examples + question - s += sgl.gen("answer", max_tokens=512, stop="Question") + s += sgl.gen("answer", max_tokens=512, stop=["Question", "Assistant:"]) ##################################### ########## SGL Program End ########## diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py index 7699b073e2e..3d4e5d4c6a9 100644 --- a/python/sglang/srt/model_executor/cuda_graph_runner.py +++ b/python/sglang/srt/model_executor/cuda_graph_runner.py @@ -123,10 +123,10 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile): self.model_runner.flashinfer_workspace_buffer ) else: - self.flashinfer_workspace_buffer = [ - self.model_runner.flashinfer_workspace_buffer, - self.model_runner.flashinfer_workspace_buffer, - ] + self.flashinfer_workspace_buffer = ( + self.model_runner.flashinfer_workspace_buffer + ) + self.flashinfer_kv_indptr = [ self.flashinfer_kv_indptr, self.flashinfer_kv_indptr.clone(), @@ -200,7 +200,7 @@ def capture_one_batch_size(self, bs, forward): for i in range(2): flashinfer_decode_wrapper.append( BatchDecodeWithPagedKVCacheWrapper( - self.flashinfer_workspace_buffer[i], + self.flashinfer_workspace_buffer, "NHD", use_cuda_graph=True, use_tensor_cores=use_tensor_cores,