update

sgl-project committed on Aug 15, 2024 · commit 62fb110 · 1 parent 2609e01
Show file tree

Hide file tree

Showing 2 changed files with 6 additions and 6 deletions.
diff --git a/benchmark/gsm8k/bench_sglang.py b/benchmark/gsm8k/bench_sglang.py
@@ -64,7 +64,7 @@ def main(args):
     @sgl.function
     def few_shot_gsm8k(s, question):
         s += few_shot_examples + question
-        s += sgl.gen("answer", max_tokens=512, stop="Question")
+        s += sgl.gen("answer", max_tokens=512, stop=["Question", "Assistant:"])
 
     #####################################
     ########## SGL Program End ##########

diff --git a/python/sglang/srt/model_executor/cuda_graph_runner.py b/python/sglang/srt/model_executor/cuda_graph_runner.py
@@ -123,10 +123,10 @@ def __init__(self, model_runner, max_batch_size_to_capture, use_torch_compile):
                 self.model_runner.flashinfer_workspace_buffer
             )
         else:
-            self.flashinfer_workspace_buffer = [
-                self.model_runner.flashinfer_workspace_buffer,
-                self.model_runner.flashinfer_workspace_buffer,
-            ]
+            self.flashinfer_workspace_buffer = (
+                self.model_runner.flashinfer_workspace_buffer
+            )
+
             self.flashinfer_kv_indptr = [
                 self.flashinfer_kv_indptr,
                 self.flashinfer_kv_indptr.clone(),
@@ -200,7 +200,7 @@ def capture_one_batch_size(self, bs, forward):
             for i in range(2):
                 flashinfer_decode_wrapper.append(
                     BatchDecodeWithPagedKVCacheWrapper(
-                        self.flashinfer_workspace_buffer[i],
+                        self.flashinfer_workspace_buffer,
                         "NHD",
                         use_cuda_graph=True,
                         use_tensor_cores=use_tensor_cores,