From 807a3f0af0b7404f917225f71b5815ab5e1e74e0 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Sat, 9 Nov 2024 01:32:55 +0000 Subject: [PATCH 01/30] add offline engine bench --- python/sglang/bench_serving.py | 79 +++++++++++++++++++++++++++++++- python/sglang/test/test_utils.py | 16 ++++--- test/srt/test_bench_serving.py | 11 ++++- 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 8bb452cd065..a63cd9fadd6 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -37,6 +37,9 @@ PreTrainedTokenizerFast, ) +from sglang.api import Engine as getEngine +from sglang.srt.server import Engine + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) global args @@ -45,7 +48,9 @@ @dataclass class RequestFuncInput: prompt: str - api_url: str + # one or the other must be defined but not both + api_url: Optional[str] + engine: Optional[Engine] prompt_len: int output_len: int model: str @@ -222,6 +227,68 @@ async def async_request_openai_completions( return output +async def async_request_sglang_offline_engine( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + engine = request_func_input.llm_engine + if not engine: + raise ValueError("Please pass in an Engine") + + prompt = request_func_input.prompt + + payload = { + "temperature": 0.0, + "best_of": 1, + "max_tokens": request_func_input.output_len, + "stream": not args.disable_stream, + "ignore_eos": not args.disable_ignore_eos, + **request_func_input.extra_request_body, + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + gen_out = await engine.async_generate(prompt, **payload) + if payload["stream"]: + async for chunk in gen_out: + latency = time.perf_counter() - st + if chunk["text"]: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += chunk["text"] + else: + if gen_out[0]["text"]: + # not sure why you'd ever want this + latency = time.perf_counter() - st + ttft = latency + output.ttft = ttft + generated_text = gen_out[0]["text"] + output.generated_text = generated_text + output.success = True + output.latency = latency + output.output_len = request_func_input.output_len + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + async def async_request_truss( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -425,6 +492,7 @@ def get_tokenizer( "sglang": async_request_sglang_generate, "sglang-native": async_request_sglang_generate, "sglang-oai": async_request_openai_completions, + "sglang-offline-engine": async_request_sglang_offline_engine, "vllm": async_request_openai_completions, "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, @@ -718,7 +786,7 @@ def calculate_metrics( async def benchmark( backend: str, - api_url: str, + api_url: Optional[str], model_id: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], @@ -730,6 +798,9 @@ async def benchmark( request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") + engine = None + if backend == 
"sglang-offline-engine": + engine = getEngine(model_path=model_id) print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len = input_requests[0] @@ -737,6 +808,7 @@ async def benchmark( model=model_id, prompt=test_prompt, api_url=api_url, + engine=engine, prompt_len=test_prompt_len, output_len=test_output_len, extra_request_body=extra_request_body, @@ -762,6 +834,7 @@ async def benchmark( model=model_id, prompt=prompt, api_url=api_url, + engine=engine, prompt_len=prompt_len, output_len=output_len, extra_request_body=extra_request_body, @@ -974,6 +1047,8 @@ def run_benchmark(args_: argparse.Namespace): if args.base_url else f"http://{args.host}:{args.port}/v1/completions" ) + elif args.backend in ["sglang-offline-engine"]: + api_url = None elif args.backend == "trt": api_url = ( f"{args.base_url}/v2/models/ensemble/generate_stream" diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 2c68a22b4df..8ab28911db1 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -522,6 +522,7 @@ def run_bench_serving( num_prompts, request_rate, other_server_args, + backend="sglang", dataset_name="random", random_input_len=4096, random_output_len=2048, @@ -529,16 +530,17 @@ def run_bench_serving( ): # Launch the server base_url = DEFAULT_URL_FOR_TEST - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_server_args, - ) + if backend == "sglang": + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_server_args, + ) # Run benchmark args = SimpleNamespace( - backend="sglang", + backend=backend, base_url=base_url, host=None, port=None, diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 6955d4917b2..5ab3de1e189 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -86,6 +86,14 @@ def test_offline_throughput_default_fp8(self): if is_in_ci(): assert res["output_throughput"] > 3100 + def test_offline_throughput_default_engine(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[], + ) + def test_online_latency_default(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST, @@ -112,6 +120,7 @@ def test_moe_offline_throughput_default(self): def test_moe_offline_throughput_without_radix_cache(self): res = run_bench_serving( + backend="sglang-offline-engine", model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, num_prompts=300, request_rate=float("inf"), @@ -119,7 +128,7 @@ def test_moe_offline_throughput_without_radix_cache(self): ) if is_in_ci(): - assert res["output_throughput"] > 1950 + assert res["output_throughput"] > 2830 if __name__ == "__main__": From e3ec6231f47c755b22abf17a7c6bfe304bbd5915 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Sat, 9 Nov 2024 01:49:44 +0000 Subject: [PATCH 02/30] llm_engine -> engine --- python/sglang/bench_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index a63cd9fadd6..53bbf8b62e0 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -231,7 +231,7 @@ async def async_request_sglang_offline_engine( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - engine = request_func_input.llm_engine + engine = request_func_input.engine if not engine: raise 
ValueError("Please pass in an Engine") From 8b1232bb48fe7bf4bc6077aff281c0cc7809d56c Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Sat, 9 Nov 2024 02:13:48 +0000 Subject: [PATCH 03/30] add to unit test bench --- python/sglang/bench_serving.py | 10 +++++----- test/srt/test_bench_serving.py | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 53bbf8b62e0..952fca9a585 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -239,12 +239,12 @@ async def async_request_sglang_offline_engine( payload = { "temperature": 0.0, - "best_of": 1, - "max_tokens": request_func_input.output_len, - "stream": not args.disable_stream, + "n": 1, + "max_new_tokens": request_func_input.output_len, "ignore_eos": not args.disable_ignore_eos, **request_func_input.extra_request_body, } + stream = not args.disable_stream output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -254,8 +254,8 @@ async def async_request_sglang_offline_engine( st = time.perf_counter() most_recent_timestamp = st try: - gen_out = await engine.async_generate(prompt, **payload) - if payload["stream"]: + gen_out = await engine.async_generate(prompt, payload, stream=stream) + if stream: async for chunk in gen_out: latency = time.perf_counter() - st if chunk["text"]: diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 5ab3de1e189..7de4fd736d0 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -94,6 +94,18 @@ def test_offline_throughput_default_engine(self): other_server_args=[], ) + def test_offline_throughput_llm_engine(self): + res = run_bench_serving( + backend="sgl-offline-engine", + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[], + ) + + if is_in_ci(): + assert res["output_throughput"] > 2830 + def test_online_latency_default(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST, @@ -120,7 +132,6 @@ def test_moe_offline_throughput_default(self): def test_moe_offline_throughput_without_radix_cache(self): res = run_bench_serving( - backend="sglang-offline-engine", model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, num_prompts=300, request_rate=float("inf"), @@ -128,7 +139,7 @@ def test_moe_offline_throughput_without_radix_cache(self): ) if is_in_ci(): - assert res["output_throughput"] > 2830 + assert res["output_throughput"] > 1950 if __name__ == "__main__": From e6293a890437b330b4ed1bb7128e39af6eb5ca61 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 22:33:06 +0000 Subject: [PATCH 04/30] first draft bench offline throughput --- python/sglang/bench_offline_throughput.py | 153 ++++++++++++++++++++++ python/sglang/bench_serving.py | 75 +---------- 2 files changed, 155 insertions(+), 73 deletions(-) create mode 100644 python/sglang/bench_offline_throughput.py diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py new file mode 100644 index 00000000000..10c1d4de0e1 --- /dev/null +++ b/python/sglang/bench_offline_throughput.py @@ -0,0 +1,153 @@ +""" +Benchmark the throughput of using the offline LLM engine. +This script does not launch a server. 
+It accepts the same arguments as bench_latency.py +""" + +import argparse +import dataclasses +import itertools +import logging +import time +from typing import Dict, List, Tuple + +import numpy as np + +from python.sglang.srt.sampling.sampling_params import SamplingParams +from sglang.api import Engine as getEngine +from sglang.srt.server import Engine +from sglang.srt.server_args import ServerArgs + + +@dataclasses.dataclass +class BenchArgs: + run_name: str = "before" + batch_size: Tuple[int] = (1,) + input_len: Tuple[int] = (1024,) + output_len: Tuple[int] = (16,) + result_filename: str = "" + # Plotting args + graph_sql: str = ( + "select run_name, batch_size, prefill_throughput from results where run_name='before'" + ) + graph_filename: str = "out.png" + + @staticmethod + def add_cli_args(parser: argparse.ArgumentParser): + parser.add_argument("--run-name", type=str, default=BenchArgs.run_name) + parser.add_argument( + "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size + ) + parser.add_argument( + "--input-len", type=int, nargs="+", default=BenchArgs.input_len + ) + parser.add_argument( + "--output-len", type=int, nargs="+", default=BenchArgs.output_len + ) + parser.add_argument( + "--result-filename", type=str, default=BenchArgs.result_filename + ) + # graphing + parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql) + parser.add_argument( + "--graph-filename", type=str, default=BenchArgs.graph_filename + ) + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + # use the default value's type to case the args into correct types. + attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)] + return cls( + **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} + ) + + +def prepare_synthetic_inputs_for_throughput_test( + batch_size: int, input_len: int, output_len: int +): + input_ids = [[1] * input_len for _ in range(batch_size)] + sampling_params = { + "temperature": 0, + "min_new_tokens": output_len, + "max_new_tokens": output_len, + } + return input_ids, sampling_params + + +def throughput_test_once( + run_name: str, + engine: Engine, + reqs: Tuple[List[List[int]], Dict], + output_len: int, +): + measurement_results = { + "run_name": run_name, + "batch_size": len(reqs[0]), + "input_len": len(reqs[0][0]), + "output_len": output_len, + } + + st = time.perf_counter() + gen_out = engine.generate(input_ids=reqs[0], sampling_params=reqs[1]) + latency = time.perf_counter() - st + + measurement_results["total_latency"] = latency + measurement_results["throughput"] = ( + (measurement_results["input_len"] + output_len) + * measurement_results["batch_size"] + ) / latency + + print( + f"Throughput: BSZ {measurement_results['batch_size']} tokens, " + f"Num sequences {len(reqs[0])}, throughput: " + f"{measurement_results['throughput']} tokens/s" + ) + return measurement_results + + +def throughput_test( + server_args, + bench_args: BenchArgs, +): + engine = getEngine(**server_args) + if not engine: + raise ValueError("Please provide valid engine arguments") + + warmup_reqs = prepare_synthetic_inputs_for_throughput_test( + bench_args.batch_size[0], bench_args.input_len[0], bench_args.output_len[0] + ) + + # Warm up + throughput_test_once("warmup", engine, warmup_reqs, bench_args.output_len[0]) + + result_list = [] + for bs, il, ol in itertools.product( + bench_args.batch_size, bench_args.input_len, bench_args.output_len + ): + reqs = prepare_synthetic_inputs_for_throughput_test(bs, il, ol) + ret = 
throughput_test_once( + bench_args.run_name, engine, reqs, bench_args.output_len[0] + ) + if ret is not None: + result_list.append(ret) + + print(result_list) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ServerArgs.add_cli_args(parser) + BenchArgs.add_cli_args(parser) + args = parser.parse_args() + server_args = ServerArgs.from_cli_args(args) + bench_args = BenchArgs.from_cli_args(args) + + logging.basicConfig( + level=getattr(logging, server_args.log_level.upper()), + format="%(message)s", + ) + + try: + throughput_test(server_args, bench_args) + except Exception as e: + raise e diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 952fca9a585..3d08f37863e 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -49,8 +49,7 @@ class RequestFuncInput: prompt: str # one or the other must be defined but not both - api_url: Optional[str] - engine: Optional[Engine] + api_url: str prompt_len: int output_len: int model: str @@ -227,68 +226,6 @@ async def async_request_openai_completions( return output -async def async_request_sglang_offline_engine( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - engine = request_func_input.engine - if not engine: - raise ValueError("Please pass in an Engine") - - prompt = request_func_input.prompt - - payload = { - "temperature": 0.0, - "n": 1, - "max_new_tokens": request_func_input.output_len, - "ignore_eos": not args.disable_ignore_eos, - **request_func_input.extra_request_body, - } - stream = not args.disable_stream - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - gen_out = await engine.async_generate(prompt, payload, stream=stream) - if stream: - async for chunk in gen_out: - latency = time.perf_counter() - st - if chunk["text"]: - timestamp = time.perf_counter() - if ttft == 0.0: - ttft = time.perf_counter() - st - output.ttft = ttft - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += chunk["text"] - else: - if gen_out[0]["text"]: - # not sure why you'd ever want this - latency = time.perf_counter() - st - ttft = latency - output.ttft = ttft - generated_text = gen_out[0]["text"] - output.generated_text = generated_text - output.success = True - output.latency = latency - output.output_len = request_func_input.output_len - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - async def async_request_truss( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -492,7 +429,6 @@ def get_tokenizer( "sglang": async_request_sglang_generate, "sglang-native": async_request_sglang_generate, "sglang-oai": async_request_openai_completions, - "sglang-offline-engine": async_request_sglang_offline_engine, "vllm": async_request_openai_completions, "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, @@ -786,7 +722,7 @@ def calculate_metrics( async def benchmark( backend: str, - api_url: Optional[str], + api_url: str, model_id: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], @@ -798,9 +734,6 @@ async def benchmark( request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") - engine = None - if 
backend == "sglang-offline-engine": - engine = getEngine(model_path=model_id) print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len = input_requests[0] @@ -808,7 +741,6 @@ async def benchmark( model=model_id, prompt=test_prompt, api_url=api_url, - engine=engine, prompt_len=test_prompt_len, output_len=test_output_len, extra_request_body=extra_request_body, @@ -834,7 +766,6 @@ async def benchmark( model=model_id, prompt=prompt, api_url=api_url, - engine=engine, prompt_len=prompt_len, output_len=output_len, extra_request_body=extra_request_body, @@ -1047,8 +978,6 @@ def run_benchmark(args_: argparse.Namespace): if args.base_url else f"http://{args.host}:{args.port}/v1/completions" ) - elif args.backend in ["sglang-offline-engine"]: - api_url = None elif args.backend == "trt": api_url = ( f"{args.base_url}/v2/models/ensemble/generate_stream" From 5564a96d009be60193a6b4442deafaebcb8d7920 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 22:59:21 +0000 Subject: [PATCH 05/30] script works --- python/sglang/bench_offline_throughput.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 10c1d4de0e1..aada2ca8b42 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -2,6 +2,10 @@ Benchmark the throughput of using the offline LLM engine. This script does not launch a server. It accepts the same arguments as bench_latency.py + +# Usage +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl + """ import argparse @@ -9,11 +13,9 @@ import itertools import logging import time +import jsonlines from typing import Dict, List, Tuple -import numpy as np - -from python.sglang.srt.sampling.sampling_params import SamplingParams from sglang.api import Engine as getEngine from sglang.srt.server import Engine from sglang.srt.server_args import ServerArgs @@ -106,10 +108,10 @@ def throughput_test_once( def throughput_test( - server_args, + server_args: ServerArgs, bench_args: BenchArgs, ): - engine = getEngine(**server_args) + engine = getEngine(**dataclasses.asdict(server_args)) if not engine: raise ValueError("Please provide valid engine arguments") @@ -131,7 +133,11 @@ def throughput_test( if ret is not None: result_list.append(ret) - print(result_list) + if bench_args.result_filename: + with jsonlines.open(bench_args.result_filename, "a") as f: + f.write_all(result_list) + else: + print(result_list) if __name__ == "__main__": From 0078bc3a0abdf53072b3a91a884bcfb362e92703 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:07:10 +0000 Subject: [PATCH 06/30] reset bench serving stuff --- python/sglang/bench_serving.py | 96 ---------------- python/sglang/test/test_utils.py | 185 +++---------------------------- test/srt/test_bench_serving.py | 20 ---- 3 files changed, 13 insertions(+), 288 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 3d08f37863e..2ca35aca95a 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -37,9 +37,6 @@ PreTrainedTokenizerFast, ) -from sglang.api import Engine as getEngine -from sglang.srt.server import Engine - AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) global args @@ -48,7 +45,6 @@ @dataclass class RequestFuncInput: prompt: str 
- # one or the other must be defined but not both api_url: str prompt_len: int output_len: int @@ -226,85 +222,6 @@ async def async_request_openai_completions( return output -async def async_request_truss( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - - prompt = request_func_input.prompt - - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - payload = { - "model": request_func_input.model, - "prompt": prompt, - "temperature": 0.0, - "best_of": 1, - "max_tokens": request_func_input.output_len, - "stream": not args.disable_stream, - "ignore_eos": not args.disable_ignore_eos, - **request_func_input.extra_request_body, - } - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") - latency = time.perf_counter() - st - if chunk == "[DONE]": - pass - else: - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if data["choices"][0]["delta"]["content"]: - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += data["choices"][0]["delta"]["content"] - - output.generated_text = generated_text - output.success = True - output.latency = latency - output.output_len = request_func_input.output_len - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - async def async_request_sglang_generate( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -433,7 +350,6 @@ def get_tokenizer( "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, "gserver": async_request_gserver, - "truss": async_request_truss, } @@ -957,7 +873,6 @@ def run_benchmark(args_: argparse.Namespace): "vllm": 8000, "trt": 8000, "gserver": 9988, - "truss": 8080, }.get(args.backend, 30000) model_url = ( @@ -990,20 +905,9 @@ def run_benchmark(args_: argparse.Namespace): elif args.backend == "gserver": api_url = args.base_url if args.base_url else f"{args.host}:{args.port}" args.model = args.model or "default" - elif args.backend == "truss": - api_url = ( - f"{args.base_url}/v1/models/model:predict" - if args.base_url - else f"http://{args.host}:{args.port}/v1/models/model:predict" - ) # Get model name if args.model is None: - if args.backend == "truss": - print( - "Please provide a model with `--model` when using truss backend. e.g. 
--model meta-llama/Llama-3.1-8B-Instruct" - ) - sys.exit(1) try: response = requests.get(model_url) model_list = response.json().get("data", []) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 8ab28911db1..8fb20c6eb04 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -3,11 +3,9 @@ import argparse import asyncio import os -import random import subprocess import threading import time -from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace from typing import Callable, List, Optional @@ -22,7 +20,6 @@ from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.utils import kill_child_process -from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8" @@ -403,7 +400,7 @@ def popen_launch_server( api_key: Optional[str] = None, other_args: tuple = (), env: Optional[dict] = None, - return_stdout_stderr: Optional[tuple] = None, + return_stdout_stderr: bool = False, ): _, host, port = base_url.split(":") host = host[2:] @@ -426,8 +423,8 @@ def popen_launch_server( if return_stdout_stderr: process = subprocess.Popen( command, - stdout=return_stdout_stderr[0], - stderr=return_stdout_stderr[1], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, env=env, text=True, ) @@ -496,7 +493,7 @@ def run_one_file(filename): ) assert ret_code == 0 except TimeoutError: - kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", @@ -522,7 +519,6 @@ def run_bench_serving( num_prompts, request_rate, other_server_args, - backend="sglang", dataset_name="random", random_input_len=4096, random_output_len=2048, @@ -530,17 +526,16 @@ def run_bench_serving( ): # Launch the server base_url = DEFAULT_URL_FOR_TEST - if backend == "sglang": - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_server_args, - ) + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_server_args, + ) # Run benchmark args = SimpleNamespace( - backend=backend, + backend="sglang", base_url=base_url, host=None, port=None, @@ -566,7 +561,7 @@ def run_bench_serving( try: res = run_benchmark(args) finally: - kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid) assert res["completed"] == num_prompts return res @@ -599,7 +594,7 @@ def run_bench_latency(model, other_args): lastline = output.split("\n")[-3] output_throughput = float(lastline.split(" ")[-2]) finally: - kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid) return output_throughput @@ -636,157 +631,3 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2): rouge_l_scores.append(fmeasure) return rouge_l_scores - - -STDOUT_FILENAME = "stdout.txt" -STDERR_FILENAME = "stderr.txt" - - -def read_output(output_lines): - """Print the output in real time with another thread.""" - while not os.path.exists(STDERR_FILENAME): - time.sleep(1) - - pt = 0 - while pt >= 0: - if pt > 0 and not os.path.exists(STDERR_FILENAME): - break - lines = open(STDERR_FILENAME).readlines() - for line in lines[pt:]: - print(line, end="", flush=True) - output_lines.append(line) - pt += 1 - time.sleep(0.1) - - -def 
run_and_check_memory_leak( - workload_func, - disable_radix_cache, - enable_mixed_chunk, - enable_overlap, - chunked_prefill_size, -): - other_args = ["--chunked-prefill-size", str(chunked_prefill_size)] - if disable_radix_cache: - other_args += ["--disable-radix-cache"] - if enable_mixed_chunk: - other_args += ["--enable-mixed-chunk"] - if enable_overlap: - other_args += ["--enable-overlap-scheduler"] - - model = DEFAULT_MODEL_NAME_FOR_TEST - port = random.randint(4000, 5000) - base_url = f"http://127.0.0.1:{port}" - - # Create files and launch the server - stdout = open(STDOUT_FILENAME, "w") - stderr = open(STDERR_FILENAME, "w") - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - return_stdout_stderr=(stdout, stderr), - ) - - # Launch a thread to stream the output - output_lines = [] - t = threading.Thread(target=read_output, args=(output_lines,)) - t.start() - - # Run the workload - workload_func(base_url, model) - - # Clean up everything - kill_child_process(process.pid, include_self=True) - kill_child_process(process.pid, include_self=True) - stdout.close() - stderr.close() - if os.path.exists(STDOUT_FILENAME): - os.remove(STDOUT_FILENAME) - if os.path.exists(STDERR_FILENAME): - os.remove(STDERR_FILENAME) - t.join() - - # Assert success - has_new_server = False - has_leak = False - for line in output_lines: - if "The server is fired" in line: - has_new_server = True - if "leak" in line: - has_leak = True - - assert has_new_server - assert not has_leak - - -def run_mmlu_test( - disable_radix_cache=False, - enable_mixed_chunk=False, - enable_overlap=False, - chunked_prefill_size=32, -): - def workload_func(base_url, model): - # Run the eval - args = SimpleNamespace( - base_url=base_url, - model=model, - eval_name="mmlu", - num_examples=128, - num_threads=128, - ) - - try: - metrics = run_eval(args) - print(f"{metrics=}") - assert metrics["score"] >= 0.65 - finally: - pass - - run_and_check_memory_leak( - workload_func, - disable_radix_cache, - enable_mixed_chunk, - enable_overlap, - chunked_prefill_size, - ) - - -def run_mulit_request_test( - disable_radix_cache=False, - enable_mixed_chunk=False, - enable_overlap=False, - chunked_prefill_size=32, -): - - def workload_func(base_url, model): - def run_one(_): - prompt = """ - System: You are a helpful assistant. - User: What is the capital of France? 
- Assistant: The capital of France is - """ - - response = requests.post( - f"{base_url}/generate", - json={ - "text": prompt, - "sampling_params": { - "temperature": 0, - "max_new_tokens": 8, - }, - }, - ) - ret = response.json() - - with ThreadPoolExecutor(2) as executor: - list(executor.map(run_one, list(range(4)))) - - run_and_check_memory_leak( - workload_func, - disable_radix_cache, - enable_mixed_chunk, - enable_overlap, - chunked_prefill_size, - ) diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 7de4fd736d0..6955d4917b2 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -86,26 +86,6 @@ def test_offline_throughput_default_fp8(self): if is_in_ci(): assert res["output_throughput"] > 3100 - def test_offline_throughput_default_engine(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[], - ) - - def test_offline_throughput_llm_engine(self): - res = run_bench_serving( - backend="sgl-offline-engine", - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[], - ) - - if is_in_ci(): - assert res["output_throughput"] > 2830 - def test_online_latency_default(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST, From 31584145e53f3aa482c912887b8b4d9d0b3b91f0 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:10:36 +0000 Subject: [PATCH 07/30] most recent commit? --- python/sglang/bench_serving.py | 92 ++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index d64901cf1fa..74f77565ef2 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -222,6 +222,85 @@ async def async_request_openai_completions( return output +async def async_request_truss( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + + prompt = request_func_input.prompt + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + "model": request_func_input.model, + "prompt": prompt, + "temperature": 0.0, + "best_of": 1, + "max_tokens": request_func_input.output_len, + "stream": not args.disable_stream, + "ignore_eos": not args.disable_ignore_eos, + **request_func_input.extra_request_body, + } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + latency = time.perf_counter() - st + if chunk == "[DONE]": + pass + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data["choices"][0]["delta"]["content"]: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + 
generated_text += data["choices"][0]["delta"]["content"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + output.output_len = request_func_input.output_len + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + async def async_request_sglang_generate( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -350,6 +429,7 @@ def get_tokenizer( "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, "gserver": async_request_gserver, + "truss": async_request_truss, } @@ -933,6 +1013,7 @@ def run_benchmark(args_: argparse.Namespace): "vllm": 8000, "trt": 8000, "gserver": 9988, + "truss": 8080, }.get(args.backend, 30000) model_url = ( @@ -965,9 +1046,20 @@ def run_benchmark(args_: argparse.Namespace): elif args.backend == "gserver": api_url = args.base_url if args.base_url else f"{args.host}:{args.port}" args.model = args.model or "default" + elif args.backend == "truss": + api_url = ( + f"{args.base_url}/v1/models/model:predict" + if args.base_url + else f"http://{args.host}:{args.port}/v1/models/model:predict" + ) # Get model name if args.model is None: + if args.backend == "truss": + print( + "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct" + ) + sys.exit(1) try: response = requests.get(model_url) model_list = response.json().get("data", []) From 550ec14c2e9cdf886d6f13834eecef5167fdae8e Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:11:30 +0000 Subject: [PATCH 08/30] restore test utils --- python/sglang/test/test_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 41e247fcd10..f136a4d1b64 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -3,9 +3,11 @@ import argparse import asyncio import os +import random import subprocess import threading import time +from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace from typing import Callable, List, Optional @@ -20,6 +22,7 @@ from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8" @@ -402,7 +405,7 @@ def popen_launch_server( api_key: Optional[str] = None, other_args: tuple = (), env: Optional[dict] = None, - return_stdout_stderr: bool = False, + return_stdout_stderr: Optional[tuple] = None, ): _, host, port = base_url.split(":") host = host[2:] @@ -425,8 +428,8 @@ def popen_launch_server( if return_stdout_stderr: process = subprocess.Popen( command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stdout=return_stdout_stderr[0], + stderr=return_stdout_stderr[1], env=env, text=True, ) @@ -495,7 +498,7 @@ def run_one_file(filename): ) assert ret_code == 0 except TimeoutError: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", @@ -563,7 +566,7 @@ def run_bench_serving( try: res = 
run_benchmark(args) finally: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) assert res["completed"] == num_prompts return res @@ -596,7 +599,7 @@ def run_bench_latency(model, other_args): lastline = output.split("\n")[-3] output_throughput = float(lastline.split(" ")[-2]) finally: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) return output_throughput From c1c62268fcb1779933d532100c2836288821c0d3 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:15:46 +0000 Subject: [PATCH 09/30] lint --- python/sglang/bench_offline_throughput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index aada2ca8b42..9588cedea2e 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -13,9 +13,10 @@ import itertools import logging import time -import jsonlines from typing import Dict, List, Tuple +import jsonlines + from sglang.api import Engine as getEngine from sglang.srt.server import Engine from sglang.srt.server_args import ServerArgs From 1895c79d3b4e6c85f5c6aa9656a55e036c6ae4f8 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 02:27:22 +0000 Subject: [PATCH 10/30] use sharegpt from bench_serving --- python/sglang/bench_offline_throughput.py | 153 ++++++++++++++-------- 1 file changed, 96 insertions(+), 57 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 9588cedea2e..88192eeef11 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -12,11 +12,13 @@ import dataclasses import itertools import logging +import random import time from typing import Dict, List, Tuple +import json -import jsonlines - +import numpy as np +from sglang.bench_serving import set_ulimit, sample_sharegpt_requests, sample_random_requests, get_tokenizer from sglang.api import Engine as getEngine from sglang.srt.server import Engine from sglang.srt.server_args import ServerArgs @@ -25,36 +27,56 @@ @dataclasses.dataclass class BenchArgs: run_name: str = "before" - batch_size: Tuple[int] = (1,) - input_len: Tuple[int] = (1024,) - output_len: Tuple[int] = (16,) result_filename: str = "" - # Plotting args - graph_sql: str = ( - "select run_name, batch_size, prefill_throughput from results where run_name='before'" - ) - graph_filename: str = "out.png" + seed: int = 1 @staticmethod def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument("--run-name", type=str, default=BenchArgs.run_name) parser.add_argument( - "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size + "--result-filename", type=str, default=BenchArgs.result_filename + ) + parser.add_argument( + "--dataset-name", + type=str, + default="sharegpt", + choices=["sharegpt", "random", "generated-shared-prefix"], + help="Name of the dataset to benchmark on.", ) parser.add_argument( - "--input-len", type=int, nargs="+", default=BenchArgs.input_len + "--dataset-path", type=str, default="", help="Path to the dataset." ) parser.add_argument( - "--output-len", type=int, nargs="+", default=BenchArgs.output_len + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process. 
Default is 1000.", ) parser.add_argument( - "--result-filename", type=str, default=BenchArgs.result_filename + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length from the ShareGPT dataset.", ) - # graphing - parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql) parser.add_argument( - "--graph-filename", type=str, default=BenchArgs.graph_filename + "--random-input-len", + type=int, + help="Number of input tokens per request, used only for random dataset.", ) + parser.add_argument( + "--random-output-len", + type=int, + help="Number of output tokens per request, used only for random dataset.", + ) + parser.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range of sampled ratio of input/output length, " + "used only for random dataset.", + ) + parser.add_argument("--seed", type=int, default=1, help="The random seed.") + @classmethod def from_cli_args(cls, args: argparse.Namespace): @@ -65,45 +87,29 @@ def from_cli_args(cls, args: argparse.Namespace): ) -def prepare_synthetic_inputs_for_throughput_test( - batch_size: int, input_len: int, output_len: int -): - input_ids = [[1] * input_len for _ in range(batch_size)] - sampling_params = { - "temperature": 0, - "min_new_tokens": output_len, - "max_new_tokens": output_len, - } - return input_ids, sampling_params - - def throughput_test_once( run_name: str, engine: Engine, - reqs: Tuple[List[List[int]], Dict], - output_len: int, + reqs: List[Tuple[str, int, int]], ): measurement_results = { "run_name": run_name, - "batch_size": len(reqs[0]), - "input_len": len(reqs[0][0]), - "output_len": output_len, + "total_input_tokens": sum(r[1] for r in reqs), } st = time.perf_counter() - gen_out = engine.generate(input_ids=reqs[0], sampling_params=reqs[1]) + gen_out = engine.generate(prompt=[r[0] for r in reqs], sampling_params={ "temperature": 0 }) latency = time.perf_counter() - st measurement_results["total_latency"] = latency + measurement_results["total_output_tokens"] = sum(o["meta_info"]["completion_tokens"] for o in gen_out) measurement_results["throughput"] = ( - (measurement_results["input_len"] + output_len) - * measurement_results["batch_size"] + measurement_results["total_input_tokens"] + + measurement_results["total_output_tokens"] ) / latency print( - f"Throughput: BSZ {measurement_results['batch_size']} tokens, " - f"Num sequences {len(reqs[0])}, throughput: " - f"{measurement_results['throughput']} tokens/s" + f"Throughput: {measurement_results['throughput']} tokens/s" ) return measurement_results @@ -116,29 +122,62 @@ def throughput_test( if not engine: raise ValueError("Please provide valid engine arguments") - warmup_reqs = prepare_synthetic_inputs_for_throughput_test( - bench_args.batch_size[0], bench_args.input_len[0], bench_args.output_len[0] + tokenizer_id = args.model_path + tokenizer = get_tokenizer(tokenizer_id) + + # Set global environmnets + set_ulimit() + random.seed(bench_args.seed) + np.random.seed(bench_args.seed) + + if args.dataset_name == "sharegpt": + assert args.random_input_len is None and args.random_output_len is None + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + elif args.dataset_name == "random": + assert args.random_input_len is not None and args.random_output_len is not None + input_requests = sample_random_requests( + input_len=args.random_input_len, + 
output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + dataset_path=args.dataset_path, + ) + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + + warmup_requests = sample_random_requests( + input_len=20, + output_len=4, + num_prompts=2, + range_ratio=0.8, + tokenizer=tokenizer, + dataset_path=args.dataset_path, ) # Warm up - throughput_test_once("warmup", engine, warmup_reqs, bench_args.output_len[0]) - - result_list = [] - for bs, il, ol in itertools.product( - bench_args.batch_size, bench_args.input_len, bench_args.output_len - ): - reqs = prepare_synthetic_inputs_for_throughput_test(bs, il, ol) - ret = throughput_test_once( - bench_args.run_name, engine, reqs, bench_args.output_len[0] - ) - if ret is not None: - result_list.append(ret) + throughput_test_once( + run_name="warmup", + engine=engine, + reqs=warmup_requests, + ) + + result = throughput_test_once( + run_name=bench_args.run_name, + engine=engine, + reqs=input_requests, + ) if bench_args.result_filename: - with jsonlines.open(bench_args.result_filename, "a") as f: - f.write_all(result_list) + with open(bench_args.result_filename, "a") as fout: + fout.write(json.dumps(result) + "\n") else: - print(result_list) + print(result) if __name__ == "__main__": From 3c8faf9b8c57a4d48ae6d01012b1b3fdc3d435b7 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 02:31:16 +0000 Subject: [PATCH 11/30] add unit test --- test/srt/test_srt_engine.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index a375c2900d5..fb6f3dbf5b7 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -18,6 +18,7 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ) +from sglang.bench_offline_throughput import throughput_test class TestSRTEngine(unittest.TestCase): @@ -152,6 +153,19 @@ def test_6_engine_runtime_encode_consistency(self): self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3)) + def test_7_engine_offline_throughput(self): + server_args = SimpleNamespace( + model_path=DEFAULT_MODEL_NAME_FOR_TEST, + ) + bench_args = SimpleNamespace( + num_prompts=10 + ) + result = throughput_test( + server_args=server_args, + bench_args=bench_args + ) + self.assertTrue(result["throughput"] > 3800) + if __name__ == "__main__": unittest.main() From 170c83f94b70391279b6b16254861cf46a7b3dd1 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 02:32:36 +0000 Subject: [PATCH 12/30] lint --- python/sglang/bench_offline_throughput.py | 27 ++++++++++++++--------- test/srt/test_srt_engine.py | 11 +++------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 88192eeef11..bac75e86846 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -11,15 +11,21 @@ import argparse import dataclasses import itertools +import json import logging import random import time from typing import Dict, List, Tuple -import json import numpy as np -from sglang.bench_serving import set_ulimit, sample_sharegpt_requests, sample_random_requests, get_tokenizer + from sglang.api import Engine as getEngine +from sglang.bench_serving import ( + get_tokenizer, + sample_random_requests, + sample_sharegpt_requests, + set_ulimit, +) from sglang.srt.server import Engine from sglang.srt.server_args import 
ServerArgs @@ -77,7 +83,6 @@ def add_cli_args(parser: argparse.ArgumentParser): ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") - @classmethod def from_cli_args(cls, args: argparse.Namespace): # use the default value's type to case the args into correct types. @@ -98,19 +103,21 @@ def throughput_test_once( } st = time.perf_counter() - gen_out = engine.generate(prompt=[r[0] for r in reqs], sampling_params={ "temperature": 0 }) + gen_out = engine.generate( + prompt=[r[0] for r in reqs], sampling_params={"temperature": 0} + ) latency = time.perf_counter() - st measurement_results["total_latency"] = latency - measurement_results["total_output_tokens"] = sum(o["meta_info"]["completion_tokens"] for o in gen_out) + measurement_results["total_output_tokens"] = sum( + o["meta_info"]["completion_tokens"] for o in gen_out + ) measurement_results["throughput"] = ( - measurement_results["total_input_tokens"] + - measurement_results["total_output_tokens"] + measurement_results["total_input_tokens"] + + measurement_results["total_output_tokens"] ) / latency - print( - f"Throughput: {measurement_results['throughput']} tokens/s" - ) + print(f"Throughput: {measurement_results['throughput']} tokens/s") return measurement_results diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index fb6f3dbf5b7..5e778f1655d 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -11,6 +11,7 @@ import torch import sglang as sgl +from sglang.bench_offline_throughput import throughput_test from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.test.few_shot_gsm8k_engine import run_eval from sglang.test.test_utils import ( @@ -18,7 +19,6 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ) -from sglang.bench_offline_throughput import throughput_test class TestSRTEngine(unittest.TestCase): @@ -157,13 +157,8 @@ def test_7_engine_offline_throughput(self): server_args = SimpleNamespace( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = SimpleNamespace( - num_prompts=10 - ) - result = throughput_test( - server_args=server_args, - bench_args=bench_args - ) + bench_args = SimpleNamespace(num_prompts=10) + result = throughput_test(server_args=server_args, bench_args=bench_args) self.assertTrue(result["throughput"] > 3800) From 696dd95b65ee99a98755f457b353c04ef7223aba Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 21:02:05 +0000 Subject: [PATCH 13/30] add support for runtime backend + dataclass generic args --- python/sglang/bench_offline_throughput.py | 25 +++++++++++++++-------- python/sglang/test/test_utils.py | 10 ++++++++- test/srt/test_srt_engine.py | 5 +++-- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index bac75e86846..d33b88b5b0a 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -15,7 +15,7 @@ import logging import random import time -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import numpy as np @@ -26,7 +26,7 @@ sample_sharegpt_requests, set_ulimit, ) -from sglang.srt.server import Engine +from sglang.srt.server import Engine, Runtime from sglang.srt.server_args import ServerArgs @@ -35,10 +35,12 @@ class BenchArgs: run_name: str = "before" result_filename: str = "" seed: int = 1 + backend: str = "engine" @staticmethod def add_cli_args(parser: argparse.ArgumentParser): 
parser.add_argument("--run-name", type=str, default=BenchArgs.run_name) + parser.add_argument("--backend", type=str, default=BenchArgs.backend) parser.add_argument( "--result-filename", type=str, default=BenchArgs.result_filename ) @@ -94,7 +96,7 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( run_name: str, - engine: Engine, + backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], ): measurement_results = { @@ -103,7 +105,7 @@ def throughput_test_once( } st = time.perf_counter() - gen_out = engine.generate( + gen_out = backend.generate( prompt=[r[0] for r in reqs], sampling_params={"temperature": 0} ) latency = time.perf_counter() - st @@ -125,9 +127,14 @@ def throughput_test( server_args: ServerArgs, bench_args: BenchArgs, ): - engine = getEngine(**dataclasses.asdict(server_args)) - if not engine: - raise ValueError("Please provide valid engine arguments") + if bench_args.backend == "engine": + backend = getEngine(**dataclasses.asdict(server_args)) + if not backend: + raise ValueError("Please provide valid engine arguments") + elif bench_args.backend == "runtime": + backend = Runtime(**dataclasses.asdict(server_args)) + else: + raise ValueError('Please set backend to either "engine" or "runtime"') tokenizer_id = args.model_path tokenizer = get_tokenizer(tokenizer_id) @@ -170,13 +177,13 @@ def throughput_test( # Warm up throughput_test_once( run_name="warmup", - engine=engine, + backend=backend, reqs=warmup_requests, ) result = throughput_test_once( run_name=bench_args.run_name, - engine=engine, + backend=backend, reqs=input_requests, ) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index f136a4d1b64..4b59bdff173 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -10,7 +10,8 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Dict, Any +import dataclasses import numpy as np import requests @@ -39,6 +40,13 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" +@dataclasses.dataclass +class GenericArgs: + __dict__: Dict[str, Any] = dataclasses.field(default_factory=dict) + + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + def is_in_ci(): """Return whether it is in CI runner.""" diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 5e778f1655d..834cbb866f6 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -18,6 +18,7 @@ DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + GenericArgs, ) @@ -154,10 +155,10 @@ def test_6_engine_runtime_encode_consistency(self): self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3)) def test_7_engine_offline_throughput(self): - server_args = SimpleNamespace( + server_args = GenericArgs( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = SimpleNamespace(num_prompts=10) + bench_args = GenericArgs(num_prompts=10) result = throughput_test(server_args=server_args, bench_args=bench_args) 
self.assertTrue(result["throughput"] > 3800) From 21b6ed5c4c4dd64f08673d2efc589725ada8eb9f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 23:42:42 +0000 Subject: [PATCH 14/30] push not being processed? From 0589a6bcfad69c236fe65719cfc921b13fc57e0f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 23:46:00 +0000 Subject: [PATCH 15/30] lint --- python/sglang/test/test_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 4b59bdff173..af4abe4ceb4 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -2,6 +2,7 @@ import argparse import asyncio +import dataclasses import os import random import subprocess @@ -10,8 +11,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace -from typing import Callable, List, Optional, Dict, Any -import dataclasses +from typing import Any, Callable, Dict, List, Optional import numpy as np import requests @@ -40,6 +40,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" + @dataclasses.dataclass class GenericArgs: __dict__: Dict[str, Any] = dataclasses.field(default_factory=dict) From 383b6d1a7091295cee395b398ff1840f44fc8328 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 01:59:32 +0000 Subject: [PATCH 16/30] fix benches --- python/sglang/bench_offline_throughput.py | 56 +++++++++++++++-------- python/sglang/test/test_utils.py | 10 +--- test/srt/test_srt_engine.py | 10 ++-- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index d33b88b5b0a..557b810de06 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -33,9 +33,16 @@ @dataclasses.dataclass class BenchArgs: run_name: str = "before" + backend: str = "engine" result_filename: str = "" + dataset_name: str = "sharegpt" + dataset_path: str = "" + num_prompts: int = 1000 + sharegpt_output_len: Union[int, None] = None + random_input_len: Union[int, None] = None + random_output_len: Union[int, None] = None + random_range_ratio: Union[int, None] = None seed: int = 1 - backend: str = "engine" @staticmethod def add_cli_args(parser: argparse.ArgumentParser): @@ -57,7 +64,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--num-prompts", type=int, - default=1000, + default=BenchArgs.num_prompts, help="Number of prompts to process. 
Default is 1000.", ) parser.add_argument( @@ -96,6 +103,7 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( run_name: str, + backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], ): @@ -110,6 +118,9 @@ def throughput_test_once( ) latency = time.perf_counter() - st + if backend_name == "runtime": + gen_out = json.loads(gen_out) + measurement_results["total_latency"] = latency measurement_results["total_output_tokens"] = sum( o["meta_info"]["completion_tokens"] for o in gen_out @@ -136,7 +147,7 @@ def throughput_test( else: raise ValueError('Please set backend to either "engine" or "runtime"') - tokenizer_id = args.model_path + tokenizer_id = server_args.model_path tokenizer = get_tokenizer(tokenizer_id) # Set global environmnets @@ -144,26 +155,31 @@ def throughput_test( random.seed(bench_args.seed) np.random.seed(bench_args.seed) - if args.dataset_name == "sharegpt": - assert args.random_input_len is None and args.random_output_len is None + if bench_args.dataset_name == "sharegpt": + assert ( + bench_args.random_input_len is None and bench_args.random_output_len is None + ) input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, + dataset_path=bench_args.dataset_path, + num_requests=bench_args.num_prompts, tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, + fixed_output_len=bench_args.sharegpt_output_len, + ) + elif bench_args.dataset_name == "random": + assert ( + bench_args.random_input_len is not None + and bench_args.random_output_len is not None ) - elif args.dataset_name == "random": - assert args.random_input_len is not None and args.random_output_len is not None input_requests = sample_random_requests( - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, + input_len=bench_args.random_input_len, + output_len=bench_args.random_output_len, + num_prompts=bench_args.num_prompts, + range_ratio=bench_args.random_range_ratio, tokenizer=tokenizer, - dataset_path=args.dataset_path, + dataset_path=bench_args.dataset_path, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") warmup_requests = sample_random_requests( input_len=20, @@ -171,18 +187,20 @@ def throughput_test( num_prompts=2, range_ratio=0.8, tokenizer=tokenizer, - dataset_path=args.dataset_path, + dataset_path=bench_args.dataset_path, ) # Warm up throughput_test_once( run_name="warmup", + backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, ) result = throughput_test_once( run_name=bench_args.run_name, + backend_name=bench_args.backend, backend=backend, reqs=input_requests, ) @@ -191,7 +209,7 @@ def throughput_test( with open(bench_args.result_filename, "a") as fout: fout.write(json.dumps(result) + "\n") else: - print(result) + return result if __name__ == "__main__": @@ -208,6 +226,6 @@ def throughput_test( ) try: - throughput_test(server_args, bench_args) + print(throughput_test(server_args, bench_args)) except Exception as e: raise e diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index af4abe4ceb4..8b979ead6d1 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -11,7 +11,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace -from typing import Any, Callable, Dict, List, Optional 
+from typing import Callable, List, Optional import numpy as np import requests @@ -41,14 +41,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" -@dataclasses.dataclass -class GenericArgs: - __dict__: Dict[str, Any] = dataclasses.field(default_factory=dict) - - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - def is_in_ci(): """Return whether it is in CI runner.""" return os.getenv("SGLANG_IS_IN_CI", "false") == "true" diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 834cbb866f6..16e23a92bad 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -11,14 +11,14 @@ import torch import sglang as sgl -from sglang.bench_offline_throughput import throughput_test +from sglang.bench_offline_throughput import throughput_test, BenchArgs +from sglang.srt.server_args import ServerArgs from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.test.few_shot_gsm8k_engine import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, - GenericArgs, ) @@ -155,12 +155,12 @@ def test_6_engine_runtime_encode_consistency(self): self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3)) def test_7_engine_offline_throughput(self): - server_args = GenericArgs( + server_args = ServerArgs( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = GenericArgs(num_prompts=10) + bench_args = BenchArgs(num_prompts=10) result = throughput_test(server_args=server_args, bench_args=bench_args) - self.assertTrue(result["throughput"] > 3800) + self.assertTrue(result["throughput"] > 3000) if __name__ == "__main__": From 8db0340a41649afd253b38495e3ba11c0e864883 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 02:01:31 +0000 Subject: [PATCH 17/30] lint --- python/sglang/test/test_utils.py | 1 - test/srt/test_srt_engine.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 8b979ead6d1..f136a4d1b64 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -2,7 +2,6 @@ import argparse import asyncio -import dataclasses import os import random import subprocess diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 16e23a92bad..6170118950c 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -11,9 +11,9 @@ import torch import sglang as sgl -from sglang.bench_offline_throughput import throughput_test, BenchArgs -from sglang.srt.server_args import ServerArgs +from sglang.bench_offline_throughput import BenchArgs, throughput_test from sglang.srt.hf_transformers_utils import get_tokenizer +from sglang.srt.server_args import ServerArgs from sglang.test.few_shot_gsm8k_engine import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, From c6a6827003add1e584f4ffada0dea92aa57e83b8 Mon Sep 17 00:00:00 2001 From: ByronHsu Date: Wed, 13 Nov 2024 05:19:15 +0000 Subject: [PATCH 18/30] add review --- python/sglang/bench_offline_throughput.py | 31 +++++++++++++---------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 557b810de06..ac80a1d2997 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ 
-4,8 +4,10 @@ It accepts the same arguments as bench_latency.py # Usage +# TODO: is this runnable? python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl +# TODO: add running command for shared gpt, random, and gen-shared-prefix dataset """ import argparse @@ -32,16 +34,22 @@ @dataclasses.dataclass class BenchArgs: + # TODO: what does "before" mean run_name: str = "before" backend: str = "engine" result_filename: str = "" dataset_name: str = "sharegpt" dataset_path: str = "" num_prompts: int = 1000 - sharegpt_output_len: Union[int, None] = None - random_input_len: Union[int, None] = None - random_output_len: Union[int, None] = None - random_range_ratio: Union[int, None] = None + # TODO: with None, the program crashes with + # bench_offline_throughput.py", line 101, in + # **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} + # TypeError: NoneType takes no arguments + # Ideally we want to make it easier to run with specified default values, so users dont have to keep trial and errors + sharegpt_output_len: int = 256 + random_input_len: int = 256 + random_output_len: int = 256 + random_range_ratio: float = 0.0 seed: int = 1 @staticmethod @@ -70,23 +78,25 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--sharegpt-output-len", type=int, - default=None, + default=BenchArgs.sharegpt_output_len, help="Output length for each request. Overrides the output length from the ShareGPT dataset.", ) parser.add_argument( "--random-input-len", type=int, + default=BenchArgs.random_input_len, help="Number of input tokens per request, used only for random dataset.", ) parser.add_argument( "--random-output-len", type=int, + default=BenchArgs.random_output_len, help="Number of output tokens per request, used only for random dataset.", ) parser.add_argument( "--random-range-ratio", type=float, - default=0.0, + default=BenchArgs.random_range_ratio, help="Range of sampled ratio of input/output length, " "used only for random dataset.", ) @@ -96,6 +106,7 @@ def add_cli_args(parser: argparse.ArgumentParser): def from_cli_args(cls, args: argparse.Namespace): # use the default value's type to case the args into correct types. 
attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)] + print(attrs) return cls( **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} ) @@ -156,9 +167,6 @@ def throughput_test( np.random.seed(bench_args.seed) if bench_args.dataset_name == "sharegpt": - assert ( - bench_args.random_input_len is None and bench_args.random_output_len is None - ) input_requests = sample_sharegpt_requests( dataset_path=bench_args.dataset_path, num_requests=bench_args.num_prompts, @@ -166,10 +174,6 @@ def throughput_test( fixed_output_len=bench_args.sharegpt_output_len, ) elif bench_args.dataset_name == "random": - assert ( - bench_args.random_input_len is not None - and bench_args.random_output_len is not None - ) input_requests = sample_random_requests( input_len=bench_args.random_input_len, output_len=bench_args.random_output_len, @@ -178,6 +182,7 @@ def throughput_test( tokenizer=tokenizer, dataset_path=bench_args.dataset_path, ) + # TODO: gen-shared-prefix dataset else: raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") From ed1a1333e607296213571ae35c8a57b694eacca9 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 07:07:02 +0000 Subject: [PATCH 19/30] address todos --- python/sglang/bench_offline_throughput.py | 63 +++++++++++++++++------ 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index ac80a1d2997..8fc892ac2ad 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -4,8 +4,14 @@ It accepts the same arguments as bench_latency.py # Usage -# TODO: is this runnable? -python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl +## Sharegpt dataset with default args +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct + +## Random dataset with default args +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend random + +## Shared prefix dataset with default args +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend generated-shared-prefix # TODO: add running command for shared gpt, random, and gen-shared-prefix dataset """ @@ -26,6 +32,7 @@ get_tokenizer, sample_random_requests, sample_sharegpt_requests, + sample_generated_shared_prefix_requests, set_ulimit, ) from sglang.srt.server import Engine, Runtime @@ -34,27 +41,23 @@ @dataclasses.dataclass class BenchArgs: - # TODO: what does "before" mean - run_name: str = "before" backend: str = "engine" result_filename: str = "" dataset_name: str = "sharegpt" dataset_path: str = "" num_prompts: int = 1000 - # TODO: with None, the program crashes with - # bench_offline_throughput.py", line 101, in - # **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} - # TypeError: NoneType takes no arguments - # Ideally we want to make it easier to run with specified default values, so users dont have to keep trial and errors sharegpt_output_len: int = 256 random_input_len: int = 256 random_output_len: int = 256 random_range_ratio: float = 0.0 + gen_num_groups: int = (8,) + gen_prompts_per_group: int = (16,) + gen_system_prompt_len: int = (128,) + gen_question_len: int = (256,) seed: int = 1 @staticmethod def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument("--run-name", 
type=str, default=BenchArgs.run_name) parser.add_argument("--backend", type=str, default=BenchArgs.backend) parser.add_argument( "--result-filename", type=str, default=BenchArgs.result_filename @@ -100,6 +103,32 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Range of sampled ratio of input/output length, " "used only for random dataset.", ) + parser.add_argument( + "--gen-num-groups", + type=int, + default=BenchArgs.gen_num_groups, + help="Number of groups with shared prefix, used" + "only for generate-shared-prefix", + ) + parser.add_argument( + "--gen-prompts-per-group", + type=int, + default=BenchArgs.gen_prompts_per_group, + help="Number of prompts per group of shared prefix, used" + "only for generate-shared-prefix", + ) + parser.add_argument( + "--gen-system-prompt-len", + type=int, + default=BenchArgs.gen_system_prompt_len, + help="System prompt length, used" "only for generate-shared-prefix", + ) + parser.add_argument( + "--gen-question-len", + type=int, + default=BenchArgs.gen_question_len, + help="Question length, used" "only for generate-shared-prefix", + ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") @classmethod @@ -113,13 +142,11 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( - run_name: str, backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], ): measurement_results = { - "run_name": run_name, "total_input_tokens": sum(r[1] for r in reqs), } @@ -182,7 +209,15 @@ def throughput_test( tokenizer=tokenizer, dataset_path=bench_args.dataset_path, ) - # TODO: gen-shared-prefix dataset + elif bench_args.dataset_name == "generated-shared-prefix": + input_requests = sample_generated_shared_prefix_requests( + num_groups=bench_args.gen_num_groups, + prompts_per_group=bench_args.gen_prompts_per_group, + system_prompt_len=bench_args.gen_system_prompt_len, + question_len=bench_args.gen_question_len, + output_len=bench_args.gen_output_len, + tokenizer=tokenizer, + ) else: raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") @@ -197,14 +232,12 @@ def throughput_test( # Warm up throughput_test_once( - run_name="warmup", backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, ) result = throughput_test_once( - run_name=bench_args.run_name, backend_name=bench_args.backend, backend=backend, reqs=input_requests, From c485dbeeefade7841c468f33d5494e45a1b98f9f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 08:54:18 +0000 Subject: [PATCH 20/30] not sure how the tuple stuff got there --- python/sglang/bench_offline_throughput.py | 10 +++++----- python/sglang/bench_serving.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 8fc892ac2ad..9a86ef2ace1 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -30,9 +30,9 @@ from sglang.api import Engine as getEngine from sglang.bench_serving import ( get_tokenizer, + sample_generated_shared_prefix_requests, sample_random_requests, sample_sharegpt_requests, - sample_generated_shared_prefix_requests, set_ulimit, ) from sglang.srt.server import Engine, Runtime @@ -50,10 +50,10 @@ class BenchArgs: random_input_len: int = 256 random_output_len: int = 256 random_range_ratio: float = 0.0 - gen_num_groups: int = (8,) - gen_prompts_per_group: int = (16,) - gen_system_prompt_len: int = (128,) - gen_question_len: int = (256,) + gen_num_groups: int = 8 + 
gen_prompts_per_group: int = 16 + gen_system_prompt_len: int = 128 + gen_question_len: int = 256 seed: int = 1 @staticmethod diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index c0cf946ede9..686be14b910 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -590,7 +590,6 @@ def sample_random_requests( (data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset ] - # Shuffle the dataset. random.shuffle(dataset) From fd2d04ddef3683ca2d6eb8eaa61d829180dffa1b Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 09:56:28 +0000 Subject: [PATCH 21/30] fix --- python/sglang/bench_offline_throughput.py | 33 ++++++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 9a86ef2ace1..018567cb3c3 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -25,8 +25,6 @@ import time from typing import Dict, List, Tuple, Union -import numpy as np - from sglang.api import Engine as getEngine from sglang.bench_serving import ( get_tokenizer, @@ -54,6 +52,7 @@ class BenchArgs: gen_prompts_per_group: int = 16 gen_system_prompt_len: int = 128 gen_question_len: int = 256 + disable_ignore_eos: bool = False seed: int = 1 @staticmethod @@ -129,6 +128,12 @@ def add_cli_args(parser: argparse.ArgumentParser): default=BenchArgs.gen_question_len, help="Question length, used" "only for generate-shared-prefix", ) + parser.add_argument( + "--disable-ignore-eos", + type=bool, + default=BenchArgs.disable_ignore_eos, + help="Disable ignore EOS token", + ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") @classmethod @@ -145,15 +150,22 @@ def throughput_test_once( backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], + output_len: int, + ignore_eos: bool, ): measurement_results = { "total_input_tokens": sum(r[1] for r in reqs), } + prompt = [r[0] for r in reqs] + sampling_params = { + "temperature": 0, + "max_new_tokens": output_len, + "ignore_eos": ignore_eos, + } + st = time.perf_counter() - gen_out = backend.generate( - prompt=[r[0] for r in reqs], sampling_params={"temperature": 0} - ) + gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params) latency = time.perf_counter() - st if backend_name == "runtime": @@ -200,6 +212,7 @@ def throughput_test( tokenizer=tokenizer, fixed_output_len=bench_args.sharegpt_output_len, ) + output_len = bench_args.sharegpt_output_len elif bench_args.dataset_name == "random": input_requests = sample_random_requests( input_len=bench_args.random_input_len, @@ -209,6 +222,7 @@ def throughput_test( tokenizer=tokenizer, dataset_path=bench_args.dataset_path, ) + output_len = bench_args.random_output_len elif bench_args.dataset_name == "generated-shared-prefix": input_requests = sample_generated_shared_prefix_requests( num_groups=bench_args.gen_num_groups, @@ -218,6 +232,7 @@ def throughput_test( output_len=bench_args.gen_output_len, tokenizer=tokenizer, ) + output_len = bench_args.gen_output_len else: raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") @@ -235,19 +250,23 @@ def throughput_test( backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, + output_len=output_len, + ignore_eos=not bench_args.disable_ignore_eos, ) result = throughput_test_once( backend_name=bench_args.backend, backend=backend, reqs=input_requests, + 
output_len=output_len, + ignore_eos=not bench_args.disable_ignore_eos, ) if bench_args.result_filename: with open(bench_args.result_filename, "a") as fout: fout.write(json.dumps(result) + "\n") - else: - return result + + return result if __name__ == "__main__": From ea3b60a3d989d9321b9c79044c82ca6b8c3aa82f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 20:56:25 +0000 Subject: [PATCH 22/30] fix --- python/sglang/bench_offline_throughput.py | 41 +++------------ python/sglang/bench_serving.py | 64 +++++++++++++---------- 2 files changed, 43 insertions(+), 62 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 018567cb3c3..4d88b1c0bd2 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -8,12 +8,13 @@ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct ## Random dataset with default args -python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend random +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random ## Shared prefix dataset with default args -python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend generated-shared-prefix +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name generated-shared-prefix -# TODO: add running command for shared gpt, random, and gen-shared-prefix dataset +## Sharegpt dataset on runtime backend +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend runtime """ import argparse @@ -25,9 +26,12 @@ import time from typing import Dict, List, Tuple, Union +import numpy as np + from sglang.api import Engine as getEngine from sglang.bench_serving import ( get_tokenizer, + get_dataset, sample_generated_shared_prefix_requests, sample_random_requests, sample_sharegpt_requests, @@ -205,36 +209,7 @@ def throughput_test( random.seed(bench_args.seed) np.random.seed(bench_args.seed) - if bench_args.dataset_name == "sharegpt": - input_requests = sample_sharegpt_requests( - dataset_path=bench_args.dataset_path, - num_requests=bench_args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=bench_args.sharegpt_output_len, - ) - output_len = bench_args.sharegpt_output_len - elif bench_args.dataset_name == "random": - input_requests = sample_random_requests( - input_len=bench_args.random_input_len, - output_len=bench_args.random_output_len, - num_prompts=bench_args.num_prompts, - range_ratio=bench_args.random_range_ratio, - tokenizer=tokenizer, - dataset_path=bench_args.dataset_path, - ) - output_len = bench_args.random_output_len - elif bench_args.dataset_name == "generated-shared-prefix": - input_requests = sample_generated_shared_prefix_requests( - num_groups=bench_args.gen_num_groups, - prompts_per_group=bench_args.gen_prompts_per_group, - system_prompt_len=bench_args.gen_system_prompt_len, - question_len=bench_args.gen_question_len, - output_len=bench_args.gen_output_len, - tokenizer=tokenizer, - ) - output_len = bench_args.gen_output_len - else: - raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") + input_requests = get_dataset(bench_args, tokenizer) warmup_requests = sample_random_requests( input_len=20, diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 686be14b910..344142f1be9 100644 --- 
a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -421,6 +421,40 @@ def get_tokenizer( ) +def get_dataset(args, tokenizer): + if args.dataset_name == "sharegpt": + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + output_len = args.sharegpt_output_len + elif args.dataset_name == "random": + input_requests = sample_random_requests( + input_len=args.random_input_len, + output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + dataset_path=args.dataset_path, + ) + output_len = args.random_output_len + elif args.dataset_name == "generated-shared-prefix": + input_requests = sample_generated_shared_prefix_requests( + num_groups=args.gen_num_groups, + prompts_per_group=args.gen_prompts_per_group, + system_prompt_len=args.gen_system_prompt_len, + question_len=args.gen_question_len, + output_len=args.gen_output_len, + tokenizer=tokenizer, + ) + output_len = args.gen_output_len + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + return output_len + + ASYNC_REQUEST_FUNCS = { "sglang": async_request_sglang_generate, "sglang-native": async_request_sglang_generate, @@ -1097,35 +1131,7 @@ def run_benchmark(args_: argparse.Namespace): tokenizer = get_tokenizer(tokenizer_id) - if args.dataset_name == "sharegpt": - assert args.random_input_len is None and args.random_output_len is None - input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, - ) - elif args.dataset_name == "random": - assert args.random_input_len is not None and args.random_output_len is not None - input_requests = sample_random_requests( - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, - tokenizer=tokenizer, - dataset_path=args.dataset_path, - ) - elif args.dataset_name == "generated-shared-prefix": - input_requests = sample_generated_shared_prefix_requests( - num_groups=args.gen_num_groups, - prompts_per_group=args.gen_prompts_per_group, - system_prompt_len=args.gen_system_prompt_len, - question_len=args.gen_question_len, - output_len=args.gen_output_len, - tokenizer=tokenizer, - ) - else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + input_requests = get_dataset(args, tokenizer) if not args.multi: return asyncio.run( From 732e3babb1ebeaf8380a02075ac368308528c51f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 20:58:36 +0000 Subject: [PATCH 23/30] lint --- python/sglang/bench_offline_throughput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 4d88b1c0bd2..c856db47c69 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -30,8 +30,8 @@ from sglang.api import Engine as getEngine from sglang.bench_serving import ( - get_tokenizer, get_dataset, + get_tokenizer, sample_generated_shared_prefix_requests, sample_random_requests, sample_sharegpt_requests, From 41aad44e495de46e8b90fdbd37732c1a77159d2d Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 03:22:31 +0000 Subject: [PATCH 24/30] format benchmark + add diff metrics --- python/sglang/bench_offline_throughput.py | 76 
++++++++++++++++++++--- python/sglang/bench_serving.py | 9 +++ python/sglang/srt/server.py | 2 +- 3 files changed, 76 insertions(+), 11 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index c856db47c69..03ae4032e06 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -1,7 +1,7 @@ """ Benchmark the throughput of using the offline LLM engine. This script does not launch a server. -It accepts the same arguments as bench_latency.py +It accepts the same arguments as launch_server.py and additional benchmark arguments # Usage ## Sharegpt dataset with default args @@ -154,19 +154,29 @@ def throughput_test_once( backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], - output_len: int, ignore_eos: bool, ): measurement_results = { + "backend": backend_name, + "successful_requests": len(reqs), + "total_latency": -1, "total_input_tokens": sum(r[1] for r in reqs), + "total_output_tokens": -1, + "request_throughput": -1, + "input_throughput": -1, + "output_throughput": -1, + "total_throughput": -1, } prompt = [r[0] for r in reqs] - sampling_params = { - "temperature": 0, - "max_new_tokens": output_len, - "ignore_eos": ignore_eos, - } + sampling_params = [ + { + "temperature": 0, + "max_new_tokens": r[2], + "ignore_eos": ignore_eos, + } + for r in reqs + ] st = time.perf_counter() gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params) @@ -179,12 +189,20 @@ def throughput_test_once( measurement_results["total_output_tokens"] = sum( o["meta_info"]["completion_tokens"] for o in gen_out ) - measurement_results["throughput"] = ( + measurement_results["request_throughput"] = ( + measurement_results["successful_requests"] / latency + ) + measurement_results["input_throughput"] = ( + measurement_results["total_input_tokens"] / latency + ) + measurement_results["output_throughput"] = ( + measurement_results["total_output_tokens"] / latency + ) + measurement_results["total_throughput"] = ( measurement_results["total_input_tokens"] + measurement_results["total_output_tokens"] ) / latency - print(f"Throughput: {measurement_results['throughput']} tokens/s") return measurement_results @@ -258,6 +276,44 @@ def throughput_test( ) try: - print(throughput_test(server_args, bench_args)) + res = throughput_test(server_args, bench_args) + print( + "\n{s:{c}^{n}}".format( + s=" Offline Throughput Benchmark Result ", n=50, c="=" + ) + ) + print("{:<40} {:<10}".format("Backend:", res["backend"])) + print( + "{:<40} {:<10}".format("Successful requests:", res["successful_requests"]) + ) + print( + "{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"]) + ) + print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) + print( + "{:<40} {:<10}".format( + "Total generated tokens:", res["total_output_tokens"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", res["request_throughput"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Input token throughput (tok/s):", res["input_throughput"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", res["output_throughput"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", res["total_throughput"] + ) + ) except Exception as e: raise e diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 344142f1be9..3f2c680e7ad 100644 --- a/python/sglang/bench_serving.py +++ 
b/python/sglang/bench_serving.py @@ -477,6 +477,8 @@ class BenchmarkMetrics: input_throughput: float output_throughput: float output_throughput_retokenized: float + total_throughput: float + total_throughput_retokenized: float mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float @@ -797,6 +799,8 @@ def calculate_metrics( input_throughput=total_input / dur_s, output_throughput=sum(output_lens) / dur_s, output_throughput_retokenized=sum(retokenized_output_lens) / dur_s, + total_throughput=(total_input + sum(output_lens)) / dur_s, + total_throughput_retokenized=(total_input + sum(retokenized_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, @@ -914,6 +918,11 @@ async def benchmark( "Output token throughput (tok/s):", metrics.output_throughput ) ) + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", metrics.total_throughput + ) + ) print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-")) print( "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index e27bb1bb97b..e4798877a83 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -768,7 +768,7 @@ def generate( self, # The input prompt. It can be a single prompt or a batch of prompts. prompt: Optional[Union[List[str], str]] = None, - sampling_params: Optional[Dict] = None, + sampling_params: Optional[Union[List[Dict], Dict]] = None, # The token ids for text; one can either specify text or input_ids. input_ids: Optional[Union[List[List[int]], List[int]]] = None, return_logprob: Optional[Union[List[bool], bool]] = False, From fa76ac95017801ae72356b079891fbaafbaff893 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 03:22:48 +0000 Subject: [PATCH 25/30] lint --- python/sglang/bench_serving.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 3f2c680e7ad..9dfc6b3bff2 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -800,7 +800,8 @@ def calculate_metrics( output_throughput=sum(output_lens) / dur_s, output_throughput_retokenized=sum(retokenized_output_lens) / dur_s, total_throughput=(total_input + sum(output_lens)) / dur_s, - total_throughput_retokenized=(total_input + sum(retokenized_output_lens)) / dur_s, + total_throughput_retokenized=(total_input + sum(retokenized_output_lens)) + / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, From cc2a5c56f68c220625fa258ed9a3f00e220c9446 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 04:22:44 +0000 Subject: [PATCH 26/30] fix script --- python/sglang/bench_offline_throughput.py | 3 +-- python/sglang/bench_serving.py | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 03ae4032e06..e9729cfbf1b 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -243,7 +243,6 @@ def throughput_test( backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, - output_len=output_len, ignore_eos=not bench_args.disable_ignore_eos, ) @@ -251,7 +250,6 @@ def throughput_test( backend_name=bench_args.backend, backend=backend, 
reqs=input_requests, - output_len=output_len, ignore_eos=not bench_args.disable_ignore_eos, ) @@ -315,5 +313,6 @@ def throughput_test( "Total token throughput (tok/s):", res["total_throughput"] ) ) + print("=" * 50) except Exception as e: raise e diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 9dfc6b3bff2..68c67241302 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -429,7 +429,6 @@ def get_dataset(args, tokenizer): tokenizer=tokenizer, fixed_output_len=args.sharegpt_output_len, ) - output_len = args.sharegpt_output_len elif args.dataset_name == "random": input_requests = sample_random_requests( input_len=args.random_input_len, @@ -439,7 +438,6 @@ def get_dataset(args, tokenizer): tokenizer=tokenizer, dataset_path=args.dataset_path, ) - output_len = args.random_output_len elif args.dataset_name == "generated-shared-prefix": input_requests = sample_generated_shared_prefix_requests( num_groups=args.gen_num_groups, @@ -449,10 +447,9 @@ def get_dataset(args, tokenizer): output_len=args.gen_output_len, tokenizer=tokenizer, ) - output_len = args.gen_output_len else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - return output_len + return input_requests ASYNC_REQUEST_FUNCS = { From e1045e48d011e5981005874c32807af74034263b Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 04:36:40 +0000 Subject: [PATCH 27/30] fix test --- test/srt/test_srt_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 6170118950c..6e630d4848d 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -160,7 +160,7 @@ def test_7_engine_offline_throughput(self): ) bench_args = BenchArgs(num_prompts=10) result = throughput_test(server_args=server_args, bench_args=bench_args) - self.assertTrue(result["throughput"] > 3000) + self.assertTrue(result["total_throughput"] > 3000) if __name__ == "__main__": From 4a322a31f38bb727bf62bc4337a70f93e01d1099 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 07:57:23 +0000 Subject: [PATCH 28/30] fix --- python/sglang/bench_offline_throughput.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index e9729cfbf1b..4c66e401490 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -19,25 +19,22 @@ import argparse import dataclasses -import itertools import json import logging import random import time -from typing import Dict, List, Tuple, Union +from typing import List, Tuple import numpy as np -from sglang.api import Engine as getEngine +from sglang.api import Engine from sglang.bench_serving import ( get_dataset, get_tokenizer, - sample_generated_shared_prefix_requests, sample_random_requests, - sample_sharegpt_requests, set_ulimit, ) -from sglang.srt.server import Engine, Runtime +from sglang.srt.server import Runtime from sglang.srt.server_args import ServerArgs @@ -152,7 +149,7 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( backend_name: str, - backend: Union[Engine, Runtime], + backend, reqs: List[Tuple[str, int, int]], ignore_eos: bool, ): @@ -211,7 +208,7 @@ def throughput_test( bench_args: BenchArgs, ): if bench_args.backend == "engine": - backend = getEngine(**dataclasses.asdict(server_args)) + backend = Engine(**dataclasses.asdict(server_args)) if not backend: raise 
ValueError("Please provide valid engine arguments") elif bench_args.backend == "runtime": From ef4f2782d57e704d18b42d824316d34b1d9a52ff Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 07:58:46 +0000 Subject: [PATCH 29/30] remove useless try except --- python/sglang/bench_offline_throughput.py | 65 +++++++++-------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 4c66e401490..104d44cbf91 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -270,46 +270,33 @@ def throughput_test( format="%(message)s", ) - try: - res = throughput_test(server_args, bench_args) - print( - "\n{s:{c}^{n}}".format( - s=" Offline Throughput Benchmark Result ", n=50, c="=" - ) - ) - print("{:<40} {:<10}".format("Backend:", res["backend"])) - print( - "{:<40} {:<10}".format("Successful requests:", res["successful_requests"]) - ) - print( - "{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"]) - ) - print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) - print( - "{:<40} {:<10}".format( - "Total generated tokens:", res["total_output_tokens"] - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", res["request_throughput"] - ) + res = throughput_test(server_args, bench_args) + print( + "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=") + ) + print("{:<40} {:<10}".format("Backend:", res["backend"])) + print("{:<40} {:<10}".format("Successful requests:", res["successful_requests"])) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"])) + print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) + print("{:<40} {:<10}".format("Total generated tokens:", res["total_output_tokens"])) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", res["request_throughput"] ) - print( - "{:<40} {:<10.2f}".format( - "Input token throughput (tok/s):", res["input_throughput"] - ) + ) + print( + "{:<40} {:<10.2f}".format( + "Input token throughput (tok/s):", res["input_throughput"] ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", res["output_throughput"] - ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", res["output_throughput"] ) - print( - "{:<40} {:<10.2f}".format( - "Total token throughput (tok/s):", res["total_throughput"] - ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", res["total_throughput"] ) - print("=" * 50) - except Exception as e: - raise e + ) + print("=" * 50) From df9da2e43bb7f1480ab8f50f7aa5c1d0f561b05a Mon Sep 17 00:00:00 2001 From: ByronHsu Date: Fri, 15 Nov 2024 05:38:26 +0000 Subject: [PATCH 30/30] fix test and move logging --- python/sglang/bench_offline_throughput.py | 55 ++++++++++++----------- test/srt/test_srt_engine.py | 2 +- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 104d44cbf91..3c57e1144c0 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -254,49 +254,52 @@ def throughput_test( with open(bench_args.result_filename, "a") as fout: fout.write(json.dumps(result) + "\n") - return result - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - ServerArgs.add_cli_args(parser) - 
BenchArgs.add_cli_args(parser) - args = parser.parse_args() - server_args = ServerArgs.from_cli_args(args) - bench_args = BenchArgs.from_cli_args(args) - - logging.basicConfig( - level=getattr(logging, server_args.log_level.upper()), - format="%(message)s", - ) - - res = throughput_test(server_args, bench_args) print( "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=") ) - print("{:<40} {:<10}".format("Backend:", res["backend"])) - print("{:<40} {:<10}".format("Successful requests:", res["successful_requests"])) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"])) - print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) - print("{:<40} {:<10}".format("Total generated tokens:", res["total_output_tokens"])) + print("{:<40} {:<10}".format("Backend:", result["backend"])) + print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"])) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"])) + print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"])) + print( + "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"]) + ) print( "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", res["request_throughput"] + "Request throughput (req/s):", result["request_throughput"] ) ) print( "{:<40} {:<10.2f}".format( - "Input token throughput (tok/s):", res["input_throughput"] + "Input token throughput (tok/s):", result["input_throughput"] ) ) print( "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", res["output_throughput"] + "Output token throughput (tok/s):", result["output_throughput"] ) ) print( "{:<40} {:<10.2f}".format( - "Total token throughput (tok/s):", res["total_throughput"] + "Total token throughput (tok/s):", result["total_throughput"] ) ) print("=" * 50) + + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ServerArgs.add_cli_args(parser) + BenchArgs.add_cli_args(parser) + args = parser.parse_args() + server_args = ServerArgs.from_cli_args(args) + bench_args = BenchArgs.from_cli_args(args) + + logging.basicConfig( + level=getattr(logging, server_args.log_level.upper()), + format="%(message)s", + ) + + throughput_test(server_args, bench_args) diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 6e630d4848d..33232f50b41 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -158,7 +158,7 @@ def test_7_engine_offline_throughput(self): server_args = ServerArgs( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = BenchArgs(num_prompts=10) + bench_args = BenchArgs(num_prompts=100) result = throughput_test(server_args=server_args, bench_args=bench_args) self.assertTrue(result["total_throughput"] > 3000)
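
For reference, the API this series converges on can also be driven programmatically, mirroring test_7_engine_offline_throughput above. A minimal sketch follows; the model path and prompt count are illustrative choices (the model path is taken from the usage examples in the module docstring), not values required by the patches:

    from sglang.bench_offline_throughput import BenchArgs, throughput_test
    from sglang.srt.server_args import ServerArgs

    # Illustrative model; any model path accepted by ServerArgs should work.
    server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3-8B-Instruct")

    # Defaults: backend="engine", dataset_name="sharegpt" (see BenchArgs).
    bench_args = BenchArgs(num_prompts=100)

    result = throughput_test(server_args=server_args, bench_args=bench_args)

    # Metrics added in PATCH 24 of this series:
    print(result["total_throughput"])   # total token throughput (tok/s)
    print(result["output_throughput"])  # output token throughput (tok/s)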