From 807a3f0af0b7404f917225f71b5815ab5e1e74e0 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Sat, 9 Nov 2024 01:32:55 +0000 Subject: [PATCH 01/30] add offline engine bench --- python/sglang/bench_serving.py | 79 +++++++++++++++++++++++++++++++- python/sglang/test/test_utils.py | 16 ++++--- test/srt/test_bench_serving.py | 11 ++++- 3 files changed, 96 insertions(+), 10 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 8bb452cd065..a63cd9fadd6 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -37,6 +37,9 @@ PreTrainedTokenizerFast, ) +from sglang.api import Engine as getEngine +from sglang.srt.server import Engine + AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) global args @@ -45,7 +48,9 @@ @dataclass class RequestFuncInput: prompt: str - api_url: str + # one or the other must be defined but not both + api_url: Optional[str] + engine: Optional[Engine] prompt_len: int output_len: int model: str @@ -222,6 +227,68 @@ async def async_request_openai_completions( return output +async def async_request_sglang_offline_engine( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + engine = request_func_input.llm_engine + if not engine: + raise ValueError("Please pass in an Engine") + + prompt = request_func_input.prompt + + payload = { + "temperature": 0.0, + "best_of": 1, + "max_tokens": request_func_input.output_len, + "stream": not args.disable_stream, + "ignore_eos": not args.disable_ignore_eos, + **request_func_input.extra_request_body, + } + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + gen_out = await engine.async_generate(prompt, **payload) + if payload["stream"]: + async for chunk in gen_out: + latency = time.perf_counter() - st + if chunk["text"]: + timestamp = time.perf_counter() + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + generated_text += chunk["text"] + else: + if gen_out[0]["text"]: + # not sure why you'd ever want this + latency = time.perf_counter() - st + ttft = latency + output.ttft = ttft + generated_text = gen_out[0]["text"] + output.generated_text = generated_text + output.success = True + output.latency = latency + output.output_len = request_func_input.output_len + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + async def async_request_truss( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -425,6 +492,7 @@ def get_tokenizer( "sglang": async_request_sglang_generate, "sglang-native": async_request_sglang_generate, "sglang-oai": async_request_openai_completions, + "sglang-offline-engine": async_request_sglang_offline_engine, "vllm": async_request_openai_completions, "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, @@ -718,7 +786,7 @@ def calculate_metrics( async def benchmark( backend: str, - api_url: str, + api_url: Optional[str], model_id: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], @@ -730,6 +798,9 @@ async def benchmark( request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") + engine = None + if backend == 
"sglang-offline-engine": + engine = getEngine(model_path=model_id) print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len = input_requests[0] @@ -737,6 +808,7 @@ async def benchmark( model=model_id, prompt=test_prompt, api_url=api_url, + engine=engine, prompt_len=test_prompt_len, output_len=test_output_len, extra_request_body=extra_request_body, @@ -762,6 +834,7 @@ async def benchmark( model=model_id, prompt=prompt, api_url=api_url, + engine=engine, prompt_len=prompt_len, output_len=output_len, extra_request_body=extra_request_body, @@ -974,6 +1047,8 @@ def run_benchmark(args_: argparse.Namespace): if args.base_url else f"http://{args.host}:{args.port}/v1/completions" ) + elif args.backend in ["sglang-offline-engine"]: + api_url = None elif args.backend == "trt": api_url = ( f"{args.base_url}/v2/models/ensemble/generate_stream" diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 2c68a22b4df..8ab28911db1 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -522,6 +522,7 @@ def run_bench_serving( num_prompts, request_rate, other_server_args, + backend="sglang", dataset_name="random", random_input_len=4096, random_output_len=2048, @@ -529,16 +530,17 @@ def run_bench_serving( ): # Launch the server base_url = DEFAULT_URL_FOR_TEST - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_server_args, - ) + if backend == "sglang": + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_server_args, + ) # Run benchmark args = SimpleNamespace( - backend="sglang", + backend=backend, base_url=base_url, host=None, port=None, diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 6955d4917b2..5ab3de1e189 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -86,6 +86,14 @@ def test_offline_throughput_default_fp8(self): if is_in_ci(): assert res["output_throughput"] > 3100 + def test_offline_throughput_default_engine(self): + res = run_bench_serving( + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[], + ) + def test_online_latency_default(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST, @@ -112,6 +120,7 @@ def test_moe_offline_throughput_default(self): def test_moe_offline_throughput_without_radix_cache(self): res = run_bench_serving( + backend="sglang-offline-engine", model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, num_prompts=300, request_rate=float("inf"), @@ -119,7 +128,7 @@ def test_moe_offline_throughput_without_radix_cache(self): ) if is_in_ci(): - assert res["output_throughput"] > 1950 + assert res["output_throughput"] > 2830 if __name__ == "__main__": From e3ec6231f47c755b22abf17a7c6bfe304bbd5915 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Sat, 9 Nov 2024 01:49:44 +0000 Subject: [PATCH 02/30] llm_engine -> engine --- python/sglang/bench_serving.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index a63cd9fadd6..53bbf8b62e0 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -231,7 +231,7 @@ async def async_request_sglang_offline_engine( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, ) -> RequestFuncOutput: - engine = request_func_input.llm_engine + engine = request_func_input.engine if not engine: raise 
ValueError("Please pass in an Engine") From 8b1232bb48fe7bf4bc6077aff281c0cc7809d56c Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Sat, 9 Nov 2024 02:13:48 +0000 Subject: [PATCH 03/30] add to unit test bench --- python/sglang/bench_serving.py | 10 +++++----- test/srt/test_bench_serving.py | 15 +++++++++++++-- 2 files changed, 18 insertions(+), 7 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 53bbf8b62e0..952fca9a585 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -239,12 +239,12 @@ async def async_request_sglang_offline_engine( payload = { "temperature": 0.0, - "best_of": 1, - "max_tokens": request_func_input.output_len, - "stream": not args.disable_stream, + "n": 1, + "max_new_tokens": request_func_input.output_len, "ignore_eos": not args.disable_ignore_eos, **request_func_input.extra_request_body, } + stream = not args.disable_stream output = RequestFuncOutput() output.prompt_len = request_func_input.prompt_len @@ -254,8 +254,8 @@ async def async_request_sglang_offline_engine( st = time.perf_counter() most_recent_timestamp = st try: - gen_out = await engine.async_generate(prompt, **payload) - if payload["stream"]: + gen_out = await engine.async_generate(prompt, payload, stream=stream) + if stream: async for chunk in gen_out: latency = time.perf_counter() - st if chunk["text"]: diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 5ab3de1e189..7de4fd736d0 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -94,6 +94,18 @@ def test_offline_throughput_default_engine(self): other_server_args=[], ) + def test_offline_throughput_llm_engine(self): + res = run_bench_serving( + backend="sgl-offline-engine", + model=DEFAULT_MODEL_NAME_FOR_TEST, + num_prompts=500, + request_rate=float("inf"), + other_server_args=[], + ) + + if is_in_ci(): + assert res["output_throughput"] > 2830 + def test_online_latency_default(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST, @@ -120,7 +132,6 @@ def test_moe_offline_throughput_default(self): def test_moe_offline_throughput_without_radix_cache(self): res = run_bench_serving( - backend="sglang-offline-engine", model=DEFAULT_MOE_MODEL_NAME_FOR_TEST, num_prompts=300, request_rate=float("inf"), @@ -128,7 +139,7 @@ def test_moe_offline_throughput_without_radix_cache(self): ) if is_in_ci(): - assert res["output_throughput"] > 2830 + assert res["output_throughput"] > 1950 if __name__ == "__main__": From e6293a890437b330b4ed1bb7128e39af6eb5ca61 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 22:33:06 +0000 Subject: [PATCH 04/30] first draft bench offline throughput --- python/sglang/bench_offline_throughput.py | 153 ++++++++++++++++++++++ python/sglang/bench_serving.py | 75 +---------- 2 files changed, 155 insertions(+), 73 deletions(-) create mode 100644 python/sglang/bench_offline_throughput.py diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py new file mode 100644 index 00000000000..10c1d4de0e1 --- /dev/null +++ b/python/sglang/bench_offline_throughput.py @@ -0,0 +1,153 @@ +""" +Benchmark the throughput of using the offline LLM engine. +This script does not launch a server. 
+It accepts the same arguments as bench_latency.py +""" + +import argparse +import dataclasses +import itertools +import logging +import time +from typing import Dict, List, Tuple + +import numpy as np + +from python.sglang.srt.sampling.sampling_params import SamplingParams +from sglang.api import Engine as getEngine +from sglang.srt.server import Engine +from sglang.srt.server_args import ServerArgs + + +@dataclasses.dataclass +class BenchArgs: + run_name: str = "before" + batch_size: Tuple[int] = (1,) + input_len: Tuple[int] = (1024,) + output_len: Tuple[int] = (16,) + result_filename: str = "" + # Plotting args + graph_sql: str = ( + "select run_name, batch_size, prefill_throughput from results where run_name='before'" + ) + graph_filename: str = "out.png" + + @staticmethod + def add_cli_args(parser: argparse.ArgumentParser): + parser.add_argument("--run-name", type=str, default=BenchArgs.run_name) + parser.add_argument( + "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size + ) + parser.add_argument( + "--input-len", type=int, nargs="+", default=BenchArgs.input_len + ) + parser.add_argument( + "--output-len", type=int, nargs="+", default=BenchArgs.output_len + ) + parser.add_argument( + "--result-filename", type=str, default=BenchArgs.result_filename + ) + # graphing + parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql) + parser.add_argument( + "--graph-filename", type=str, default=BenchArgs.graph_filename + ) + + @classmethod + def from_cli_args(cls, args: argparse.Namespace): + # use the default value's type to case the args into correct types. + attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)] + return cls( + **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} + ) + + +def prepare_synthetic_inputs_for_throughput_test( + batch_size: int, input_len: int, output_len: int +): + input_ids = [[1] * input_len for _ in range(batch_size)] + sampling_params = { + "temperature": 0, + "min_new_tokens": output_len, + "max_new_tokens": output_len, + } + return input_ids, sampling_params + + +def throughput_test_once( + run_name: str, + engine: Engine, + reqs: Tuple[List[List[int]], Dict], + output_len: int, +): + measurement_results = { + "run_name": run_name, + "batch_size": len(reqs[0]), + "input_len": len(reqs[0][0]), + "output_len": output_len, + } + + st = time.perf_counter() + gen_out = engine.generate(input_ids=reqs[0], sampling_params=reqs[1]) + latency = time.perf_counter() - st + + measurement_results["total_latency"] = latency + measurement_results["throughput"] = ( + (measurement_results["input_len"] + output_len) + * measurement_results["batch_size"] + ) / latency + + print( + f"Throughput: BSZ {measurement_results['batch_size']} tokens, " + f"Num sequences {len(reqs[0])}, throughput: " + f"{measurement_results['throughput']} tokens/s" + ) + return measurement_results + + +def throughput_test( + server_args, + bench_args: BenchArgs, +): + engine = getEngine(**server_args) + if not engine: + raise ValueError("Please provide valid engine arguments") + + warmup_reqs = prepare_synthetic_inputs_for_throughput_test( + bench_args.batch_size[0], bench_args.input_len[0], bench_args.output_len[0] + ) + + # Warm up + throughput_test_once("warmup", engine, warmup_reqs, bench_args.output_len[0]) + + result_list = [] + for bs, il, ol in itertools.product( + bench_args.batch_size, bench_args.input_len, bench_args.output_len + ): + reqs = prepare_synthetic_inputs_for_throughput_test(bs, il, ol) + ret = 
throughput_test_once( + bench_args.run_name, engine, reqs, bench_args.output_len[0] + ) + if ret is not None: + result_list.append(ret) + + print(result_list) + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ServerArgs.add_cli_args(parser) + BenchArgs.add_cli_args(parser) + args = parser.parse_args() + server_args = ServerArgs.from_cli_args(args) + bench_args = BenchArgs.from_cli_args(args) + + logging.basicConfig( + level=getattr(logging, server_args.log_level.upper()), + format="%(message)s", + ) + + try: + throughput_test(server_args, bench_args) + except Exception as e: + raise e diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 952fca9a585..3d08f37863e 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -49,8 +49,7 @@ class RequestFuncInput: prompt: str # one or the other must be defined but not both - api_url: Optional[str] - engine: Optional[Engine] + api_url: str prompt_len: int output_len: int model: str @@ -227,68 +226,6 @@ async def async_request_openai_completions( return output -async def async_request_sglang_offline_engine( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - engine = request_func_input.engine - if not engine: - raise ValueError("Please pass in an Engine") - - prompt = request_func_input.prompt - - payload = { - "temperature": 0.0, - "n": 1, - "max_new_tokens": request_func_input.output_len, - "ignore_eos": not args.disable_ignore_eos, - **request_func_input.extra_request_body, - } - stream = not args.disable_stream - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - gen_out = await engine.async_generate(prompt, payload, stream=stream) - if stream: - async for chunk in gen_out: - latency = time.perf_counter() - st - if chunk["text"]: - timestamp = time.perf_counter() - if ttft == 0.0: - ttft = time.perf_counter() - st - output.ttft = ttft - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += chunk["text"] - else: - if gen_out[0]["text"]: - # not sure why you'd ever want this - latency = time.perf_counter() - st - ttft = latency - output.ttft = ttft - generated_text = gen_out[0]["text"] - output.generated_text = generated_text - output.success = True - output.latency = latency - output.output_len = request_func_input.output_len - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - async def async_request_truss( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -492,7 +429,6 @@ def get_tokenizer( "sglang": async_request_sglang_generate, "sglang-native": async_request_sglang_generate, "sglang-oai": async_request_openai_completions, - "sglang-offline-engine": async_request_sglang_offline_engine, "vllm": async_request_openai_completions, "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, @@ -786,7 +722,7 @@ def calculate_metrics( async def benchmark( backend: str, - api_url: Optional[str], + api_url: str, model_id: str, tokenizer: PreTrainedTokenizerBase, input_requests: List[Tuple[str, int, int]], @@ -798,9 +734,6 @@ async def benchmark( request_func = ASYNC_REQUEST_FUNCS[backend] else: raise ValueError(f"Unknown backend: {backend}") - engine = None - if 
backend == "sglang-offline-engine": - engine = getEngine(model_path=model_id) print("Starting initial single prompt test run...") test_prompt, test_prompt_len, test_output_len = input_requests[0] @@ -808,7 +741,6 @@ async def benchmark( model=model_id, prompt=test_prompt, api_url=api_url, - engine=engine, prompt_len=test_prompt_len, output_len=test_output_len, extra_request_body=extra_request_body, @@ -834,7 +766,6 @@ async def benchmark( model=model_id, prompt=prompt, api_url=api_url, - engine=engine, prompt_len=prompt_len, output_len=output_len, extra_request_body=extra_request_body, @@ -1047,8 +978,6 @@ def run_benchmark(args_: argparse.Namespace): if args.base_url else f"http://{args.host}:{args.port}/v1/completions" ) - elif args.backend in ["sglang-offline-engine"]: - api_url = None elif args.backend == "trt": api_url = ( f"{args.base_url}/v2/models/ensemble/generate_stream" From 5564a96d009be60193a6b4442deafaebcb8d7920 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 22:59:21 +0000 Subject: [PATCH 05/30] script works --- python/sglang/bench_offline_throughput.py | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 10c1d4de0e1..aada2ca8b42 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -2,6 +2,10 @@ Benchmark the throughput of using the offline LLM engine. This script does not launch a server. It accepts the same arguments as bench_latency.py + +# Usage +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl + """ import argparse @@ -9,11 +13,9 @@ import itertools import logging import time +import jsonlines from typing import Dict, List, Tuple -import numpy as np - -from python.sglang.srt.sampling.sampling_params import SamplingParams from sglang.api import Engine as getEngine from sglang.srt.server import Engine from sglang.srt.server_args import ServerArgs @@ -106,10 +108,10 @@ def throughput_test_once( def throughput_test( - server_args, + server_args: ServerArgs, bench_args: BenchArgs, ): - engine = getEngine(**server_args) + engine = getEngine(**dataclasses.asdict(server_args)) if not engine: raise ValueError("Please provide valid engine arguments") @@ -131,7 +133,11 @@ def throughput_test( if ret is not None: result_list.append(ret) - print(result_list) + if bench_args.result_filename: + with jsonlines.open(bench_args.result_filename, "a") as f: + f.write_all(result_list) + else: + print(result_list) if __name__ == "__main__": From 0078bc3a0abdf53072b3a91a884bcfb362e92703 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:07:10 +0000 Subject: [PATCH 06/30] reset bench serving stuff --- python/sglang/bench_serving.py | 96 ---------------- python/sglang/test/test_utils.py | 185 +++---------------------------- test/srt/test_bench_serving.py | 20 ---- 3 files changed, 13 insertions(+), 288 deletions(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 3d08f37863e..2ca35aca95a 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -37,9 +37,6 @@ PreTrainedTokenizerFast, ) -from sglang.api import Engine as getEngine -from sglang.srt.server import Engine - AIOHTTP_TIMEOUT = aiohttp.ClientTimeout(total=6 * 60 * 60) global args @@ -48,7 +45,6 @@ @dataclass class RequestFuncInput: prompt: str 
- # one or the other must be defined but not both api_url: str prompt_len: int output_len: int @@ -226,85 +222,6 @@ async def async_request_openai_completions( return output -async def async_request_truss( - request_func_input: RequestFuncInput, - pbar: Optional[tqdm] = None, -) -> RequestFuncOutput: - api_url = request_func_input.api_url - - prompt = request_func_input.prompt - - async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: - payload = { - "model": request_func_input.model, - "prompt": prompt, - "temperature": 0.0, - "best_of": 1, - "max_tokens": request_func_input.output_len, - "stream": not args.disable_stream, - "ignore_eos": not args.disable_ignore_eos, - **request_func_input.extra_request_body, - } - headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} - - output = RequestFuncOutput() - output.prompt_len = request_func_input.prompt_len - - generated_text = "" - ttft = 0.0 - st = time.perf_counter() - most_recent_timestamp = st - try: - async with session.post( - url=api_url, json=payload, headers=headers - ) as response: - if response.status == 200: - async for chunk_bytes in response.content: - chunk_bytes = chunk_bytes.strip() - if not chunk_bytes: - continue - - chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") - latency = time.perf_counter() - st - if chunk == "[DONE]": - pass - else: - data = json.loads(chunk) - - # NOTE: Some completion API might have a last - # usage summary response without a token so we - # want to check a token was generated - if data["choices"][0]["delta"]["content"]: - timestamp = time.perf_counter() - # First token - if ttft == 0.0: - ttft = time.perf_counter() - st - output.ttft = ttft - - # Decoding phase - else: - output.itl.append(timestamp - most_recent_timestamp) - - most_recent_timestamp = timestamp - generated_text += data["choices"][0]["delta"]["content"] - - output.generated_text = generated_text - output.success = True - output.latency = latency - output.output_len = request_func_input.output_len - else: - output.error = response.reason or "" - output.success = False - except Exception: - output.success = False - exc_info = sys.exc_info() - output.error = "".join(traceback.format_exception(*exc_info)) - - if pbar: - pbar.update(1) - return output - - async def async_request_sglang_generate( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -433,7 +350,6 @@ def get_tokenizer( "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, "gserver": async_request_gserver, - "truss": async_request_truss, } @@ -957,7 +873,6 @@ def run_benchmark(args_: argparse.Namespace): "vllm": 8000, "trt": 8000, "gserver": 9988, - "truss": 8080, }.get(args.backend, 30000) model_url = ( @@ -990,20 +905,9 @@ def run_benchmark(args_: argparse.Namespace): elif args.backend == "gserver": api_url = args.base_url if args.base_url else f"{args.host}:{args.port}" args.model = args.model or "default" - elif args.backend == "truss": - api_url = ( - f"{args.base_url}/v1/models/model:predict" - if args.base_url - else f"http://{args.host}:{args.port}/v1/models/model:predict" - ) # Get model name if args.model is None: - if args.backend == "truss": - print( - "Please provide a model with `--model` when using truss backend. e.g. 
--model meta-llama/Llama-3.1-8B-Instruct" - ) - sys.exit(1) try: response = requests.get(model_url) model_list = response.json().get("data", []) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 8ab28911db1..8fb20c6eb04 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -3,11 +3,9 @@ import argparse import asyncio import os -import random import subprocess import threading import time -from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace from typing import Callable, List, Optional @@ -22,7 +20,6 @@ from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.utils import kill_child_process -from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8" @@ -403,7 +400,7 @@ def popen_launch_server( api_key: Optional[str] = None, other_args: tuple = (), env: Optional[dict] = None, - return_stdout_stderr: Optional[tuple] = None, + return_stdout_stderr: bool = False, ): _, host, port = base_url.split(":") host = host[2:] @@ -426,8 +423,8 @@ def popen_launch_server( if return_stdout_stderr: process = subprocess.Popen( command, - stdout=return_stdout_stderr[0], - stderr=return_stdout_stderr[1], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE, env=env, text=True, ) @@ -496,7 +493,7 @@ def run_one_file(filename): ) assert ret_code == 0 except TimeoutError: - kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", @@ -522,7 +519,6 @@ def run_bench_serving( num_prompts, request_rate, other_server_args, - backend="sglang", dataset_name="random", random_input_len=4096, random_output_len=2048, @@ -530,17 +526,16 @@ def run_bench_serving( ): # Launch the server base_url = DEFAULT_URL_FOR_TEST - if backend == "sglang": - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_server_args, - ) + process = popen_launch_server( + model, + base_url, + timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, + other_args=other_server_args, + ) # Run benchmark args = SimpleNamespace( - backend=backend, + backend="sglang", base_url=base_url, host=None, port=None, @@ -566,7 +561,7 @@ def run_bench_serving( try: res = run_benchmark(args) finally: - kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid) assert res["completed"] == num_prompts return res @@ -599,7 +594,7 @@ def run_bench_latency(model, other_args): lastline = output.split("\n")[-3] output_throughput = float(lastline.split(" ")[-2]) finally: - kill_child_process(process.pid, include_self=True) + kill_child_process(process.pid) return output_throughput @@ -636,157 +631,3 @@ def calculate_rouge_l(output_strs_list1, output_strs_list2): rouge_l_scores.append(fmeasure) return rouge_l_scores - - -STDOUT_FILENAME = "stdout.txt" -STDERR_FILENAME = "stderr.txt" - - -def read_output(output_lines): - """Print the output in real time with another thread.""" - while not os.path.exists(STDERR_FILENAME): - time.sleep(1) - - pt = 0 - while pt >= 0: - if pt > 0 and not os.path.exists(STDERR_FILENAME): - break - lines = open(STDERR_FILENAME).readlines() - for line in lines[pt:]: - print(line, end="", flush=True) - output_lines.append(line) - pt += 1 - time.sleep(0.1) - - -def 
run_and_check_memory_leak( - workload_func, - disable_radix_cache, - enable_mixed_chunk, - enable_overlap, - chunked_prefill_size, -): - other_args = ["--chunked-prefill-size", str(chunked_prefill_size)] - if disable_radix_cache: - other_args += ["--disable-radix-cache"] - if enable_mixed_chunk: - other_args += ["--enable-mixed-chunk"] - if enable_overlap: - other_args += ["--enable-overlap-scheduler"] - - model = DEFAULT_MODEL_NAME_FOR_TEST - port = random.randint(4000, 5000) - base_url = f"http://127.0.0.1:{port}" - - # Create files and launch the server - stdout = open(STDOUT_FILENAME, "w") - stderr = open(STDERR_FILENAME, "w") - process = popen_launch_server( - model, - base_url, - timeout=DEFAULT_TIMEOUT_FOR_SERVER_LAUNCH, - other_args=other_args, - return_stdout_stderr=(stdout, stderr), - ) - - # Launch a thread to stream the output - output_lines = [] - t = threading.Thread(target=read_output, args=(output_lines,)) - t.start() - - # Run the workload - workload_func(base_url, model) - - # Clean up everything - kill_child_process(process.pid, include_self=True) - kill_child_process(process.pid, include_self=True) - stdout.close() - stderr.close() - if os.path.exists(STDOUT_FILENAME): - os.remove(STDOUT_FILENAME) - if os.path.exists(STDERR_FILENAME): - os.remove(STDERR_FILENAME) - t.join() - - # Assert success - has_new_server = False - has_leak = False - for line in output_lines: - if "The server is fired" in line: - has_new_server = True - if "leak" in line: - has_leak = True - - assert has_new_server - assert not has_leak - - -def run_mmlu_test( - disable_radix_cache=False, - enable_mixed_chunk=False, - enable_overlap=False, - chunked_prefill_size=32, -): - def workload_func(base_url, model): - # Run the eval - args = SimpleNamespace( - base_url=base_url, - model=model, - eval_name="mmlu", - num_examples=128, - num_threads=128, - ) - - try: - metrics = run_eval(args) - print(f"{metrics=}") - assert metrics["score"] >= 0.65 - finally: - pass - - run_and_check_memory_leak( - workload_func, - disable_radix_cache, - enable_mixed_chunk, - enable_overlap, - chunked_prefill_size, - ) - - -def run_mulit_request_test( - disable_radix_cache=False, - enable_mixed_chunk=False, - enable_overlap=False, - chunked_prefill_size=32, -): - - def workload_func(base_url, model): - def run_one(_): - prompt = """ - System: You are a helpful assistant. - User: What is the capital of France? 
- Assistant: The capital of France is - """ - - response = requests.post( - f"{base_url}/generate", - json={ - "text": prompt, - "sampling_params": { - "temperature": 0, - "max_new_tokens": 8, - }, - }, - ) - ret = response.json() - - with ThreadPoolExecutor(2) as executor: - list(executor.map(run_one, list(range(4)))) - - run_and_check_memory_leak( - workload_func, - disable_radix_cache, - enable_mixed_chunk, - enable_overlap, - chunked_prefill_size, - ) diff --git a/test/srt/test_bench_serving.py b/test/srt/test_bench_serving.py index 7de4fd736d0..6955d4917b2 100644 --- a/test/srt/test_bench_serving.py +++ b/test/srt/test_bench_serving.py @@ -86,26 +86,6 @@ def test_offline_throughput_default_fp8(self): if is_in_ci(): assert res["output_throughput"] > 3100 - def test_offline_throughput_default_engine(self): - res = run_bench_serving( - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[], - ) - - def test_offline_throughput_llm_engine(self): - res = run_bench_serving( - backend="sgl-offline-engine", - model=DEFAULT_MODEL_NAME_FOR_TEST, - num_prompts=500, - request_rate=float("inf"), - other_server_args=[], - ) - - if is_in_ci(): - assert res["output_throughput"] > 2830 - def test_online_latency_default(self): res = run_bench_serving( model=DEFAULT_MODEL_NAME_FOR_TEST, From 31584145e53f3aa482c912887b8b4d9d0b3b91f0 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:10:36 +0000 Subject: [PATCH 07/30] most recent commit? --- python/sglang/bench_serving.py | 92 ++++++++++++++++++++++++++++++++++ 1 file changed, 92 insertions(+) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index d64901cf1fa..74f77565ef2 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -222,6 +222,85 @@ async def async_request_openai_completions( return output +async def async_request_truss( + request_func_input: RequestFuncInput, + pbar: Optional[tqdm] = None, +) -> RequestFuncOutput: + api_url = request_func_input.api_url + + prompt = request_func_input.prompt + + async with aiohttp.ClientSession(timeout=AIOHTTP_TIMEOUT) as session: + payload = { + "model": request_func_input.model, + "prompt": prompt, + "temperature": 0.0, + "best_of": 1, + "max_tokens": request_func_input.output_len, + "stream": not args.disable_stream, + "ignore_eos": not args.disable_ignore_eos, + **request_func_input.extra_request_body, + } + headers = {"Authorization": f"Bearer {os.environ.get('OPENAI_API_KEY')}"} + + output = RequestFuncOutput() + output.prompt_len = request_func_input.prompt_len + + generated_text = "" + ttft = 0.0 + st = time.perf_counter() + most_recent_timestamp = st + try: + async with session.post( + url=api_url, json=payload, headers=headers + ) as response: + if response.status == 200: + async for chunk_bytes in response.content: + chunk_bytes = chunk_bytes.strip() + if not chunk_bytes: + continue + + chunk = remove_prefix(chunk_bytes.decode("utf-8"), "data: ") + latency = time.perf_counter() - st + if chunk == "[DONE]": + pass + else: + data = json.loads(chunk) + + # NOTE: Some completion API might have a last + # usage summary response without a token so we + # want to check a token was generated + if data["choices"][0]["delta"]["content"]: + timestamp = time.perf_counter() + # First token + if ttft == 0.0: + ttft = time.perf_counter() - st + output.ttft = ttft + + # Decoding phase + else: + output.itl.append(timestamp - most_recent_timestamp) + + most_recent_timestamp = timestamp + 
generated_text += data["choices"][0]["delta"]["content"] + + output.generated_text = generated_text + output.success = True + output.latency = latency + output.output_len = request_func_input.output_len + else: + output.error = response.reason or "" + output.success = False + except Exception: + output.success = False + exc_info = sys.exc_info() + output.error = "".join(traceback.format_exception(*exc_info)) + + if pbar: + pbar.update(1) + return output + + async def async_request_sglang_generate( request_func_input: RequestFuncInput, pbar: Optional[tqdm] = None, @@ -350,6 +429,7 @@ def get_tokenizer( "lmdeploy": async_request_openai_completions, "trt": async_request_trt_llm, "gserver": async_request_gserver, + "truss": async_request_truss, } @@ -933,6 +1013,7 @@ def run_benchmark(args_: argparse.Namespace): "vllm": 8000, "trt": 8000, "gserver": 9988, + "truss": 8080, }.get(args.backend, 30000) model_url = ( @@ -965,9 +1046,20 @@ def run_benchmark(args_: argparse.Namespace): elif args.backend == "gserver": api_url = args.base_url if args.base_url else f"{args.host}:{args.port}" args.model = args.model or "default" + elif args.backend == "truss": + api_url = ( + f"{args.base_url}/v1/models/model:predict" + if args.base_url + else f"http://{args.host}:{args.port}/v1/models/model:predict" + ) # Get model name if args.model is None: + if args.backend == "truss": + print( + "Please provide a model with `--model` when using truss backend. e.g. --model meta-llama/Llama-3.1-8B-Instruct" + ) + sys.exit(1) try: response = requests.get(model_url) model_list = response.json().get("data", []) From 550ec14c2e9cdf886d6f13834eecef5167fdae8e Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:11:30 +0000 Subject: [PATCH 08/30] restore test utils --- python/sglang/test/test_utils.py | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 41e247fcd10..f136a4d1b64 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -3,9 +3,11 @@ import argparse import asyncio import os +import random import subprocess import threading import time +from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace from typing import Callable, List, Optional @@ -20,6 +22,7 @@ from sglang.lang.backend.openai import OpenAI from sglang.lang.backend.runtime_endpoint import RuntimeEndpoint from sglang.srt.utils import kill_child_process +from sglang.test.run_eval import run_eval from sglang.utils import get_exception_traceback DEFAULT_FP8_MODEL_NAME_FOR_TEST = "neuralmagic/Meta-Llama-3.1-8B-FP8" @@ -402,7 +405,7 @@ def popen_launch_server( api_key: Optional[str] = None, other_args: tuple = (), env: Optional[dict] = None, - return_stdout_stderr: bool = False, + return_stdout_stderr: Optional[tuple] = None, ): _, host, port = base_url.split(":") host = host[2:] @@ -425,8 +428,8 @@ def popen_launch_server( if return_stdout_stderr: process = subprocess.Popen( command, - stdout=subprocess.PIPE, - stderr=subprocess.PIPE, + stdout=return_stdout_stderr[0], + stderr=return_stdout_stderr[1], env=env, text=True, ) @@ -495,7 +498,7 @@ def run_one_file(filename): ) assert ret_code == 0 except TimeoutError: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) time.sleep(5) print( f"\nTimeout after {timeout_per_file} seconds when running {filename}\n", @@ -563,7 +566,7 @@ def run_bench_serving( try: res = 
run_benchmark(args) finally: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) assert res["completed"] == num_prompts return res @@ -596,7 +599,7 @@ def run_bench_latency(model, other_args): lastline = output.split("\n")[-3] output_throughput = float(lastline.split(" ")[-2]) finally: - kill_child_process(process.pid) + kill_child_process(process.pid, include_self=True) return output_throughput From c1c62268fcb1779933d532100c2836288821c0d3 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Mon, 11 Nov 2024 23:15:46 +0000 Subject: [PATCH 09/30] lint --- python/sglang/bench_offline_throughput.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index aada2ca8b42..9588cedea2e 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -13,9 +13,10 @@ import itertools import logging import time -import jsonlines from typing import Dict, List, Tuple +import jsonlines + from sglang.api import Engine as getEngine from sglang.srt.server import Engine from sglang.srt.server_args import ServerArgs From 1895c79d3b4e6c85f5c6aa9656a55e036c6ae4f8 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 02:27:22 +0000 Subject: [PATCH 10/30] use sharegpt from bench_serving --- python/sglang/bench_offline_throughput.py | 153 ++++++++++++++-------- 1 file changed, 96 insertions(+), 57 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 9588cedea2e..88192eeef11 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -12,11 +12,13 @@ import dataclasses import itertools import logging +import random import time from typing import Dict, List, Tuple +import json -import jsonlines - +import numpy as np +from sglang.bench_serving import set_ulimit, sample_sharegpt_requests, sample_random_requests, get_tokenizer from sglang.api import Engine as getEngine from sglang.srt.server import Engine from sglang.srt.server_args import ServerArgs @@ -25,36 +27,56 @@ @dataclasses.dataclass class BenchArgs: run_name: str = "before" - batch_size: Tuple[int] = (1,) - input_len: Tuple[int] = (1024,) - output_len: Tuple[int] = (16,) result_filename: str = "" - # Plotting args - graph_sql: str = ( - "select run_name, batch_size, prefill_throughput from results where run_name='before'" - ) - graph_filename: str = "out.png" + seed: int = 1 @staticmethod def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument("--run-name", type=str, default=BenchArgs.run_name) parser.add_argument( - "--batch-size", type=int, nargs="+", default=BenchArgs.batch_size + "--result-filename", type=str, default=BenchArgs.result_filename + ) + parser.add_argument( + "--dataset-name", + type=str, + default="sharegpt", + choices=["sharegpt", "random", "generated-shared-prefix"], + help="Name of the dataset to benchmark on.", ) parser.add_argument( - "--input-len", type=int, nargs="+", default=BenchArgs.input_len + "--dataset-path", type=str, default="", help="Path to the dataset." ) parser.add_argument( - "--output-len", type=int, nargs="+", default=BenchArgs.output_len + "--num-prompts", + type=int, + default=1000, + help="Number of prompts to process. 
Default is 1000.", ) parser.add_argument( - "--result-filename", type=str, default=BenchArgs.result_filename + "--sharegpt-output-len", + type=int, + default=None, + help="Output length for each request. Overrides the output length from the ShareGPT dataset.", ) - # graphing - parser.add_argument("--graph-sql", type=str, default=BenchArgs.graph_sql) parser.add_argument( - "--graph-filename", type=str, default=BenchArgs.graph_filename + "--random-input-len", + type=int, + help="Number of input tokens per request, used only for random dataset.", ) + parser.add_argument( + "--random-output-len", + type=int, + help="Number of output tokens per request, used only for random dataset.", + ) + parser.add_argument( + "--random-range-ratio", + type=float, + default=0.0, + help="Range of sampled ratio of input/output length, " + "used only for random dataset.", + ) + parser.add_argument("--seed", type=int, default=1, help="The random seed.") + @classmethod def from_cli_args(cls, args: argparse.Namespace): @@ -65,45 +87,29 @@ def from_cli_args(cls, args: argparse.Namespace): ) -def prepare_synthetic_inputs_for_throughput_test( - batch_size: int, input_len: int, output_len: int -): - input_ids = [[1] * input_len for _ in range(batch_size)] - sampling_params = { - "temperature": 0, - "min_new_tokens": output_len, - "max_new_tokens": output_len, - } - return input_ids, sampling_params - - def throughput_test_once( run_name: str, engine: Engine, - reqs: Tuple[List[List[int]], Dict], - output_len: int, + reqs: List[Tuple[str, int, int]], ): measurement_results = { "run_name": run_name, - "batch_size": len(reqs[0]), - "input_len": len(reqs[0][0]), - "output_len": output_len, + "total_input_tokens": sum(r[1] for r in reqs), } st = time.perf_counter() - gen_out = engine.generate(input_ids=reqs[0], sampling_params=reqs[1]) + gen_out = engine.generate(prompt=[r[0] for r in reqs], sampling_params={ "temperature": 0 }) latency = time.perf_counter() - st measurement_results["total_latency"] = latency + measurement_results["total_output_tokens"] = sum(o["meta_info"]["completion_tokens"] for o in gen_out) measurement_results["throughput"] = ( - (measurement_results["input_len"] + output_len) - * measurement_results["batch_size"] + measurement_results["total_input_tokens"] + + measurement_results["total_output_tokens"] ) / latency print( - f"Throughput: BSZ {measurement_results['batch_size']} tokens, " - f"Num sequences {len(reqs[0])}, throughput: " - f"{measurement_results['throughput']} tokens/s" + f"Throughput: {measurement_results['throughput']} tokens/s" ) return measurement_results @@ -116,29 +122,62 @@ def throughput_test( if not engine: raise ValueError("Please provide valid engine arguments") - warmup_reqs = prepare_synthetic_inputs_for_throughput_test( - bench_args.batch_size[0], bench_args.input_len[0], bench_args.output_len[0] + tokenizer_id = args.model_path + tokenizer = get_tokenizer(tokenizer_id) + + # Set global environmnets + set_ulimit() + random.seed(bench_args.seed) + np.random.seed(bench_args.seed) + + if args.dataset_name == "sharegpt": + assert args.random_input_len is None and args.random_output_len is None + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + elif args.dataset_name == "random": + assert args.random_input_len is not None and args.random_output_len is not None + input_requests = sample_random_requests( + input_len=args.random_input_len, + 
output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + dataset_path=args.dataset_path, + ) + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + + warmup_requests = sample_random_requests( + input_len=20, + output_len=4, + num_prompts=2, + range_ratio=0.8, + tokenizer=tokenizer, + dataset_path=args.dataset_path, ) # Warm up - throughput_test_once("warmup", engine, warmup_reqs, bench_args.output_len[0]) - - result_list = [] - for bs, il, ol in itertools.product( - bench_args.batch_size, bench_args.input_len, bench_args.output_len - ): - reqs = prepare_synthetic_inputs_for_throughput_test(bs, il, ol) - ret = throughput_test_once( - bench_args.run_name, engine, reqs, bench_args.output_len[0] - ) - if ret is not None: - result_list.append(ret) + throughput_test_once( + run_name="warmup", + engine=engine, + reqs=warmup_requests, + ) + + result = throughput_test_once( + run_name=bench_args.run_name, + engine=engine, + reqs=input_requests, + ) if bench_args.result_filename: - with jsonlines.open(bench_args.result_filename, "a") as f: - f.write_all(result_list) + with open(bench_args.result_filename, "a") as fout: + fout.write(json.dumps(result) + "\n") else: - print(result_list) + print(result) if __name__ == "__main__": From 3c8faf9b8c57a4d48ae6d01012b1b3fdc3d435b7 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 02:31:16 +0000 Subject: [PATCH 11/30] add unit test --- test/srt/test_srt_engine.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index a375c2900d5..fb6f3dbf5b7 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -18,6 +18,7 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ) +from sglang.bench_offline_throughput import throughput_test class TestSRTEngine(unittest.TestCase): @@ -152,6 +153,19 @@ def test_6_engine_runtime_encode_consistency(self): self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3)) + def test_7_engine_offline_throughput(self): + server_args = SimpleNamespace( + model_path=DEFAULT_MODEL_NAME_FOR_TEST, + ) + bench_args = SimpleNamespace( + num_prompts=10 + ) + result = throughput_test( + server_args=server_args, + bench_args=bench_args + ) + self.assertTrue(result["throughput"] > 3800) + if __name__ == "__main__": unittest.main() From 170c83f94b70391279b6b16254861cf46a7b3dd1 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 02:32:36 +0000 Subject: [PATCH 12/30] lint --- python/sglang/bench_offline_throughput.py | 27 ++++++++++++++--------- test/srt/test_srt_engine.py | 11 +++------ 2 files changed, 20 insertions(+), 18 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 88192eeef11..bac75e86846 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -11,15 +11,21 @@ import argparse import dataclasses import itertools +import json import logging import random import time from typing import Dict, List, Tuple -import json import numpy as np -from sglang.bench_serving import set_ulimit, sample_sharegpt_requests, sample_random_requests, get_tokenizer + from sglang.api import Engine as getEngine +from sglang.bench_serving import ( + get_tokenizer, + sample_random_requests, + sample_sharegpt_requests, + set_ulimit, +) from sglang.srt.server import Engine from sglang.srt.server_args import 
ServerArgs @@ -77,7 +83,6 @@ def add_cli_args(parser: argparse.ArgumentParser): ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") - @classmethod def from_cli_args(cls, args: argparse.Namespace): # use the default value's type to case the args into correct types. @@ -98,19 +103,21 @@ def throughput_test_once( } st = time.perf_counter() - gen_out = engine.generate(prompt=[r[0] for r in reqs], sampling_params={ "temperature": 0 }) + gen_out = engine.generate( + prompt=[r[0] for r in reqs], sampling_params={"temperature": 0} + ) latency = time.perf_counter() - st measurement_results["total_latency"] = latency - measurement_results["total_output_tokens"] = sum(o["meta_info"]["completion_tokens"] for o in gen_out) + measurement_results["total_output_tokens"] = sum( + o["meta_info"]["completion_tokens"] for o in gen_out + ) measurement_results["throughput"] = ( - measurement_results["total_input_tokens"] + - measurement_results["total_output_tokens"] + measurement_results["total_input_tokens"] + + measurement_results["total_output_tokens"] ) / latency - print( - f"Throughput: {measurement_results['throughput']} tokens/s" - ) + print(f"Throughput: {measurement_results['throughput']} tokens/s") return measurement_results diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index fb6f3dbf5b7..5e778f1655d 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -11,6 +11,7 @@ import torch import sglang as sgl +from sglang.bench_offline_throughput import throughput_test from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.test.few_shot_gsm8k_engine import run_eval from sglang.test.test_utils import ( @@ -18,7 +19,6 @@ DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, ) -from sglang.bench_offline_throughput import throughput_test class TestSRTEngine(unittest.TestCase): @@ -157,13 +157,8 @@ def test_7_engine_offline_throughput(self): server_args = SimpleNamespace( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = SimpleNamespace( - num_prompts=10 - ) - result = throughput_test( - server_args=server_args, - bench_args=bench_args - ) + bench_args = SimpleNamespace(num_prompts=10) + result = throughput_test(server_args=server_args, bench_args=bench_args) self.assertTrue(result["throughput"] > 3800) From 696dd95b65ee99a98755f457b353c04ef7223aba Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 21:02:05 +0000 Subject: [PATCH 13/30] add support for runtime backend + dataclass generic args --- python/sglang/bench_offline_throughput.py | 25 +++++++++++++++-------- python/sglang/test/test_utils.py | 10 ++++++++- test/srt/test_srt_engine.py | 5 +++-- 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index bac75e86846..d33b88b5b0a 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -15,7 +15,7 @@ import logging import random import time -from typing import Dict, List, Tuple +from typing import Dict, List, Tuple, Union import numpy as np @@ -26,7 +26,7 @@ sample_sharegpt_requests, set_ulimit, ) -from sglang.srt.server import Engine +from sglang.srt.server import Engine, Runtime from sglang.srt.server_args import ServerArgs @@ -35,10 +35,12 @@ class BenchArgs: run_name: str = "before" result_filename: str = "" seed: int = 1 + backend: str = "engine" @staticmethod def add_cli_args(parser: argparse.ArgumentParser): 
parser.add_argument("--run-name", type=str, default=BenchArgs.run_name) + parser.add_argument("--backend", type=str, default=BenchArgs.backend) parser.add_argument( "--result-filename", type=str, default=BenchArgs.result_filename ) @@ -94,7 +96,7 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( run_name: str, - engine: Engine, + backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], ): measurement_results = { @@ -103,7 +105,7 @@ def throughput_test_once( } st = time.perf_counter() - gen_out = engine.generate( + gen_out = backend.generate( prompt=[r[0] for r in reqs], sampling_params={"temperature": 0} ) latency = time.perf_counter() - st @@ -125,9 +127,14 @@ def throughput_test( server_args: ServerArgs, bench_args: BenchArgs, ): - engine = getEngine(**dataclasses.asdict(server_args)) - if not engine: - raise ValueError("Please provide valid engine arguments") + if bench_args.backend == "engine": + backend = getEngine(**dataclasses.asdict(server_args)) + if not backend: + raise ValueError("Please provide valid engine arguments") + elif bench_args.backend == "runtime": + backend = Runtime(**dataclasses.asdict(server_args)) + else: + raise ValueError('Please set backend to either "engine" or "runtime"') tokenizer_id = args.model_path tokenizer = get_tokenizer(tokenizer_id) @@ -170,13 +177,13 @@ def throughput_test( # Warm up throughput_test_once( run_name="warmup", - engine=engine, + backend=backend, reqs=warmup_requests, ) result = throughput_test_once( run_name=bench_args.run_name, - engine=engine, + backend=backend, reqs=input_requests, ) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index f136a4d1b64..4b59bdff173 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -10,7 +10,8 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace -from typing import Callable, List, Optional +from typing import Callable, List, Optional, Dict, Any +import dataclasses import numpy as np import requests @@ -39,6 +40,13 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" +@dataclasses.dataclass +class GenericArgs: + __dict__: Dict[str, Any] = dataclasses.field(default_factory=dict) + + def __init__(self, **kwargs): + self.__dict__.update(kwargs) + def is_in_ci(): """Return whether it is in CI runner.""" diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 5e778f1655d..834cbb866f6 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -18,6 +18,7 @@ DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, + GenericArgs, ) @@ -154,10 +155,10 @@ def test_6_engine_runtime_encode_consistency(self): self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3)) def test_7_engine_offline_throughput(self): - server_args = SimpleNamespace( + server_args = GenericArgs( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = SimpleNamespace(num_prompts=10) + bench_args = GenericArgs(num_prompts=10) result = throughput_test(server_args=server_args, bench_args=bench_args) 
self.assertTrue(result["throughput"] > 3800) From 21b6ed5c4c4dd64f08673d2efc589725ada8eb9f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 23:42:42 +0000 Subject: [PATCH 14/30] push not being processed? From 0589a6bcfad69c236fe65719cfc921b13fc57e0f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Tue, 12 Nov 2024 23:46:00 +0000 Subject: [PATCH 15/30] lint --- python/sglang/test/test_utils.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 4b59bdff173..af4abe4ceb4 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -2,6 +2,7 @@ import argparse import asyncio +import dataclasses import os import random import subprocess @@ -10,8 +11,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace -from typing import Callable, List, Optional, Dict, Any -import dataclasses +from typing import Any, Callable, Dict, List, Optional import numpy as np import requests @@ -40,6 +40,7 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_FP8_TP2 = "neuralmagic/Meta-Llama-3.1-70B-Instruct-FP8,neuralmagic/Mixtral-8x7B-Instruct-v0.1-FP8,neuralmagic/Qwen2-72B-Instruct-FP8,neuralmagic/Qwen2-57B-A14B-Instruct-FP8,neuralmagic/DeepSeek-Coder-V2-Lite-Instruct-FP8" DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" + @dataclasses.dataclass class GenericArgs: __dict__: Dict[str, Any] = dataclasses.field(default_factory=dict) From 383b6d1a7091295cee395b398ff1840f44fc8328 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 01:59:32 +0000 Subject: [PATCH 16/30] fix benches --- python/sglang/bench_offline_throughput.py | 56 +++++++++++++++-------- python/sglang/test/test_utils.py | 10 +--- test/srt/test_srt_engine.py | 10 ++-- 3 files changed, 43 insertions(+), 33 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index d33b88b5b0a..557b810de06 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -33,9 +33,16 @@ @dataclasses.dataclass class BenchArgs: run_name: str = "before" + backend: str = "engine" result_filename: str = "" + dataset_name: str = "sharegpt" + dataset_path: str = "" + num_prompts: int = 1000 + sharegpt_output_len: Union[int, None] = None + random_input_len: Union[int, None] = None + random_output_len: Union[int, None] = None + random_range_ratio: Union[int, None] = None seed: int = 1 - backend: str = "engine" @staticmethod def add_cli_args(parser: argparse.ArgumentParser): @@ -57,7 +64,7 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--num-prompts", type=int, - default=1000, + default=BenchArgs.num_prompts, help="Number of prompts to process. 
Default is 1000.", ) parser.add_argument( @@ -96,6 +103,7 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( run_name: str, + backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], ): @@ -110,6 +118,9 @@ def throughput_test_once( ) latency = time.perf_counter() - st + if backend_name == "runtime": + gen_out = json.loads(gen_out) + measurement_results["total_latency"] = latency measurement_results["total_output_tokens"] = sum( o["meta_info"]["completion_tokens"] for o in gen_out @@ -136,7 +147,7 @@ def throughput_test( else: raise ValueError('Please set backend to either "engine" or "runtime"') - tokenizer_id = args.model_path + tokenizer_id = server_args.model_path tokenizer = get_tokenizer(tokenizer_id) # Set global environmnets @@ -144,26 +155,31 @@ def throughput_test( random.seed(bench_args.seed) np.random.seed(bench_args.seed) - if args.dataset_name == "sharegpt": - assert args.random_input_len is None and args.random_output_len is None + if bench_args.dataset_name == "sharegpt": + assert ( + bench_args.random_input_len is None and bench_args.random_output_len is None + ) input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, + dataset_path=bench_args.dataset_path, + num_requests=bench_args.num_prompts, tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, + fixed_output_len=bench_args.sharegpt_output_len, + ) + elif bench_args.dataset_name == "random": + assert ( + bench_args.random_input_len is not None + and bench_args.random_output_len is not None ) - elif args.dataset_name == "random": - assert args.random_input_len is not None and args.random_output_len is not None input_requests = sample_random_requests( - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, + input_len=bench_args.random_input_len, + output_len=bench_args.random_output_len, + num_prompts=bench_args.num_prompts, + range_ratio=bench_args.random_range_ratio, tokenizer=tokenizer, - dataset_path=args.dataset_path, + dataset_path=bench_args.dataset_path, ) else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") warmup_requests = sample_random_requests( input_len=20, @@ -171,18 +187,20 @@ def throughput_test( num_prompts=2, range_ratio=0.8, tokenizer=tokenizer, - dataset_path=args.dataset_path, + dataset_path=bench_args.dataset_path, ) # Warm up throughput_test_once( run_name="warmup", + backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, ) result = throughput_test_once( run_name=bench_args.run_name, + backend_name=bench_args.backend, backend=backend, reqs=input_requests, ) @@ -191,7 +209,7 @@ def throughput_test( with open(bench_args.result_filename, "a") as fout: fout.write(json.dumps(result) + "\n") else: - print(result) + return result if __name__ == "__main__": @@ -208,6 +226,6 @@ def throughput_test( ) try: - throughput_test(server_args, bench_args) + print(throughput_test(server_args, bench_args)) except Exception as e: raise e diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index af4abe4ceb4..8b979ead6d1 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -11,7 +11,7 @@ from concurrent.futures import ThreadPoolExecutor from functools import partial from types import SimpleNamespace -from typing import Any, Callable, Dict, List, Optional 
+from typing import Callable, List, Optional import numpy as np import requests @@ -41,14 +41,6 @@ DEFAULT_MODEL_NAME_FOR_NIGHTLY_EVAL_QUANT_TP1 = "hugging-quants/Meta-Llama-3.1-8B-Instruct-AWQ-INT4,hugging-quants/Meta-Llama-3.1-8B-Instruct-GPTQ-INT4" -@dataclasses.dataclass -class GenericArgs: - __dict__: Dict[str, Any] = dataclasses.field(default_factory=dict) - - def __init__(self, **kwargs): - self.__dict__.update(kwargs) - - def is_in_ci(): """Return whether it is in CI runner.""" return os.getenv("SGLANG_IS_IN_CI", "false") == "true" diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 834cbb866f6..16e23a92bad 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -11,14 +11,14 @@ import torch import sglang as sgl -from sglang.bench_offline_throughput import throughput_test +from sglang.bench_offline_throughput import throughput_test, BenchArgs +from sglang.srt.server_args import ServerArgs from sglang.srt.hf_transformers_utils import get_tokenizer from sglang.test.few_shot_gsm8k_engine import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_EMBEDDING_MODEL_NAME_FOR_TEST, DEFAULT_SMALL_MODEL_NAME_FOR_TEST, - GenericArgs, ) @@ -155,12 +155,12 @@ def test_6_engine_runtime_encode_consistency(self): self.assertTrue(torch.allclose(out1, out2, atol=1e-5, rtol=1e-3)) def test_7_engine_offline_throughput(self): - server_args = GenericArgs( + server_args = ServerArgs( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = GenericArgs(num_prompts=10) + bench_args = BenchArgs(num_prompts=10) result = throughput_test(server_args=server_args, bench_args=bench_args) - self.assertTrue(result["throughput"] > 3800) + self.assertTrue(result["throughput"] > 3000) if __name__ == "__main__": From 8db0340a41649afd253b38495e3ba11c0e864883 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 02:01:31 +0000 Subject: [PATCH 17/30] lint --- python/sglang/test/test_utils.py | 1 - test/srt/test_srt_engine.py | 4 ++-- 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/python/sglang/test/test_utils.py b/python/sglang/test/test_utils.py index 8b979ead6d1..f136a4d1b64 100644 --- a/python/sglang/test/test_utils.py +++ b/python/sglang/test/test_utils.py @@ -2,7 +2,6 @@ import argparse import asyncio -import dataclasses import os import random import subprocess diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 16e23a92bad..6170118950c 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -11,9 +11,9 @@ import torch import sglang as sgl -from sglang.bench_offline_throughput import throughput_test, BenchArgs -from sglang.srt.server_args import ServerArgs +from sglang.bench_offline_throughput import BenchArgs, throughput_test from sglang.srt.hf_transformers_utils import get_tokenizer +from sglang.srt.server_args import ServerArgs from sglang.test.few_shot_gsm8k_engine import run_eval from sglang.test.test_utils import ( DEFAULT_MODEL_NAME_FOR_TEST, From c6a6827003add1e584f4ffada0dea92aa57e83b8 Mon Sep 17 00:00:00 2001 From: ByronHsu Date: Wed, 13 Nov 2024 05:19:15 +0000 Subject: [PATCH 18/30] add review --- python/sglang/bench_offline_throughput.py | 31 +++++++++++++---------- 1 file changed, 18 insertions(+), 13 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 557b810de06..ac80a1d2997 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ 
-4,8 +4,10 @@ It accepts the same arguments as bench_latency.py # Usage +# TODO: is this runnable? python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl +# TODO: add running command for shared gpt, random, and gen-shared-prefix dataset """ import argparse @@ -32,16 +34,22 @@ @dataclasses.dataclass class BenchArgs: + # TODO: what does "before" mean run_name: str = "before" backend: str = "engine" result_filename: str = "" dataset_name: str = "sharegpt" dataset_path: str = "" num_prompts: int = 1000 - sharegpt_output_len: Union[int, None] = None - random_input_len: Union[int, None] = None - random_output_len: Union[int, None] = None - random_range_ratio: Union[int, None] = None + # TODO: with None, the program crashes with + # bench_offline_throughput.py", line 101, in + # **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} + # TypeError: NoneType takes no arguments + # Ideally we want to make it easier to run with specified default values, so users dont have to keep trial and errors + sharegpt_output_len: int = 256 + random_input_len: int = 256 + random_output_len: int = 256 + random_range_ratio: float = 0.0 seed: int = 1 @staticmethod @@ -70,23 +78,25 @@ def add_cli_args(parser: argparse.ArgumentParser): parser.add_argument( "--sharegpt-output-len", type=int, - default=None, + default=BenchArgs.sharegpt_output_len, help="Output length for each request. Overrides the output length from the ShareGPT dataset.", ) parser.add_argument( "--random-input-len", type=int, + default=BenchArgs.random_input_len, help="Number of input tokens per request, used only for random dataset.", ) parser.add_argument( "--random-output-len", type=int, + default=BenchArgs.random_output_len, help="Number of output tokens per request, used only for random dataset.", ) parser.add_argument( "--random-range-ratio", type=float, - default=0.0, + default=BenchArgs.random_range_ratio, help="Range of sampled ratio of input/output length, " "used only for random dataset.", ) @@ -96,6 +106,7 @@ def add_cli_args(parser: argparse.ArgumentParser): def from_cli_args(cls, args: argparse.Namespace): # use the default value's type to case the args into correct types. 
attrs = [(attr.name, type(attr.default)) for attr in dataclasses.fields(cls)] + print(attrs) return cls( **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} ) @@ -156,9 +167,6 @@ def throughput_test( np.random.seed(bench_args.seed) if bench_args.dataset_name == "sharegpt": - assert ( - bench_args.random_input_len is None and bench_args.random_output_len is None - ) input_requests = sample_sharegpt_requests( dataset_path=bench_args.dataset_path, num_requests=bench_args.num_prompts, @@ -166,10 +174,6 @@ def throughput_test( fixed_output_len=bench_args.sharegpt_output_len, ) elif bench_args.dataset_name == "random": - assert ( - bench_args.random_input_len is not None - and bench_args.random_output_len is not None - ) input_requests = sample_random_requests( input_len=bench_args.random_input_len, output_len=bench_args.random_output_len, @@ -178,6 +182,7 @@ def throughput_test( tokenizer=tokenizer, dataset_path=bench_args.dataset_path, ) + # TODO: gen-shared-prefix dataset else: raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") From ed1a1333e607296213571ae35c8a57b694eacca9 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 07:07:02 +0000 Subject: [PATCH 19/30] address todos --- python/sglang/bench_offline_throughput.py | 63 +++++++++++++++++------ 1 file changed, 48 insertions(+), 15 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index ac80a1d2997..8fc892ac2ad 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -4,8 +4,14 @@ It accepts the same arguments as bench_latency.py # Usage -# TODO: is this runnable? -python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --batch 1 12 14 --input-len 256 512 --output-len 32 256 --result-filename out.jsonl +## Sharegpt dataset with default args +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct + +## Random dataset with default args +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend random + +## Shared prefix dataset with default args +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend generated-shared-prefix # TODO: add running command for shared gpt, random, and gen-shared-prefix dataset """ @@ -26,6 +32,7 @@ get_tokenizer, sample_random_requests, sample_sharegpt_requests, + sample_generated_shared_prefix_requests, set_ulimit, ) from sglang.srt.server import Engine, Runtime @@ -34,27 +41,23 @@ @dataclasses.dataclass class BenchArgs: - # TODO: what does "before" mean - run_name: str = "before" backend: str = "engine" result_filename: str = "" dataset_name: str = "sharegpt" dataset_path: str = "" num_prompts: int = 1000 - # TODO: with None, the program crashes with - # bench_offline_throughput.py", line 101, in - # **{attr: attr_type(getattr(args, attr)) for attr, attr_type in attrs} - # TypeError: NoneType takes no arguments - # Ideally we want to make it easier to run with specified default values, so users dont have to keep trial and errors sharegpt_output_len: int = 256 random_input_len: int = 256 random_output_len: int = 256 random_range_ratio: float = 0.0 + gen_num_groups: int = (8,) + gen_prompts_per_group: int = (16,) + gen_system_prompt_len: int = (128,) + gen_question_len: int = (256,) seed: int = 1 @staticmethod def add_cli_args(parser: argparse.ArgumentParser): - parser.add_argument("--run-name", 
type=str, default=BenchArgs.run_name) parser.add_argument("--backend", type=str, default=BenchArgs.backend) parser.add_argument( "--result-filename", type=str, default=BenchArgs.result_filename @@ -100,6 +103,32 @@ def add_cli_args(parser: argparse.ArgumentParser): help="Range of sampled ratio of input/output length, " "used only for random dataset.", ) + parser.add_argument( + "--gen-num-groups", + type=int, + default=BenchArgs.gen_num_groups, + help="Number of groups with shared prefix, used" + "only for generate-shared-prefix", + ) + parser.add_argument( + "--gen-prompts-per-group", + type=int, + default=BenchArgs.gen_prompts_per_group, + help="Number of prompts per group of shared prefix, used" + "only for generate-shared-prefix", + ) + parser.add_argument( + "--gen-system-prompt-len", + type=int, + default=BenchArgs.gen_system_prompt_len, + help="System prompt length, used" "only for generate-shared-prefix", + ) + parser.add_argument( + "--gen-question-len", + type=int, + default=BenchArgs.gen_question_len, + help="Question length, used" "only for generate-shared-prefix", + ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") @classmethod @@ -113,13 +142,11 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( - run_name: str, backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], ): measurement_results = { - "run_name": run_name, "total_input_tokens": sum(r[1] for r in reqs), } @@ -182,7 +209,15 @@ def throughput_test( tokenizer=tokenizer, dataset_path=bench_args.dataset_path, ) - # TODO: gen-shared-prefix dataset + elif bench_args.dataset_name == "generated-shared-prefix": + input_requests = sample_generated_shared_prefix_requests( + num_groups=bench_args.gen_num_groups, + prompts_per_group=bench_args.gen_prompts_per_group, + system_prompt_len=bench_args.gen_system_prompt_len, + question_len=bench_args.gen_question_len, + output_len=bench_args.gen_output_len, + tokenizer=tokenizer, + ) else: raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") @@ -197,14 +232,12 @@ def throughput_test( # Warm up throughput_test_once( - run_name="warmup", backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, ) result = throughput_test_once( - run_name=bench_args.run_name, backend_name=bench_args.backend, backend=backend, reqs=input_requests, From c485dbeeefade7841c468f33d5494e45a1b98f9f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 08:54:18 +0000 Subject: [PATCH 20/30] not sure how the tuple stuff got there --- python/sglang/bench_offline_throughput.py | 10 +++++----- python/sglang/bench_serving.py | 1 - 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 8fc892ac2ad..9a86ef2ace1 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -30,9 +30,9 @@ from sglang.api import Engine as getEngine from sglang.bench_serving import ( get_tokenizer, + sample_generated_shared_prefix_requests, sample_random_requests, sample_sharegpt_requests, - sample_generated_shared_prefix_requests, set_ulimit, ) from sglang.srt.server import Engine, Runtime @@ -50,10 +50,10 @@ class BenchArgs: random_input_len: int = 256 random_output_len: int = 256 random_range_ratio: float = 0.0 - gen_num_groups: int = (8,) - gen_prompts_per_group: int = (16,) - gen_system_prompt_len: int = (128,) - gen_question_len: int = (256,) + gen_num_groups: int = 8 + 
gen_prompts_per_group: int = 16 + gen_system_prompt_len: int = 128 + gen_question_len: int = 256 seed: int = 1 @staticmethod diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index c0cf946ede9..686be14b910 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -590,7 +590,6 @@ def sample_random_requests( (data["conversations"][0]["value"], data["conversations"][1]["value"]) for data in dataset ] - # Shuffle the dataset. random.shuffle(dataset) From fd2d04ddef3683ca2d6eb8eaa61d829180dffa1b Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 09:56:28 +0000 Subject: [PATCH 21/30] fix --- python/sglang/bench_offline_throughput.py | 33 ++++++++++++++++++----- 1 file changed, 26 insertions(+), 7 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 9a86ef2ace1..018567cb3c3 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -25,8 +25,6 @@ import time from typing import Dict, List, Tuple, Union -import numpy as np - from sglang.api import Engine as getEngine from sglang.bench_serving import ( get_tokenizer, @@ -54,6 +52,7 @@ class BenchArgs: gen_prompts_per_group: int = 16 gen_system_prompt_len: int = 128 gen_question_len: int = 256 + disable_ignore_eos: bool = False seed: int = 1 @staticmethod @@ -129,6 +128,12 @@ def add_cli_args(parser: argparse.ArgumentParser): default=BenchArgs.gen_question_len, help="Question length, used" "only for generate-shared-prefix", ) + parser.add_argument( + "--disable-ignore-eos", + type=bool, + default=BenchArgs.disable_ignore_eos, + help="Disable ignore EOS token", + ) parser.add_argument("--seed", type=int, default=1, help="The random seed.") @classmethod @@ -145,15 +150,22 @@ def throughput_test_once( backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], + output_len: int, + ignore_eos: bool, ): measurement_results = { "total_input_tokens": sum(r[1] for r in reqs), } + prompt = [r[0] for r in reqs] + sampling_params = { + "temperature": 0, + "max_new_tokens": output_len, + "ignore_eos": ignore_eos, + } + st = time.perf_counter() - gen_out = backend.generate( - prompt=[r[0] for r in reqs], sampling_params={"temperature": 0} - ) + gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params) latency = time.perf_counter() - st if backend_name == "runtime": @@ -200,6 +212,7 @@ def throughput_test( tokenizer=tokenizer, fixed_output_len=bench_args.sharegpt_output_len, ) + output_len = bench_args.sharegpt_output_len elif bench_args.dataset_name == "random": input_requests = sample_random_requests( input_len=bench_args.random_input_len, @@ -209,6 +222,7 @@ def throughput_test( tokenizer=tokenizer, dataset_path=bench_args.dataset_path, ) + output_len = bench_args.random_output_len elif bench_args.dataset_name == "generated-shared-prefix": input_requests = sample_generated_shared_prefix_requests( num_groups=bench_args.gen_num_groups, @@ -218,6 +232,7 @@ def throughput_test( output_len=bench_args.gen_output_len, tokenizer=tokenizer, ) + output_len = bench_args.gen_output_len else: raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") @@ -235,19 +250,23 @@ def throughput_test( backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, + output_len=output_len, + ignore_eos=not bench_args.disable_ignore_eos, ) result = throughput_test_once( backend_name=bench_args.backend, backend=backend, reqs=input_requests, + 
output_len=output_len, + ignore_eos=not bench_args.disable_ignore_eos, ) if bench_args.result_filename: with open(bench_args.result_filename, "a") as fout: fout.write(json.dumps(result) + "\n") - else: - return result + + return result if __name__ == "__main__": From ea3b60a3d989d9321b9c79044c82ca6b8c3aa82f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 20:56:25 +0000 Subject: [PATCH 22/30] fix --- python/sglang/bench_offline_throughput.py | 41 +++------------ python/sglang/bench_serving.py | 64 +++++++++++++---------- 2 files changed, 43 insertions(+), 62 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 018567cb3c3..4d88b1c0bd2 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -8,12 +8,13 @@ python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct ## Random dataset with default args -python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend random +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name random ## Shared prefix dataset with default args -python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend generated-shared-prefix +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --dataset-name generated-shared-prefix -# TODO: add running command for shared gpt, random, and gen-shared-prefix dataset +## Sharegpt dataset on runtime backend +python -m sglang.bench_offline_throughput --model-path meta-llama/Meta-Llama-3-8B-Instruct --backend runtime """ import argparse @@ -25,9 +26,12 @@ import time from typing import Dict, List, Tuple, Union +import numpy as np + from sglang.api import Engine as getEngine from sglang.bench_serving import ( get_tokenizer, + get_dataset, sample_generated_shared_prefix_requests, sample_random_requests, sample_sharegpt_requests, @@ -205,36 +209,7 @@ def throughput_test( random.seed(bench_args.seed) np.random.seed(bench_args.seed) - if bench_args.dataset_name == "sharegpt": - input_requests = sample_sharegpt_requests( - dataset_path=bench_args.dataset_path, - num_requests=bench_args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=bench_args.sharegpt_output_len, - ) - output_len = bench_args.sharegpt_output_len - elif bench_args.dataset_name == "random": - input_requests = sample_random_requests( - input_len=bench_args.random_input_len, - output_len=bench_args.random_output_len, - num_prompts=bench_args.num_prompts, - range_ratio=bench_args.random_range_ratio, - tokenizer=tokenizer, - dataset_path=bench_args.dataset_path, - ) - output_len = bench_args.random_output_len - elif bench_args.dataset_name == "generated-shared-prefix": - input_requests = sample_generated_shared_prefix_requests( - num_groups=bench_args.gen_num_groups, - prompts_per_group=bench_args.gen_prompts_per_group, - system_prompt_len=bench_args.gen_system_prompt_len, - question_len=bench_args.gen_question_len, - output_len=bench_args.gen_output_len, - tokenizer=tokenizer, - ) - output_len = bench_args.gen_output_len - else: - raise ValueError(f"Unknown dataset: {bench_args.dataset_name}") + input_requests = get_dataset(bench_args, tokenizer) warmup_requests = sample_random_requests( input_len=20, diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 686be14b910..344142f1be9 100644 --- 
a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -421,6 +421,40 @@ def get_tokenizer( ) +def get_dataset(args, tokenizer): + if args.dataset_name == "sharegpt": + input_requests = sample_sharegpt_requests( + dataset_path=args.dataset_path, + num_requests=args.num_prompts, + tokenizer=tokenizer, + fixed_output_len=args.sharegpt_output_len, + ) + output_len = args.sharegpt_output_len + elif args.dataset_name == "random": + input_requests = sample_random_requests( + input_len=args.random_input_len, + output_len=args.random_output_len, + num_prompts=args.num_prompts, + range_ratio=args.random_range_ratio, + tokenizer=tokenizer, + dataset_path=args.dataset_path, + ) + output_len = args.random_output_len + elif args.dataset_name == "generated-shared-prefix": + input_requests = sample_generated_shared_prefix_requests( + num_groups=args.gen_num_groups, + prompts_per_group=args.gen_prompts_per_group, + system_prompt_len=args.gen_system_prompt_len, + question_len=args.gen_question_len, + output_len=args.gen_output_len, + tokenizer=tokenizer, + ) + output_len = args.gen_output_len + else: + raise ValueError(f"Unknown dataset: {args.dataset_name}") + return output_len + + ASYNC_REQUEST_FUNCS = { "sglang": async_request_sglang_generate, "sglang-native": async_request_sglang_generate, @@ -1097,35 +1131,7 @@ def run_benchmark(args_: argparse.Namespace): tokenizer = get_tokenizer(tokenizer_id) - if args.dataset_name == "sharegpt": - assert args.random_input_len is None and args.random_output_len is None - input_requests = sample_sharegpt_requests( - dataset_path=args.dataset_path, - num_requests=args.num_prompts, - tokenizer=tokenizer, - fixed_output_len=args.sharegpt_output_len, - ) - elif args.dataset_name == "random": - assert args.random_input_len is not None and args.random_output_len is not None - input_requests = sample_random_requests( - input_len=args.random_input_len, - output_len=args.random_output_len, - num_prompts=args.num_prompts, - range_ratio=args.random_range_ratio, - tokenizer=tokenizer, - dataset_path=args.dataset_path, - ) - elif args.dataset_name == "generated-shared-prefix": - input_requests = sample_generated_shared_prefix_requests( - num_groups=args.gen_num_groups, - prompts_per_group=args.gen_prompts_per_group, - system_prompt_len=args.gen_system_prompt_len, - question_len=args.gen_question_len, - output_len=args.gen_output_len, - tokenizer=tokenizer, - ) - else: - raise ValueError(f"Unknown dataset: {args.dataset_name}") + input_requests = get_dataset(args, tokenizer) if not args.multi: return asyncio.run( From 732e3babb1ebeaf8380a02075ac368308528c51f Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Wed, 13 Nov 2024 20:58:36 +0000 Subject: [PATCH 23/30] lint --- python/sglang/bench_offline_throughput.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 4d88b1c0bd2..c856db47c69 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -30,8 +30,8 @@ from sglang.api import Engine as getEngine from sglang.bench_serving import ( - get_tokenizer, get_dataset, + get_tokenizer, sample_generated_shared_prefix_requests, sample_random_requests, sample_sharegpt_requests, From 41aad44e495de46e8b90fdbd37732c1a77159d2d Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 03:22:31 +0000 Subject: [PATCH 24/30] format benchmark + add diff metrics --- python/sglang/bench_offline_throughput.py | 76 
++++++++++++++++++++--- python/sglang/bench_serving.py | 9 +++ python/sglang/srt/server.py | 2 +- 3 files changed, 76 insertions(+), 11 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index c856db47c69..03ae4032e06 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -1,7 +1,7 @@ """ Benchmark the throughput of using the offline LLM engine. This script does not launch a server. -It accepts the same arguments as bench_latency.py +It accepts the same arguments as launch_server.py and additional benchmark arguments # Usage ## Sharegpt dataset with default args @@ -154,19 +154,29 @@ def throughput_test_once( backend_name: str, backend: Union[Engine, Runtime], reqs: List[Tuple[str, int, int]], - output_len: int, ignore_eos: bool, ): measurement_results = { + "backend": backend_name, + "successful_requests": len(reqs), + "total_latency": -1, "total_input_tokens": sum(r[1] for r in reqs), + "total_output_tokens": -1, + "request_throughput": -1, + "input_throughput": -1, + "output_throughput": -1, + "total_throughput": -1, } prompt = [r[0] for r in reqs] - sampling_params = { - "temperature": 0, - "max_new_tokens": output_len, - "ignore_eos": ignore_eos, - } + sampling_params = [ + { + "temperature": 0, + "max_new_tokens": r[2], + "ignore_eos": ignore_eos, + } + for r in reqs + ] st = time.perf_counter() gen_out = backend.generate(prompt=prompt, sampling_params=sampling_params) @@ -179,12 +189,20 @@ def throughput_test_once( measurement_results["total_output_tokens"] = sum( o["meta_info"]["completion_tokens"] for o in gen_out ) - measurement_results["throughput"] = ( + measurement_results["request_throughput"] = ( + measurement_results["successful_requests"] / latency + ) + measurement_results["input_throughput"] = ( + measurement_results["total_input_tokens"] / latency + ) + measurement_results["output_throughput"] = ( + measurement_results["total_output_tokens"] / latency + ) + measurement_results["total_throughput"] = ( measurement_results["total_input_tokens"] + measurement_results["total_output_tokens"] ) / latency - print(f"Throughput: {measurement_results['throughput']} tokens/s") return measurement_results @@ -258,6 +276,44 @@ def throughput_test( ) try: - print(throughput_test(server_args, bench_args)) + res = throughput_test(server_args, bench_args) + print( + "\n{s:{c}^{n}}".format( + s=" Offline Throughput Benchmark Result ", n=50, c="=" + ) + ) + print("{:<40} {:<10}".format("Backend:", res["backend"])) + print( + "{:<40} {:<10}".format("Successful requests:", res["successful_requests"]) + ) + print( + "{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"]) + ) + print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) + print( + "{:<40} {:<10}".format( + "Total generated tokens:", res["total_output_tokens"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", res["request_throughput"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Input token throughput (tok/s):", res["input_throughput"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", res["output_throughput"] + ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", res["total_throughput"] + ) + ) except Exception as e: raise e diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 344142f1be9..3f2c680e7ad 100644 --- a/python/sglang/bench_serving.py +++ 
b/python/sglang/bench_serving.py @@ -477,6 +477,8 @@ class BenchmarkMetrics: input_throughput: float output_throughput: float output_throughput_retokenized: float + total_throughput: float + total_throughput_retokenized: float mean_ttft_ms: float median_ttft_ms: float std_ttft_ms: float @@ -797,6 +799,8 @@ def calculate_metrics( input_throughput=total_input / dur_s, output_throughput=sum(output_lens) / dur_s, output_throughput_retokenized=sum(retokenized_output_lens) / dur_s, + total_throughput=(total_input + sum(output_lens)) / dur_s, + total_throughput_retokenized=(total_input + sum(retokenized_output_lens)) / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, @@ -914,6 +918,11 @@ async def benchmark( "Output token throughput (tok/s):", metrics.output_throughput ) ) + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", metrics.total_throughput + ) + ) print("{s:{c}^{n}}".format(s="End-to-End Latency", n=50, c="-")) print( "{:<40} {:<10.2f}".format("Mean E2E Latency (ms):", metrics.mean_e2e_latency_ms) diff --git a/python/sglang/srt/server.py b/python/sglang/srt/server.py index e27bb1bb97b..e4798877a83 100644 --- a/python/sglang/srt/server.py +++ b/python/sglang/srt/server.py @@ -768,7 +768,7 @@ def generate( self, # The input prompt. It can be a single prompt or a batch of prompts. prompt: Optional[Union[List[str], str]] = None, - sampling_params: Optional[Dict] = None, + sampling_params: Optional[Union[List[Dict], Dict]] = None, # The token ids for text; one can either specify text or input_ids. input_ids: Optional[Union[List[List[int]], List[int]]] = None, return_logprob: Optional[Union[List[bool], bool]] = False, From fa76ac95017801ae72356b079891fbaafbaff893 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 03:22:48 +0000 Subject: [PATCH 25/30] lint --- python/sglang/bench_serving.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 3f2c680e7ad..9dfc6b3bff2 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -800,7 +800,8 @@ def calculate_metrics( output_throughput=sum(output_lens) / dur_s, output_throughput_retokenized=sum(retokenized_output_lens) / dur_s, total_throughput=(total_input + sum(output_lens)) / dur_s, - total_throughput_retokenized=(total_input + sum(retokenized_output_lens)) / dur_s, + total_throughput_retokenized=(total_input + sum(retokenized_output_lens)) + / dur_s, mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, From cc2a5c56f68c220625fa258ed9a3f00e220c9446 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 04:22:44 +0000 Subject: [PATCH 26/30] fix script --- python/sglang/bench_offline_throughput.py | 3 +-- python/sglang/bench_serving.py | 5 +---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 03ae4032e06..e9729cfbf1b 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -243,7 +243,6 @@ def throughput_test( backend_name=bench_args.backend, backend=backend, reqs=warmup_requests, - output_len=output_len, ignore_eos=not bench_args.disable_ignore_eos, ) @@ -251,7 +250,6 @@ def throughput_test( backend_name=bench_args.backend, backend=backend, 
reqs=input_requests, - output_len=output_len, ignore_eos=not bench_args.disable_ignore_eos, ) @@ -315,5 +313,6 @@ def throughput_test( "Total token throughput (tok/s):", res["total_throughput"] ) ) + print("=" * 50) except Exception as e: raise e diff --git a/python/sglang/bench_serving.py b/python/sglang/bench_serving.py index 9dfc6b3bff2..68c67241302 100644 --- a/python/sglang/bench_serving.py +++ b/python/sglang/bench_serving.py @@ -429,7 +429,6 @@ def get_dataset(args, tokenizer): tokenizer=tokenizer, fixed_output_len=args.sharegpt_output_len, ) - output_len = args.sharegpt_output_len elif args.dataset_name == "random": input_requests = sample_random_requests( input_len=args.random_input_len, @@ -439,7 +438,6 @@ def get_dataset(args, tokenizer): tokenizer=tokenizer, dataset_path=args.dataset_path, ) - output_len = args.random_output_len elif args.dataset_name == "generated-shared-prefix": input_requests = sample_generated_shared_prefix_requests( num_groups=args.gen_num_groups, @@ -449,10 +447,9 @@ def get_dataset(args, tokenizer): output_len=args.gen_output_len, tokenizer=tokenizer, ) - output_len = args.gen_output_len else: raise ValueError(f"Unknown dataset: {args.dataset_name}") - return output_len + return input_requests ASYNC_REQUEST_FUNCS = { From e1045e48d011e5981005874c32807af74034263b Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 04:36:40 +0000 Subject: [PATCH 27/30] fix test --- test/srt/test_srt_engine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 6170118950c..6e630d4848d 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -160,7 +160,7 @@ def test_7_engine_offline_throughput(self): ) bench_args = BenchArgs(num_prompts=10) result = throughput_test(server_args=server_args, bench_args=bench_args) - self.assertTrue(result["throughput"] > 3000) + self.assertTrue(result["total_throughput"] > 3000) if __name__ == "__main__": From 4a322a31f38bb727bf62bc4337a70f93e01d1099 Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 07:57:23 +0000 Subject: [PATCH 28/30] fix --- python/sglang/bench_offline_throughput.py | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index e9729cfbf1b..4c66e401490 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -19,25 +19,22 @@ import argparse import dataclasses -import itertools import json import logging import random import time -from typing import Dict, List, Tuple, Union +from typing import List, Tuple import numpy as np -from sglang.api import Engine as getEngine +from sglang.api import Engine from sglang.bench_serving import ( get_dataset, get_tokenizer, - sample_generated_shared_prefix_requests, sample_random_requests, - sample_sharegpt_requests, set_ulimit, ) -from sglang.srt.server import Engine, Runtime +from sglang.srt.server import Runtime from sglang.srt.server_args import ServerArgs @@ -152,7 +149,7 @@ def from_cli_args(cls, args: argparse.Namespace): def throughput_test_once( backend_name: str, - backend: Union[Engine, Runtime], + backend, reqs: List[Tuple[str, int, int]], ignore_eos: bool, ): @@ -211,7 +208,7 @@ def throughput_test( bench_args: BenchArgs, ): if bench_args.backend == "engine": - backend = getEngine(**dataclasses.asdict(server_args)) + backend = Engine(**dataclasses.asdict(server_args)) if not backend: raise 
ValueError("Please provide valid engine arguments") elif bench_args.backend == "runtime": From ef4f2782d57e704d18b42d824316d34b1d9a52ff Mon Sep 17 00:00:00 2001 From: zolinthecow Date: Thu, 14 Nov 2024 07:58:46 +0000 Subject: [PATCH 29/30] remove useless try except --- python/sglang/bench_offline_throughput.py | 65 +++++++++-------------- 1 file changed, 26 insertions(+), 39 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 4c66e401490..104d44cbf91 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -270,46 +270,33 @@ def throughput_test( format="%(message)s", ) - try: - res = throughput_test(server_args, bench_args) - print( - "\n{s:{c}^{n}}".format( - s=" Offline Throughput Benchmark Result ", n=50, c="=" - ) - ) - print("{:<40} {:<10}".format("Backend:", res["backend"])) - print( - "{:<40} {:<10}".format("Successful requests:", res["successful_requests"]) - ) - print( - "{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"]) - ) - print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) - print( - "{:<40} {:<10}".format( - "Total generated tokens:", res["total_output_tokens"] - ) - ) - print( - "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", res["request_throughput"] - ) + res = throughput_test(server_args, bench_args) + print( + "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=") + ) + print("{:<40} {:<10}".format("Backend:", res["backend"])) + print("{:<40} {:<10}".format("Successful requests:", res["successful_requests"])) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"])) + print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) + print("{:<40} {:<10}".format("Total generated tokens:", res["total_output_tokens"])) + print( + "{:<40} {:<10.2f}".format( + "Request throughput (req/s):", res["request_throughput"] ) - print( - "{:<40} {:<10.2f}".format( - "Input token throughput (tok/s):", res["input_throughput"] - ) + ) + print( + "{:<40} {:<10.2f}".format( + "Input token throughput (tok/s):", res["input_throughput"] ) - print( - "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", res["output_throughput"] - ) + ) + print( + "{:<40} {:<10.2f}".format( + "Output token throughput (tok/s):", res["output_throughput"] ) - print( - "{:<40} {:<10.2f}".format( - "Total token throughput (tok/s):", res["total_throughput"] - ) + ) + print( + "{:<40} {:<10.2f}".format( + "Total token throughput (tok/s):", res["total_throughput"] ) - print("=" * 50) - except Exception as e: - raise e + ) + print("=" * 50) From df9da2e43bb7f1480ab8f50f7aa5c1d0f561b05a Mon Sep 17 00:00:00 2001 From: ByronHsu Date: Fri, 15 Nov 2024 05:38:26 +0000 Subject: [PATCH 30/30] fix test and move logging --- python/sglang/bench_offline_throughput.py | 55 ++++++++++++----------- test/srt/test_srt_engine.py | 2 +- 2 files changed, 30 insertions(+), 27 deletions(-) diff --git a/python/sglang/bench_offline_throughput.py b/python/sglang/bench_offline_throughput.py index 104d44cbf91..3c57e1144c0 100644 --- a/python/sglang/bench_offline_throughput.py +++ b/python/sglang/bench_offline_throughput.py @@ -254,49 +254,52 @@ def throughput_test( with open(bench_args.result_filename, "a") as fout: fout.write(json.dumps(result) + "\n") - return result - - -if __name__ == "__main__": - parser = argparse.ArgumentParser() - ServerArgs.add_cli_args(parser) - 
BenchArgs.add_cli_args(parser) - args = parser.parse_args() - server_args = ServerArgs.from_cli_args(args) - bench_args = BenchArgs.from_cli_args(args) - - logging.basicConfig( - level=getattr(logging, server_args.log_level.upper()), - format="%(message)s", - ) - - res = throughput_test(server_args, bench_args) print( "\n{s:{c}^{n}}".format(s=" Offline Throughput Benchmark Result ", n=50, c="=") ) - print("{:<40} {:<10}".format("Backend:", res["backend"])) - print("{:<40} {:<10}".format("Successful requests:", res["successful_requests"])) - print("{:<40} {:<10.2f}".format("Benchmark duration (s):", res["total_latency"])) - print("{:<40} {:<10}".format("Total input tokens:", res["total_input_tokens"])) - print("{:<40} {:<10}".format("Total generated tokens:", res["total_output_tokens"])) + print("{:<40} {:<10}".format("Backend:", result["backend"])) + print("{:<40} {:<10}".format("Successful requests:", result["successful_requests"])) + print("{:<40} {:<10.2f}".format("Benchmark duration (s):", result["total_latency"])) + print("{:<40} {:<10}".format("Total input tokens:", result["total_input_tokens"])) + print( + "{:<40} {:<10}".format("Total generated tokens:", result["total_output_tokens"]) + ) print( "{:<40} {:<10.2f}".format( - "Request throughput (req/s):", res["request_throughput"] + "Request throughput (req/s):", result["request_throughput"] ) ) print( "{:<40} {:<10.2f}".format( - "Input token throughput (tok/s):", res["input_throughput"] + "Input token throughput (tok/s):", result["input_throughput"] ) ) print( "{:<40} {:<10.2f}".format( - "Output token throughput (tok/s):", res["output_throughput"] + "Output token throughput (tok/s):", result["output_throughput"] ) ) print( "{:<40} {:<10.2f}".format( - "Total token throughput (tok/s):", res["total_throughput"] + "Total token throughput (tok/s):", result["total_throughput"] ) ) print("=" * 50) + + return result + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + ServerArgs.add_cli_args(parser) + BenchArgs.add_cli_args(parser) + args = parser.parse_args() + server_args = ServerArgs.from_cli_args(args) + bench_args = BenchArgs.from_cli_args(args) + + logging.basicConfig( + level=getattr(logging, server_args.log_level.upper()), + format="%(message)s", + ) + + throughput_test(server_args, bench_args) diff --git a/test/srt/test_srt_engine.py b/test/srt/test_srt_engine.py index 6e630d4848d..33232f50b41 100644 --- a/test/srt/test_srt_engine.py +++ b/test/srt/test_srt_engine.py @@ -158,7 +158,7 @@ def test_7_engine_offline_throughput(self): server_args = ServerArgs( model_path=DEFAULT_MODEL_NAME_FOR_TEST, ) - bench_args = BenchArgs(num_prompts=10) + bench_args = BenchArgs(num_prompts=100) result = throughput_test(server_args=server_args, bench_args=bench_args) self.assertTrue(result["total_throughput"] > 3000)
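
For reference, the API this series converges on can also be driven programmatically, mirroring test_7_engine_offline_throughput above. A minimal sketch follows; the model path and prompt count are illustrative choices (the model path is taken from the usage examples in the module docstring), not values required by the patches:

    from sglang.bench_offline_throughput import BenchArgs, throughput_test
    from sglang.srt.server_args import ServerArgs

    # Illustrative model; any model path accepted by ServerArgs should work.
    server_args = ServerArgs(model_path="meta-llama/Meta-Llama-3-8B-Instruct")

    # Defaults: backend="engine", dataset_name="sharegpt" (see BenchArgs).
    bench_args = BenchArgs(num_prompts=100)

    result = throughput_test(server_args=server_args, bench_args=bench_args)

    # Metrics added in PATCH 24 of this series:
    print(result["total_throughput"])   # total token throughput (tok/s)
    print(result["output_throughput"])  # output token throughput (tok/s)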