diff --git a/benchmarks/inference/mii/A6000_benchmarks_example.PNG b/benchmarks/inference/mii/A6000_benchmarks_example.PNG
new file mode 100644
index 000000000..853e92378
Binary files /dev/null and b/benchmarks/inference/mii/A6000_benchmarks_example.PNG differ
diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md
index f9a825daa..d9e475cdb 100644
--- a/benchmarks/inference/mii/README.md
+++ b/benchmarks/inference/mii/README.md
@@ -11,7 +11,7 @@ python server.py [options] start
 
 Use the -h option to view all available options. To stop the server, use this command:
 
-```bash 
+```bash
 python server.py stop
 ```
 
@@ -30,3 +30,17 @@ The scripts mentioned below were used for generating the plots featured in our b
 - `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts.
 - `plot_effective_throughput.py`: Use this to chart effective throughput.
 - `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies.
+
+## Running an End-to-End Example
+
+To quickly try the end-to-end process of running the benchmark and generating results, use the `run_example.sh` script. It runs the benchmark with a specific configuration and writes the plots shown below to the `charts` directory. These plots correspond to the performance shown in Figure 8 of our blog [post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms).
+
+```bash
+bash run_example.sh
+```
+
+<div align="center">
+  <img src="A6000_benchmarks_example.PNG" alt="" />
+
+  *Figure 1: Throughput-latency curve and effective throughput of Llama 2 7b using A6000. The client runs with 60 generation steps and an input prompt length of 2600.*
+</div>
\ No newline at end of file
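The end-to-end example above leaves its raw results and charts in the default locations used by the scripts in this PR: per-client JSON results under `logs.llama2-7b-tp1-b768`, throughput-latency charts under `charts/throughput_latency`, and effective-throughput charts under `charts/goodtput`. The small sanity check below is only a sketch, not part of the benchmark; it assumes you run it from `benchmarks/inference/mii` after `run_example.sh` has finished.

```python
from pathlib import Path

# Default output locations used by run_example.sh and the plotting scripts in this PR.
expected_dirs = [
    Path("logs.llama2-7b-tp1-b768"),    # per-client-count benchmark JSON results
    Path("charts/throughput_latency"),  # plot_th_lat.py output
    Path("charts/goodtput"),            # plot_effective_throughput.py output
]

for d in expected_dirs:
    if not d.is_dir():
        print(f"missing: {d} (did run_example.sh finish without errors?)")
        continue
    files = sorted(p.name for p in d.iterdir())
    example = f", e.g. {files[0]}" if files else ""
    print(f"{d}: {len(files)} file(s){example}")
```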
\ No newline at end of file diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/plot_effective_throughput.py index 357fc7f9e..350c269c3 100644 --- a/benchmarks/inference/mii/plot_effective_throughput.py +++ b/benchmarks/inference/mii/plot_effective_throughput.py @@ -12,21 +12,30 @@ SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8] EMA_SPAN = 16 -tp_sizes = { +tp_sizes_all = { "7b": [1], - "70b": [4, 8], + "70b": [4, 8] } -prompt_gen_pairs = [ +tp_sizes_test = { + "7b": [1] +} + +prompt_gen_pairs_all = [ (1200, 60), (1200, 128), (2600, 60), (2600, 128), ] +prompt_gen_pairs_test = [ + (2600, 60) +] def get_args(): parser = argparse.ArgumentParser() + parser.add_argument("--test", action="store_true") + parser.add_argument("--no_vllm", action="store_true") parser.add_argument("--log_dir", type=Path, default=".") parser.add_argument("--out_dir", type=Path, default="charts/goodtput") args = parser.parse_args() @@ -96,7 +105,8 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}") mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json" - vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" + if not args.no_vllm: + vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json" validate_funcs = [ (validate_token_cum_latency_SLA, (), "cum"), @@ -109,8 +119,9 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out client_num_list = sorted(list(mii_goodputs.keys())) mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list] - vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) - vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] + if not args.no_vllm: + vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f) + vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list] # print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}") # print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}") @@ -118,16 +129,18 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out # Plotting the scatter plot plt.figure(figsize=(7, 4)) plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue") - plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") + if not args.no_vllm: + plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange") fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1) mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4) mii_model_fn = np.poly1d(mii_fit_model) plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--") - vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) - vllm_model_fn = np.poly1d(vllm_fit_model) - plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") + if not args.no_vllm: + vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4) + vllm_model_fn = np.poly1d(vllm_fit_model) + plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--") title = f"Effective 
diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py
index 8ede6e818..e99dc5a3e 100644
--- a/benchmarks/inference/mii/plot_th_lat.py
+++ b/benchmarks/inference/mii/plot_th_lat.py
@@ -3,17 +3,25 @@
 import argparse
 from pathlib import Path
 import numpy as np
 
 from postprocess_results import read_json, get_summary
 
 bs = 768
 
-tp_sizes = {
+tp_sizes_test = {
+    "7b": [1]
+}
+
+tp_sizes_all = {
     "7b": [1],
     "70b": [4, 8],
 }
 
-prompt_gen_pairs = [
+prompt_gen_pairs_test = [
+    (2600, 60)
+]
+
+prompt_gen_pairs_all = [
     (1200, 60),
     (1200, 128),
     (2600, 60),
@@ -22,7 +30,9 @@
 
 def get_args():
     parser = argparse.ArgumentParser()
-    parser.add_argument("--log_dir", type=Path, default="logs.release")
+    parser.add_argument("--test", action="store_true")
+    parser.add_argument("--no_vllm", action="store_true")
+    parser.add_argument("--log_dir", type=Path, default=".")
     parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency")
     args = parser.parse_args()
     return args
@@ -56,19 +66,22 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
     out_dir.mkdir(parents=True, exist_ok=True)
 
     mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
-    vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
+    if not args.no_vllm:
+        vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
 
     _, mii_throughputs, mii_latencies = extract_values(mii_file_pattern)
-    _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)
+    if not args.no_vllm:
+        _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)
 
     # Plotting the scatter plot
     plt.figure(figsize=(6, 4))
-
-    plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
-    fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
-    vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
-    vllm_model_fn = np.poly1d(vllm_vllm_model)
-    plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")
+
+    if not args.no_vllm:
+        plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
+        fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
+        vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
+        vllm_model_fn = np.poly1d(vllm_vllm_model)
+        plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")
 
     plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue")
     fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01)
@@ -82,7 +95,6 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
     plt.legend()
     plt.grid(True)
     plt.tight_layout()
-    # plt.show()
     out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png"
     print(f"Saving {out_file}")
     plt.savefig(out_file)
@@ -90,7 +102,13 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
 
 if __name__ == "__main__":
     args = get_args()
-
+    if args.test:
+        tp_sizes = tp_sizes_test
+        prompt_gen_pairs = prompt_gen_pairs_test
+    else:
+        tp_sizes = tp_sizes_all
+        prompt_gen_pairs = prompt_gen_pairs_all
+
     for model_size, tps in tp_sizes.items():
         for tp in tps:
             for prompt, gen in prompt_gen_pairs:
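The dashed trend lines in both charts are produced the same way: a low-degree polynomial is fit to the measured points with `np.polyfit` (degree 3 for latency vs. throughput, degree 4 for goodput vs. client count) and evaluated on a dense grid with `np.poly1d`. Below is a minimal, self-contained sketch of that step, using made-up numbers in place of benchmark measurements.

```python
import numpy as np
import matplotlib.pyplot as plt

# Made-up throughput (tokens/s) and latency (s) points standing in for benchmark results.
throughputs = np.array([200.0, 400.0, 800.0, 1200.0, 1500.0])
latencies = np.array([1.1, 1.3, 1.9, 3.0, 4.6])

plt.figure(figsize=(6, 4))
plt.scatter(throughputs, latencies, label="measured", marker="o", color="blue")

# Same pattern as the plotting scripts: fit a cubic, then draw it on a dense x grid.
fit_x = np.arange(throughputs.min(), throughputs.max(), 0.01)
coeffs = np.polyfit(throughputs, latencies, 3)
trend = np.poly1d(coeffs)
plt.plot(fit_x, trend(fit_x), color="blue", alpha=0.5, linestyle="--", label="cubic fit")

plt.xlabel("Throughput (tokens/s)")
plt.ylabel("Latency (s)")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.savefig("trend_example.png")
```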
diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/run_benchmark_client.py
index 77377a93a..52fc0da50 100644
--- a/benchmarks/inference/mii/run_benchmark_client.py
+++ b/benchmarks/inference/mii/run_benchmark_client.py
@@ -80,35 +80,16 @@ def callback(response):
         token_gen_time.append(time_now - time_last_token)
         time_last_token = time_now
 
-    postprocess_config = {
-        "logit_processor": {
-            # "name": "TopP",
-            # "args": {
-            #     "top_p": 0.9
-            # }
-            "name": "Temperature",
-            "args": {
-                "temperature": 0.9
-            }
-        },
-        "sampler": {
-            "name": "Logits"
-        },
-        "stop_criterion": {
-            "name": "EosGeneration"
-        }
-    }
-
     time_last_token = start_time = time.time()
     token_gen_time = []
     if stream:
         output_tokens = []
         client.generate(
-            input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config,
+            input_tokens, max_new_tokens=max_new_tokens,
             streaming_fn=callback)
     else:
         result = client.generate(
-            input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config)
+            input_tokens, max_new_tokens=max_new_tokens)
         output_tokens = result.response[0]
 
     return ResponseDetails(
diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh
new file mode 100644
index 000000000..ece8393ed
--- /dev/null
+++ b/benchmarks/inference/mii/run_example.sh
@@ -0,0 +1,19 @@
+### Run the server
+RAGGED_BATCH_SIZE=768
+PARAM_SIZES=(7b)
+DEPLOYMENT_NAME=llama2-7b-tp1-b768
+python server.py --model_name meta-llama/Llama-2-7b-hf -d ${DEPLOYMENT_NAME} -m 1 -b ${RAGGED_BATCH_SIZE} start
+
+### This command will run the client with 60 generation steps and input prompt length of 2600
+DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
+
+### Stop the server
+echo "Stopping server"
+python server.py -d ${DEPLOYMENT_NAME} stop
+sleep 120
+
+### Generate the plots
+python plot_th_lat.py --log_dir . --test --no_vllm
+python plot_effective_throughput.py --log_dir . --test --no_vllm
+
+echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768"
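For context on the client change above: `run_benchmark_client.py` times streamed tokens in a callback, and the diff only drops the hard-coded `postprocess_config` from the two `client.generate(...)` calls; the timing logic is untouched. The sketch below reduces that timing idea to a self-contained example, with a fake token stream standing in for the MII client and its `streaming_fn` callback.

```python
import time


def fake_token_stream(n_tokens=8, delay_s=0.05):
    # Stand-in for a streaming generate() call: each yield is one newly generated token.
    for i in range(n_tokens):
        time.sleep(delay_s)
        yield f"tok{i}"


token_gen_time = []
time_last_token = start_time = time.time()

for token in fake_token_stream():
    time_now = time.time()
    # The first interval approximates time-to-first-token; later ones are per-token latency.
    token_gen_time.append(time_now - time_last_token)
    time_last_token = time_now

total = time.time() - start_time
per_token = sum(token_gen_time[1:]) / max(len(token_gen_time) - 1, 1)
print(f"time to first token: {token_gen_time[0]:.3f}s")
print(f"mean per-token latency after the first: {per_token:.3f}s")
print(f"end-to-end time: {total:.3f}s")
```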