diff --git a/benchmarks/inference/mii/A6000_benchmarks_example.PNG b/benchmarks/inference/mii/A6000_benchmarks_example.PNG
new file mode 100644
index 000000000..853e92378
Binary files /dev/null and b/benchmarks/inference/mii/A6000_benchmarks_example.PNG differ
diff --git a/benchmarks/inference/mii/README.md b/benchmarks/inference/mii/README.md
index f9a825daa..d9e475cdb 100644
--- a/benchmarks/inference/mii/README.md
+++ b/benchmarks/inference/mii/README.md
@@ -11,7 +11,7 @@ python server.py [options] start
Use the -h option to view all available options. To stop the server, use this command:
-```bash
+```bash
python server.py stop
```
@@ -30,3 +30,17 @@ The scripts mentioned below were used for generating the plots featured in our b
- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts.
- `plot_effective_throughput.py`: Use this to chart effective throughput.
- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies.
+
+## Running an End-to-End Example
+
+To quickly run the benchmark end to end and see example results, use `run_example.sh`. This script runs the benchmark with a single fixed configuration and generates the plots shown below in the `charts` directory. These plots correspond to the performance shown in Figure 8 of our blog [post](https://github.com/microsoft/DeepSpeed/tree/master/blogs/deepspeed-fastgen#f-other-hardware-platforms).
+
+```bash
+bash run_example.sh
+```
+
+![A6000 benchmark results](A6000_benchmarks_example.PNG)
+
+*Figure 1: Throughput-latency curve and effective throughput of Llama 2 7b on an A6000. The client is run with 60 generation steps and an input prompt length of 2600.*
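+
+If you only want to regenerate the charts from existing logs, you can call the plotting scripts directly. `run_example.sh` invokes them with the `--test` and `--no_vllm` flags, which restrict plotting to this single 7b/TP1 configuration and skip the vLLM comparison:
+
+```bash
+python plot_th_lat.py --log_dir . --test --no_vllm
+python plot_effective_throughput.py --log_dir . --test --no_vllm
+```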
+
\ No newline at end of file
diff --git a/benchmarks/inference/mii/plot_effective_throughput.py b/benchmarks/inference/mii/plot_effective_throughput.py
index 357fc7f9e..350c269c3 100644
--- a/benchmarks/inference/mii/plot_effective_throughput.py
+++ b/benchmarks/inference/mii/plot_effective_throughput.py
@@ -12,21 +12,30 @@
SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8]
EMA_SPAN = 16
-tp_sizes = {
+tp_sizes_all = {
"7b": [1],
- "70b": [4, 8],
+ "70b": [4, 8]
}
-prompt_gen_pairs = [
+tp_sizes_test = {
+ "7b": [1]
+}
+
+prompt_gen_pairs_all = [
(1200, 60),
(1200, 128),
(2600, 60),
(2600, 128),
]
+prompt_gen_pairs_test = [
+ (2600, 60)
+]
def get_args():
parser = argparse.ArgumentParser()
+    parser.add_argument("--test", action="store_true", help="Plot only the reduced test configuration (7b, TP=1, single prompt/generation pair)")
+    parser.add_argument("--no_vllm", action="store_true", help="Skip plotting vLLM results")
parser.add_argument("--log_dir", type=Path, default=".")
parser.add_argument("--out_dir", type=Path, default="charts/goodtput")
args = parser.parse_args()
@@ -96,7 +105,8 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}")
mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
- vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
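+    # With --no_vllm, only DeepSpeed-FastGen (MII) logs are read and plotted.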
+ if not args.no_vllm:
+ vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
validate_funcs = [
(validate_token_cum_latency_SLA, (), "cum"),
@@ -109,8 +119,9 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
client_num_list = sorted(list(mii_goodputs.keys()))
mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list]
- vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f)
- vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list]
+ if not args.no_vllm:
+ vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f)
+ vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list]
# print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}")
# print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}")
@@ -118,16 +129,18 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
# Plotting the scatter plot
plt.figure(figsize=(7, 4))
plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue")
- plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange")
+ if not args.no_vllm:
+ plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange")
fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1)
mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4)
mii_model_fn = np.poly1d(mii_fit_model)
plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--")
- vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4)
- vllm_model_fn = np.poly1d(vllm_fit_model)
- plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--")
+ if not args.no_vllm:
+ vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4)
+ vllm_model_fn = np.poly1d(vllm_fit_model)
+ plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--")
title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \
+ f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}'
@@ -148,6 +161,13 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out
if __name__ == "__main__":
args = get_args()
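+    # --test restricts the sweep to the 7b / TP=1 configuration with a single prompt/generation pair.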
+ if args.test:
+ tp_sizes = tp_sizes_test
+ prompt_gen_pairs = prompt_gen_pairs_test
+ else:
+ tp_sizes = tp_sizes_all
+ prompt_gen_pairs = prompt_gen_pairs_all
+
for model_size, tps in tp_sizes.items():
for tp in tps:
for prompt, gen in prompt_gen_pairs:
diff --git a/benchmarks/inference/mii/plot_th_lat.py b/benchmarks/inference/mii/plot_th_lat.py
index 8ede6e818..e99dc5a3e 100644
--- a/benchmarks/inference/mii/plot_th_lat.py
+++ b/benchmarks/inference/mii/plot_th_lat.py
@@ -3,17 +3,25 @@
import argparse
from pathlib import Path
import numpy as np
-
from postprocess_results import read_json, get_summary
bs = 768
-tp_sizes = {
+tp_sizes_test = {
+ "7b": [1]
+}
+
+tp_sizes_all = {
"7b": [1],
"70b": [4, 8],
}
-prompt_gen_pairs = [
+prompt_gen_pairs_test = [
+ (2600, 60)
+]
+
+prompt_gen_pairs_all = [
(1200, 60),
(1200, 128),
(2600, 60),
@@ -22,7 +30,9 @@
def get_args():
parser = argparse.ArgumentParser()
- parser.add_argument("--log_dir", type=Path, default="logs.release")
+    parser.add_argument("--test", action="store_true", help="Plot only the reduced test configuration (7b, TP=1, single prompt/generation pair)")
+    parser.add_argument("--no_vllm", action="store_true", help="Skip plotting vLLM results")
+ parser.add_argument("--log_dir", type=Path, default=".")
parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency")
args = parser.parse_args()
return args
@@ -56,19 +66,22 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
out_dir.mkdir(parents=True, exist_ok=True)
mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
- vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
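+    # With --no_vllm, only DeepSpeed-FastGen (MII) logs are read and plotted.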
+ if not args.no_vllm:
+ vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"
_, mii_throughputs, mii_latencies = extract_values(mii_file_pattern)
- _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)
+ if not args.no_vllm:
+ _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)
# Plotting the scatter plot
plt.figure(figsize=(6, 4))
-
- plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
- fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
- vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
- vllm_model_fn = np.poly1d(vllm_vllm_model)
- plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")
+
+ if not args.no_vllm:
+ plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
+ fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
+ vllm_vllm_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
+ vllm_model_fn = np.poly1d(vllm_vllm_model)
+ plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")
plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue")
fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01)
@@ -82,7 +95,6 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
plt.legend()
plt.grid(True)
plt.tight_layout()
- # plt.show()
out_file = out_dir / f"th_lat_curve_llama{model_size}_tp{tp}_p{prompt}g{gen}.png"
print(f"Saving {out_file}")
plt.savefig(out_file)
@@ -90,7 +102,13 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
if __name__ == "__main__":
args = get_args()
-
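+    # --test restricts the sweep to the 7b / TP=1 configuration with a single prompt/generation pair.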
+ if args.test:
+ tp_sizes = tp_sizes_test
+ prompt_gen_pairs = prompt_gen_pairs_test
+ else:
+ tp_sizes = tp_sizes_all
+        prompt_gen_pairs = prompt_gen_pairs_all
+
for model_size, tps in tp_sizes.items():
for tp in tps:
for prompt, gen in prompt_gen_pairs:
diff --git a/benchmarks/inference/mii/run_benchmark_client.py b/benchmarks/inference/mii/run_benchmark_client.py
index 77377a93a..52fc0da50 100644
--- a/benchmarks/inference/mii/run_benchmark_client.py
+++ b/benchmarks/inference/mii/run_benchmark_client.py
@@ -80,35 +80,16 @@ def callback(response):
token_gen_time.append(time_now - time_last_token)
time_last_token = time_now
- postprocess_config = {
- "logit_processor": {
- # "name": "TopP",
- # "args": {
- # "top_p": 0.9
- # }
- "name": "Temperature",
- "args": {
- "temperature": 0.9
- }
- },
- "sampler": {
- "name": "Logits"
- },
- "stop_criterion": {
- "name": "EosGeneration"
- }
- }
-
time_last_token = start_time = time.time()
token_gen_time = []
if stream:
output_tokens = []
client.generate(
- input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config,
+ input_tokens, max_new_tokens=max_new_tokens,
streaming_fn=callback)
else:
result = client.generate(
- input_tokens, max_new_tokens=max_new_tokens, postprocess_config=postprocess_config)
+ input_tokens, max_new_tokens=max_new_tokens)
output_tokens = result.response[0]
return ResponseDetails(
diff --git a/benchmarks/inference/mii/run_example.sh b/benchmarks/inference/mii/run_example.sh
new file mode 100644
index 000000000..ece8393ed
--- /dev/null
+++ b/benchmarks/inference/mii/run_example.sh
@@ -0,0 +1,19 @@
+### Run the server
+RAGGED_BATCH_SIZE=768
+PARAM_SIZES=(7b)
+DEPLOYMENT_NAME=llama2-7b-tp1-b768
+python server.py --model_name meta-llama/Llama-2-7b-hf -d ${DEPLOYMENT_NAME} -m 1 -b ${RAGGED_BATCH_SIZE} start
+
+### This command runs the client with 60 generation steps and an input prompt length of 2600
+DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
+
+### Stop the server
+echo "Stopping server"
+python server.py -d ${DEPLOYMENT_NAME} stop
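+# Allow time for the server to shut down cleanly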
+sleep 120
+
+### Generate the plots
+python plot_th_lat.py --log_dir . --test --no_vllm
+python plot_effective_throughput.py --log_dir . --test --no_vllm
+
+echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768"