Adds script as an example of a run of DS-FastGen #810

Merged · 13 commits · Nov 17, 2023
17 changes: 16 additions & 1 deletion benchmarks/inference/mii/README.md
@@ -11,7 +11,7 @@ python server.py [options] start

Use the -h option to view all available options. To stop the server, use this command:

```bash
python server.py stop
```

@@ -30,3 +30,18 @@ The scripts mentioned below were used for generating the plots featured in our blog
- `plot_th_lat.py`: This script generates charts for throughput and latency across different model sizes and client counts.
- `plot_effective_throughput.py`: Use this to chart effective throughput.
- `plot_latency_percentile.py`: This script will plot the 50th, 90th, and 95th percentile latencies.
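
For example, to chart throughput and latency from logs collected in the current directory, an invocation like the following (using the scripts' `--log_dir` and `--out_dir` options) writes charts under `charts/throughput_latency`; pass `--no_vllm` if no vLLM baseline logs were collected:

```bash
python plot_th_lat.py --log_dir . --out_dir charts/throughput_latency
```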

## Running an End-to-End Example

To try the full end-to-end flow of running the benchmark and generating plots, use the `run_example.sh` Bash script. It runs the benchmark with a fixed, small configuration, making it easy to observe performance under the parameters below.

### Configuration Details

- **Model Size:** 7B (`meta-llama/Llama-2-7b-hf`)
- **Tensor Parallelism:** 1
- **Max New Tokens:** 60 and 128
- **Prompt Length:** 1200 and 2600

```bash
bash run_example.sh
```
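
The script starts a server for `meta-llama/Llama-2-7b-hf`, runs `run_benchmark_client.sh` once for each prompt length and max-new-tokens pair listed above, stops the server, and then renders the charts by calling `plot_th_lat.py` and `plot_effective_throughput.py` with the `--test --no_vllm` flags, so only the DeepSpeed-FastGen 7B results are plotted.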
33 changes: 24 additions & 9 deletions benchmarks/inference/mii/plot_effective_throughput.py
@@ -12,9 +12,13 @@
SLA_GEN_TOKENS_PER_SEC = [1, 2, 3, 4, 6, 8]
EMA_SPAN = 16

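# Tensor-parallel degrees to plot for each model size; --test selects the
# smaller tp_sizes_test matrix, which is what run_example.sh exercises.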
tp_sizes_all = {
"7b": [1],
"70b": [4, 8]
}

tp_sizes_test = {
"7b": [1]
}

prompt_gen_pairs = [
@@ -27,6 +31,8 @@

def get_args():
parser = argparse.ArgumentParser()
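# --test restricts plotting to the tp_sizes_test matrix; --no_vllm skips
# reading and plotting vLLM logs so only DeepSpeed-FastGen results appear.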
parser.add_argument("--test", action="store_true")
parser.add_argument("--no_vllm", action="store_true")
parser.add_argument("--log_dir", type=Path, default=".")
parser.add_argument("--out_dir", type=Path, default="charts/goodtput")
args = parser.parse_args()
@@ -96,7 +102,8 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out_dir):
print(f"model: {model_size} Prompt: {prompt}, Generation: {gen}, TP: {tp} sla_token_gen: {sla_token_gen}")

mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
if not args.no_vllm:
    vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"

validate_funcs = [
(validate_token_cum_latency_SLA, (), "cum"),
@@ -109,25 +116,28 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out_dir):
client_num_list = sorted(list(mii_goodputs.keys()))
mii_goodputs_list = [mii_goodputs[client_num] for client_num in client_num_list]

if not args.no_vllm:
    vllm_goodputs, vllm_good_ratios = extract_values(vllm_file_pattern, sla_token_gen, f)
    vllm_goodputs_list = [vllm_goodputs[client_num] for client_num in client_num_list]

# print(f"MII {mii_goodputs_list} ratio={mii_good_ratios}")
# print(f"vLLM {vllm_goodputs_list} ratio={vllm_good_ratios}")

# Plotting the scatter plot
plt.figure(figsize=(7, 4))
plt.scatter(client_num_list, mii_goodputs_list, label=f"DeepSpeed-FastGen", marker="o", color="blue")
if not args.no_vllm:
    plt.scatter(client_num_list, vllm_goodputs_list, label=f"vLLM", marker="x", color="orange")

fit_x_list = np.arange(min(client_num_list), max(client_num_list), 0.1)
mii_fit_model = np.polyfit(client_num_list, mii_goodputs_list, 4)
mii_model_fn = np.poly1d(mii_fit_model)
plt.plot(fit_x_list, mii_model_fn(fit_x_list), color="blue", alpha=0.5, linestyle="--")

if not args.no_vllm:
    vllm_fit_model = np.polyfit(client_num_list, vllm_goodputs_list, 4)
    vllm_model_fn = np.poly1d(vllm_fit_model)
    plt.plot(fit_x_list, vllm_model_fn(fit_x_list), color="orange", alpha=0.5, linestyle="--")

title = f"Effective throughput (SLA prompt: {SLA_PROMPT_TOKENS_PER_SEC} tokens/s, generation: {sla_token_gen} tokens/s)\n" \
+ f'Llama 2 {model_size.upper()} Prompt: {prompt}, Generation: {gen}, TP: {tp}'
@@ -148,6 +158,11 @@ def display_results(model_size, tp, bs, sla_token_gen, prompt, gen, log_dir, out_dir):
if __name__ == "__main__":
    args = get_args()

    if args.test:
        tp_sizes = tp_sizes_test
    else:
        tp_sizes = tp_sizes_all

    for model_size, tps in tp_sizes.items():
        for tp in tps:
            for prompt, gen in prompt_gen_pairs:
38 changes: 26 additions & 12 deletions benchmarks/inference/mii/plot_th_lat.py
@@ -3,12 +3,16 @@
import argparse
from pathlib import Path
import numpy as np

from postprocess_results import read_json, get_summary

bs = 768

tp_sizes_test = {
"7b": [1]
}

tp_sizes_all = {
"7b": [1],
"70b": [4, 8],
}
@@ -22,7 +26,9 @@

def get_args():
parser = argparse.ArgumentParser()
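# --test restricts plotting to the tp_sizes_test matrix; --no_vllm skips
# reading and plotting vLLM logs.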
parser.add_argument("--log_dir", type=Path, default="logs.release")
parser.add_argument("--test", action="store_true")
parser.add_argument("--no_vllm", action="store_true")
parser.add_argument("--log_dir", type=Path, default=".")
parser.add_argument("--out_dir", type=Path, default="charts/throughput_latency")
args = parser.parse_args()
return args
@@ -56,19 +62,22 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):
out_dir.mkdir(parents=True, exist_ok=True)

mii_file_pattern = f"{log_dir}/logs.llama2-{model_size}-tp{tp}-b{bs}/llama2-{model_size}-tp{tp}-b{bs}_c*_p{prompt}_g{gen}.json"
if not args.no_vllm:
    vllm_file_pattern = f"{log_dir}/logs.vllm-llama2-{model_size}-tp{tp}/vllm-llama2-{model_size}-tp{tp}_c*_p{prompt}_g{gen}.json"

_, mii_throughputs, mii_latencies = extract_values(mii_file_pattern)
if not args.no_vllm:
    _, vllm_throughputs, vllm_latencies = extract_values(vllm_file_pattern)

# Plotting the scatter plot
plt.figure(figsize=(6, 4))

if not args.no_vllm:
    plt.scatter(vllm_throughputs, vllm_latencies, label=f"vLLM", marker="x", color="orange")
    fit_vllm_x_list = np.arange(min(vllm_throughputs), max(vllm_throughputs), 0.01)
    vllm_fit_model = np.polyfit(vllm_throughputs, vllm_latencies, 3)
    vllm_model_fn = np.poly1d(vllm_fit_model)
    plt.plot(fit_vllm_x_list, vllm_model_fn(fit_vllm_x_list), color="orange", alpha=0.5, linestyle="--")

plt.scatter(mii_throughputs, mii_latencies, label=f"DeepSpeed FastGen", marker="o", color="blue")
fit_mii_x_list = np.arange(min(mii_throughputs), max(mii_throughputs), 0.01)
@@ -90,7 +99,12 @@ def output_charts(model_size, tp, bs, prompt, gen, log_dir, out_dir):

if __name__ == "__main__":
    args = get_args()

    if args.test:
        tp_sizes = tp_sizes_test
    else:
        tp_sizes = tp_sizes_all

    for model_size, tps in tp_sizes.items():
        for tp in tps:
            for prompt, gen in prompt_gen_pairs:
23 changes: 2 additions & 21 deletions benchmarks/inference/mii/run_benchmark_client.py
@@ -80,35 +80,16 @@ def callback(response):
token_gen_time.append(time_now - time_last_token)
time_last_token = time_now


time_last_token = start_time = time.time()
token_gen_time = []
if stream:
    output_tokens = []
    client.generate(
        input_tokens, max_new_tokens=max_new_tokens,
        streaming_fn=callback)
else:
    result = client.generate(
        input_tokens, max_new_tokens=max_new_tokens)
    output_tokens = result.response[0]

return ResponseDetails(
21 changes: 21 additions & 0 deletions benchmarks/inference/mii/run_example.sh
@@ -0,0 +1,21 @@
### Run the server

RAGGED_BATCH_SIZE=768
PARAM_SIZES=(7b)
DEPLOYMENT_NAME=llama2-7b-tp1-b768
python server.py --model_name meta-llama/Llama-2-7b-hf -d llama2-7b-tp1-b768 -m 1 -b 768 start

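### Run the client benchmark for each prompt length / max-new-tokens pair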
DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=2600 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh
DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=60 bash ./run_benchmark_client.sh
DEPLOYMENT_NAME=${DEPLOYMENT_NAME} PROMPT_LENGTH=1200 MAX_NEW_TOKENS=128 bash ./run_benchmark_client.sh

echo "Stopping server"
python server.py -d ${DEPLOYMENT_NAME} stop
sleep 120  # allow time for the server to fully shut down

### Plots
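# --test and --no_vllm match the data generated above: 7B / TP=1 logs with no vLLM baseline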
python plot_th_lat.py --log_dir . --test --no_vllm
python plot_effective_throughput.py --log_dir . --test --no_vllm

echo "Find the plots in the charts directory and the logs inside logs.llama2-7b-tp1-b768"