From f813e2eaefb02ba9517f6d02435e50a83b7e51fd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 9 Jun 2024 00:58:19 -0700 Subject: [PATCH 001/150] Kuntai: add tgi and trt benchmarking script (initial version) --- .../nightly-benchmarks/run-tgi-benchmarks.sh | 72 +++++++++++++ .../nightly-benchmarks/run-trt-benchmarks.sh | 102 ++++++++++++++++++ 2 files changed, 174 insertions(+) create mode 100644 .buildkite/nightly-benchmarks/run-tgi-benchmarks.sh create mode 100644 .buildkite/nightly-benchmarks/run-trt-benchmarks.sh diff --git a/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh b/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh new file mode 100644 index 0000000000000..27f0fe57f3716 --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh @@ -0,0 +1,72 @@ +#!/bin/bash + +# This script should be run inside the tgi container. Enter the latest tgi container by +# docker run -it --gpus all -e "HF_TOKEN=" --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0 +# (please modify `` to your own huggingface token in the above command +# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash. +# Benchmarking results will be inside /vllm/benchmarks/*.txt +# NOTE: this script gradually reduces the request rate from 20, to ensure all requests are successful. + +set -ex +set -o pipefail + +# install conda +(which wget && which curl) || (apt-get update && apt-get install -y wget curl) +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3 +~/miniconda3/bin/conda init bash +eval "$(cat ~/.bashrc | tail -n +15)" + +# create conda environment for vllm +conda create -n vllm python=3.9 -y +eval "$(conda shell.bash hook)" +conda activate vllm +pip install vllm + +# clone vllm repo +cd / +git clone https://github.com/vllm-project/vllm.git +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +# launch TGI server +/tgi-entrypoint.sh --port 8000 --model-id meta-llama/Llama-2-7b-chat-hf & +tgi_pid=$! +timeout 600 bash -c 'until curl localhost:8000/generate_stream; do sleep 1; done' || exit 1 + +# gradually reduce the request rate from 20, untill all request successed +request_rate=20 +get_successful_requests() { + grep "Successful requests:" benchmark_serving.txt | awk '{print $3}' +} +while true; do + echo "Running benchmark with request rate $request_rate..." + python3 vllm/benchmarks/benchmark_serving.py --backend tgi --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --endpoint /generate_stream --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt + bench_serving_exit_code=$? + successful_requests=$(get_successful_requests) + echo "Successful requests: $successful_requests" + if [ "$successful_requests" -eq 1000 ]; then + echo "Reached 1000 successful requests with request rate $request_rate" + break + fi + request_rate=$((request_rate - 1)) + if [ "$request_rate" -lt 1 ]; then + echo "Request rate went below 1. Exiting." + break + fi +done +kill $tgi_pid + +echo "### TGI Serving Benchmarks" >>benchmark_results.md +sed -n '1p' benchmark_serving.txt >>benchmark_results.md +echo "" >>benchmark_results.md +echo '```' >>benchmark_results.md +tail -n 17 benchmark_serving.txt >>benchmark_results.md +echo '```' >>benchmark_results.md + +# if the agent binary is not found, skip uploading the results, exit 0 +if [ ! -f /workspace/buildkite-agent ]; then + exit 0 +fi + +# upload the results to buildkite +/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 --runtime=nvidia --gpus all --entrypoint /bin/bash nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 +# (please modify `` to your own huggingface token in the above command +# Then, copy-paste this file into the docker and execute it using bash. + +set -xe +TRT_LLM_VERSION=r24.04 +model_path=meta-llama/llama-2-7b-chat-hf +model_name=llama-2-7b-chat-hf +model_type=llama +model_dtype=float16 +model_tp_size=1 +max_batch_size=233 +max_input_len=15000 +max_output_len=15000 +cd ~ +mkdir models +cd models +models_dir=`pwd` +trt_model_path=${models_dir}/${model_name}-trt-ckpt +trt_engine_path=${models_dir}/${model_name}-trt-engine + + + +cd ~ +git clone https://github.com/neuralmagic/tensorrt-demo.git +cd tensorrt-demo +tensorrt_demo_dir=`pwd` + +# make sure the parameter inside tensorrt_demo is consistent to envvar +sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt +sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt + + +cd / +git clone https://github.com/triton-inference-server/tensorrtllm_backend.git +git lfs install +cd tensorrtllm_backend +git checkout $TRT_LLM_VERSION +tensorrtllm_backend_dir=`pwd` + +git submodule update --init --recursive +cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ + +cd /tensorrtllm_backend +cd ./tensorrt_llm/examples/${model_type} + +python3 convert_checkpoint.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + --tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} + +trtllm-build \ + --checkpoint_dir=${trt_model_path} \ + --gpt_attention_plugin=${model_dtype} \ + --gemm_plugin=${model_dtype} \ + --remove_input_padding=enable \ + --paged_kv_cache=enable \ + --tp_size=${model_tp_size} \ + --max_batch_size=${max_batch_size} \ + --max_input_len=${max_input_len} \ + --max_output_len=${max_output_len} \ + --max_num_tokens=${max_output_len} \ + --opt_num_tokens=${max_output_len} \ + --output_dir=${trt_engine_path} + +cd /tensorrtllm_backend/triton_model_repo +cp -r ${trt_engine_path}/* ./tensorrt_llm/1 +cd /tensorrtllm_backend +python3 scripts/launch_triton_server.py --world_size=${model_tp_size} --model_repo=/tensorrtllm_backend/triton_model_repo & + + +# sleep for 20 seconds, to make sure the server is launched +sleep 30 + + +# install vllm inside conda, for benchmarking. +(which wget && which curl) || (apt-get update && apt-get install -y wget curl) +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3 +~/miniconda3/bin/conda init bash +eval "$(cat ~/.bashrc | tail -n +15)" +conda create -n vllm python=3.9 -y +eval "$(conda shell.bash hook)" +conda activate vllm +pip install vllm + +# clone vllm's benchmark_serving script +cd ~ +git clone https://github.com/vllm-project/vllm.git +cd vllm/benchmarks/ + +export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json +python benchmark_serving.py --backend tensorrt-llm --endpoint /v2/models/ensemble/generate_stream --port 8000 --model $model_path --save-result --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 2>&1 | tee benchmark_serving.txt From d6cba4653b414192779861c717ddd565e51338e0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 13 Jun 2024 23:13:16 -0700 Subject: [PATCH 002/150] update initial benchmarking script for lmdeploy --- .../run-lmdeploy-benchmarks.sh | 42 +++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 .buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh diff --git a/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh b/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh new file mode 100644 index 0000000000000..c1a579eaeefc4 --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh @@ -0,0 +1,42 @@ +#!/bin/bash + +# This script should be run inside the tgi container. Enter the latest tgi container by +# docker run --gpus all -e "HF_TOKEN=" -v ~/.cache/huggingface:/root/.cache/huggingface --entrypoint /bin/bash openmmlab/lmdeploy:latest +# lmdeploy serve api_server internlm/internlm2-chat-7b +# docker run -it --gpus all -e "HF_TOKEN=" --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0 +# (please modify `` to your own huggingface token in the above command +# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash. + + + +set -ex +set -o pipefail + +# install conda +(which wget && which curl) || (apt-get update && apt-get install -y wget curl) +wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh +bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3 +~/miniconda3/bin/conda init bash +eval "$(cat ~/.bashrc | tail -n +15)" + +# create conda environment for vllm +conda create -n vllm python=3.9 -y +eval "$(conda shell.bash hook)" +conda activate vllm +pip install vllm + +# clone vllm repo +cd / +git clone https://github.com/vllm-project/vllm.git +wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + +# launch TGI server +lmdeploy serve api_server meta-llama/Llama-2-7b-hf --server-port 8000 & +tgi_pid=$! +timeout 600 bash -c 'until curl localhost:8000/v1/completion; do sleep 1; done' || exit 1 + +# gradually reduce the request rate from 20, untill all request successed +request_rate=20 +echo "Running benchmark with request rate $request_rate..." +python3 vllm/benchmarks/benchmark_serving.py --backend lmdeploy --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200 --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt +kill $tgi_pid From 5d8292bf5a12ff86ac73cea8422b5ce4b292ce31 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 21:09:21 -0700 Subject: [PATCH 003/150] Add download tokenizer script for lmdeploy --- .../scripts/download-tokenizer.py | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) create mode 100644 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py new file mode 100644 index 0000000000000..140233e5dad91 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -0,0 +1,18 @@ + +import argparse +from transformers import AutoTokenizer + +def main(model, cachedir): + # Load the tokenizer and save it to the specified directory + tokenizer = AutoTokenizer.from_pretrained(model) + tokenizer.save_pretrained(cachedir) + print(f"Tokenizer saved to {cachedir}") + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Download and save Hugging Face tokenizer") + parser.add_argument("--model", type=str, required=True, help="Name of the model") + parser.add_argument("--cachedir", type=str, required=True, help="Directory to save the tokenizer") + + args = parser.parse_args() + main(args.model, args.cachedir) + \ No newline at end of file From a2dd7c9f3f86c82a0eb392bfb498fff80feb84fa Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 23:45:22 -0700 Subject: [PATCH 004/150] add one-click runnable script for lmdeploy, parse tests from json file --- .../nightly-benchmarks/run-nightly-suite.sh | 53 +++++ .../scripts/download-tokenizer.py | 1 + .../scripts/run-lmdeploy-nightly.sh | 192 ++++++++++++++++++ 3 files changed, 246 insertions(+) create mode 100644 .buildkite/nightly-benchmarks/run-nightly-suite.sh create mode 100644 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh new file mode 100644 index 0000000000000..04cdd0d8322b7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +set -ex +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +check_hf_token() { + # check if HF_TOKEN is available and valid + if [[ -z "$HF_TOKEN" ]]; then + echo "Error: HF_TOKEN is not set." + exit 1 + elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then + echo "Error: HF_TOKEN does not start with 'hf_'." + exit 1 + else + echo "HF_TOKEN is set and valid." + fi +} + +main() { + + check_gpus + check_hf_token + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + cd / + git clone https://github.com/KuntaiDu/vllm.git + cd vllm + git checkout kuntai-benchmark-dev + cd benchmarks + wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + + # run lmdeploy + if which lmdeploy >/dev/null; then + echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh + fi + +} + +main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index 140233e5dad91..add331bfbd9f3 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,5 +1,6 @@ import argparse +from pathlib import Path from transformers import AutoTokenizer def main(model, cachedir): diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh new file mode 100644 index 0000000000000..495d3adf2ae14 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -0,0 +1,192 @@ +#!/bin/bash + +set -ex +set -o pipefail + + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + + + +kill_gpu_processes() { + # kill all processes on GPU. + pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) + if [ -z "$pids" ]; then + echo "No GPU processes found." + else + for pid in $pids; do + kill -9 "$pid" + echo "Killed process with PID: $pid" + done + + echo "All GPU processes have been killed." + fi + + # waiting for GPU processes to be fully killed + sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + + + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + + + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + # append lmdeploy to the test name + test_name=lmdeploy_$test_name + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') + client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') + model=$(echo "$server_params" | jq -r '.model') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + continue + fi + + # prepare tokenizer + server_model=$(echo "$server_params" | jq -r '.model') + rm /tokenizer_cache/* + python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$server_model" \ + --cachedir /tokenizer_cache + + server_command="lmdeploy server api_server $model $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend lmdeploy \ + --tokenizer /tokenizer_cache \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "lmdeploy" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main () { + + # create tokenizer directory + mkdir /tokenizer_cache + # enter vllm directory + cd /vllm/benchmarks + + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + +} + +main "$@" \ No newline at end of file From 8416ce6a15773a1bce5ecd85feea4540d2d9d5eb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 23:50:24 -0700 Subject: [PATCH 005/150] add nightly test json file --- .../tests/nightly-tests.json | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) create mode 100644 .buildkite/nightly-benchmarks/tests/nightly-tests.json diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json new file mode 100644 index 0000000000000..b4290dc42dce3 --- /dev/null +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -0,0 +1,25 @@ +[ + { + "test_name": "llama8B_tp1_sharegpt", + "qps_list": [1, 16], + "model": "meta-llama/Llama-8B-hf", + "lmdeploy_server_parameters": { + "tp": 1, + "server_port": 8000 + }, + "client_parameters": { + "model": "meta-llama/Llama-2-7b-hf", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "port": 8000 + }, + "lmdeploy_client_parameters": { + "model": "llama2", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "port": 8000 + } + } +] \ No newline at end of file From df4ba8f8d3476c9136d6b64fd2299692c8fe1158 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 23:53:27 -0700 Subject: [PATCH 006/150] bug fix on tokenizer directory --- .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 495d3adf2ae14..b957f506431f8 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -106,7 +106,8 @@ run_serving_tests() { # prepare tokenizer server_model=$(echo "$server_params" | jq -r '.model') - rm /tokenizer_cache/* + rm -rf /tokenizer_cache + mkdir /tokenizer_cache python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ --model "$server_model" \ --cachedir /tokenizer_cache @@ -176,8 +177,6 @@ run_serving_tests() { main () { - # create tokenizer directory - mkdir /tokenizer_cache # enter vllm directory cd /vllm/benchmarks From b974495a0839189502ec6b1c0245e711ed800e50 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 23:55:27 -0700 Subject: [PATCH 007/150] bug fix on getting model --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index b957f506431f8..7580ecc68e9a9 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -90,7 +90,7 @@ run_serving_tests() { # get client and server arguments server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') - model=$(echo "$server_params" | jq -r '.model') + model=$(echo "$params" | jq -r '.model') server_args=$(json2args "$server_params") client_args=$(json2args "$client_params") qps_list=$(echo "$params" | jq -r '.qps_list') From 80d1c77fd110ab1e958707c59d9de0cb3b9f6e11 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 23:57:12 -0700 Subject: [PATCH 008/150] update test cases --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index b4290dc42dce3..d854e59342119 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -2,7 +2,7 @@ { "test_name": "llama8B_tp1_sharegpt", "qps_list": [1, 16], - "model": "meta-llama/Llama-8B-hf", + "model": "meta-llama/Llama-2-7b-hf", "lmdeploy_server_parameters": { "tp": 1, "server_port": 8000 From b3f3b0e3ec5112bf00730f895ba423391bdd0ed0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 18 Jun 2024 23:58:59 -0700 Subject: [PATCH 009/150] update parameter name --- .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 5 ++--- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 7580ecc68e9a9..898c78ec86a20 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -90,7 +90,7 @@ run_serving_tests() { # get client and server arguments server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') - model=$(echo "$params" | jq -r '.model') + model=$(echo "$params" | jq -r '.lmdeploy_server_model') server_args=$(json2args "$server_params") client_args=$(json2args "$client_params") qps_list=$(echo "$params" | jq -r '.qps_list') @@ -105,11 +105,10 @@ run_serving_tests() { fi # prepare tokenizer - server_model=$(echo "$server_params" | jq -r '.model') rm -rf /tokenizer_cache mkdir /tokenizer_cache python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$server_model" \ + --model "$model" \ --cachedir /tokenizer_cache server_command="lmdeploy server api_server $model $server_args" diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index d854e59342119..a730c172089f8 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -2,7 +2,7 @@ { "test_name": "llama8B_tp1_sharegpt", "qps_list": [1, 16], - "model": "meta-llama/Llama-2-7b-hf", + "lmdeploy_server_model": "meta-llama/Llama-2-7b-hf", "lmdeploy_server_parameters": { "tp": 1, "server_port": 8000 From 9483acf7f2c5d606b768f096143d1af26c4da264 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 00:00:58 -0700 Subject: [PATCH 010/150] typo fix --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 898c78ec86a20..626b23f1e1689 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -111,7 +111,7 @@ run_serving_tests() { --model "$model" \ --cachedir /tokenizer_cache - server_command="lmdeploy server api_server $model $server_args" + server_command="lmdeploy serve api_server $model $server_args" # run the server echo "Running test case $test_name" From d72ae51d7e0a0357bc131d12339d8931f20d1899 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 00:02:48 -0700 Subject: [PATCH 011/150] add wait_for_server --- .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 626b23f1e1689..306830d2651ee 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -64,6 +64,15 @@ json2args() { echo "$args" } +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + run_serving_tests() { From 0e819f034109e79984efec0a8c30e4bbd27369c4 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 00:22:57 -0700 Subject: [PATCH 012/150] update summarization script --- .../run-lmdeploy-benchmarks.sh | 42 ----------- .../scripts/run-lmdeploy-nightly.sh | 40 +++++------ .../scripts/summary-nightly-results.py | 70 +++++++++++++++++++ 3 files changed, 87 insertions(+), 65 deletions(-) delete mode 100644 .buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh create mode 100644 .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py diff --git a/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh b/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh deleted file mode 100644 index c1a579eaeefc4..0000000000000 --- a/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh +++ /dev/null @@ -1,42 +0,0 @@ -#!/bin/bash - -# This script should be run inside the tgi container. Enter the latest tgi container by -# docker run --gpus all -e "HF_TOKEN=" -v ~/.cache/huggingface:/root/.cache/huggingface --entrypoint /bin/bash openmmlab/lmdeploy:latest -# lmdeploy serve api_server internlm/internlm2-chat-7b -# docker run -it --gpus all -e "HF_TOKEN=" --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0 -# (please modify `` to your own huggingface token in the above command -# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash. - - - -set -ex -set -o pipefail - -# install conda -(which wget && which curl) || (apt-get update && apt-get install -y wget curl) -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3 -~/miniconda3/bin/conda init bash -eval "$(cat ~/.bashrc | tail -n +15)" - -# create conda environment for vllm -conda create -n vllm python=3.9 -y -eval "$(conda shell.bash hook)" -conda activate vllm -pip install vllm - -# clone vllm repo -cd / -git clone https://github.com/vllm-project/vllm.git -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -# launch TGI server -lmdeploy serve api_server meta-llama/Llama-2-7b-hf --server-port 8000 & -tgi_pid=$! -timeout 600 bash -c 'until curl localhost:8000/v1/completion; do sleep 1; done' || exit 1 - -# gradually reduce the request rate from 20, untill all request successed -request_rate=20 -echo "Running benchmark with request rate $request_rate..." -python3 vllm/benchmarks/benchmark_serving.py --backend lmdeploy --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200 --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt -kill $tgi_pid diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 306830d2651ee..82bca8d8ed344 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -3,7 +3,6 @@ set -ex set -o pipefail - check_gpus() { # check the number of GPUs and GPU type. declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) @@ -17,20 +16,18 @@ check_gpus() { echo "GPU type is $gpu_type" } - - kill_gpu_processes() { # kill all processes on GPU. pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) if [ -z "$pids" ]; then - echo "No GPU processes found." + echo "No GPU processes found." else - for pid in $pids; do - kill -9 "$pid" - echo "Killed process with PID: $pid" - done + for pid in $pids; do + kill -9 "$pid" + echo "Killed process with PID: $pid" + done - echo "All GPU processes have been killed." + echo "All GPU processes have been killed." fi # waiting for GPU processes to be fully killed @@ -46,8 +43,6 @@ kill_gpu_processes() { echo "GPU 0 Memory Usage: $gpu_memory_usage MB" } - - json2args() { # transforms the JSON string to command line args, and '_' is replaced to '-' # example: @@ -73,8 +68,6 @@ wait_for_server() { done' && return 0 || return 1 } - - run_serving_tests() { # run serving tests using `benchmark_serving.py` # $1: a json file specifying serving test cases @@ -95,7 +88,6 @@ run_serving_tests() { continue fi - # get client and server arguments server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') @@ -174,7 +166,7 @@ run_serving_tests() { gpu_type: $gpu, engine: $engine }') - echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands" + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" done @@ -183,17 +175,19 @@ run_serving_tests() { done } -main () { +main() { - # enter vllm directory - cd /vllm/benchmarks + check_gpus + # enter vllm directory + cd /vllm/benchmarks - declare -g RESULTS_FOLDER=results/ - mkdir -p $RESULTS_FOLDER - BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ - run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + CURRENT_LLM_SERVING_ENGINE=lmdeploy python $BENCHMARK_ROOT/scripts/summary-nightly-results.py } -main "$@" \ No newline at end of file +main "$@" diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py new file mode 100644 index 0000000000000..6c2668ed2b3ec --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -0,0 +1,70 @@ +import json +import os +from pathlib import Path + +import pandas as pd + +results_folder = Path("results/") + +# serving results and the keys that will be printed into markdown +serving_results = [] +serving_column_mapping = { + "test_name": "Test name", + "gpu_type": "GPU", + "completed": "Successful req.", + "request_throughput": "Tput (req/s)", + # "input_throughput": "Input Tput (tok/s)", + # "output_throughput": "Output Tput (tok/s)", + "mean_ttft_ms": "Mean TTFT (ms)", + "median_ttft_ms": "Median TTFT (ms)", + "p99_ttft_ms": "P99 TTFT (ms)", + # "mean_tpot_ms": "Mean TPOT (ms)", + # "median_tpot_ms": "Median", + # "p99_tpot_ms": "P99", + "mean_itl_ms": "Mean ITL (ms)", + "median_itl_ms": "Median ITL (ms)", + "p99_itl_ms": "P99 ITL (ms)", + "engine": "Engine", +} + + + + +if __name__ == "__main__": + + # collect results + for test_file in results_folder.glob("*.json"): + + with open(test_file, "r") as f: + raw_result = json.loads(f.read()) + + + # attach the benchmarking command to raw_result + with open(test_file.with_suffix(".commands"), "r") as f: + command = json.loads(f.read()) + raw_result.update(command) + + # update the test name of this result + raw_result.update({"test_name": test_file.stem}) + + # add the result to raw_result + serving_results.append(raw_result) + continue + + + serving_results = pd.DataFrame.from_dict(serving_results) + + + if not serving_results.empty: + serving_results = serving_results[list( + serving_column_mapping.keys())].rename( + columns=serving_column_mapping) + + + prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE") + + # document benchmarking results in json + with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: + + results = serving_results.to_dict(orient='records') + f.write(json.dumps(results)) From 9181a1d27a56bb7054d3706343b657ca3e9b7283 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 00:37:33 -0700 Subject: [PATCH 013/150] use pkill tp kill lmdeploy --- .../scripts/run-lmdeploy-nightly.sh | 18 +----------------- 1 file changed, 1 insertion(+), 17 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 82bca8d8ed344..ff58d93b06d7c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -17,25 +17,9 @@ check_gpus() { } kill_gpu_processes() { - # kill all processes on GPU. - pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) - if [ -z "$pids" ]; then - echo "No GPU processes found." - else - for pid in $pids; do - kill -9 "$pid" - echo "Killed process with PID: $pid" - done - - echo "All GPU processes have been killed." - fi - + pkill lmdeploy || true # waiting for GPU processes to be fully killed sleep 10 - - # remove vllm config file - rm -rf ~/.config/vllm - # Print the GPU memory usage # so that we know if all GPU processes are killed. gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) From 6e1936c6625c57c707ea6c73914ac5d80f2cd330 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 01:04:59 -0700 Subject: [PATCH 014/150] update script for tgi --- .../nightly-benchmarks/run-nightly-suite.sh | 6 + .../scripts/run-tgi-nightly.sh | 167 ++++++++++++++++++ .../tests/nightly-tests.json | 16 +- 3 files changed, 184 insertions(+), 5 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 04cdd0d8322b7..058b84df80a86 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -48,6 +48,12 @@ main() { bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh fi + # run tgi + if [ -e /tgi-entrypoint.sh ]; then + echo "tgi is available, redirect to run-tgi-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh + fi + } main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh new file mode 100644 index 0000000000000..b0b8f26cfee33 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -0,0 +1,167 @@ +#!/bin/bash + +set -ex +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + pkill text-generation || true + # waiting for GPU processes to be fully killed + sleep 10 + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + timeout 1200 bash -c ' + until curl localhost:8000/generate_stream; do + sleep 1 + done' && return 0 || return 1 +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + # append tgi to the test name + test_name=tgi_$test_name + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.tgi_server_parameters') + client_params=$(echo "$params" | jq -r '.tgi_client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.num_shard') + if [[ $gpu_count -lt $tp ]]; then + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $testname." + continue + fi + + + server_command="./tgi-entrypoint.sh $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "tgi server is up and running." + else + echo "" + echo "tgi failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend tgi \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "tgi" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main() { + + check_gpus + # enter vllm directory + cd /vllm/benchmarks + + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + CURRENT_LLM_SERVING_ENGINE=tgi python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index a730c172089f8..29d800a215bda 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -7,19 +7,25 @@ "tp": 1, "server_port": 8000 }, - "client_parameters": { - "model": "meta-llama/Llama-2-7b-hf", + "lmdeploy_client_parameters": { + "model": "llama2", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "port": 8000 }, - "lmdeploy_client_parameters": { - "model": "llama2", + "tgi_server_parameters": { + "model_id": "meta-llama/Llama-2-7b-hf", + "num_shard": 2, + "port": 8000 + }, + "tgi_client_parameters": { + "model": "meta-llama/Llama-2-7b-hf", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, - "port": 8000 + "port": 8000, + "endpoint": "/generate_stream" } } ] \ No newline at end of file From c6aded948727b1ec43ce8b21916331735eb71989 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 01:07:20 -0700 Subject: [PATCH 015/150] add install jq --- .buildkite/nightly-benchmarks/run-nightly-suite.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 058b84df80a86..2e0e974873c83 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -35,6 +35,7 @@ main() { check_hf_token (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) cd / git clone https://github.com/KuntaiDu/vllm.git cd vllm From ccbcd18a272c2b30551bfe8b93c0d3750ee79064 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 01:10:06 -0700 Subject: [PATCH 016/150] reduce 7b llama tp to 1 --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 29d800a215bda..f393dee4ca7a2 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -16,7 +16,7 @@ }, "tgi_server_parameters": { "model_id": "meta-llama/Llama-2-7b-hf", - "num_shard": 2, + "num_shard": 1, "port": 8000 }, "tgi_client_parameters": { From 38cc38a70c5934bd66b20bc9f3fdc8b09aab8e2e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 01:11:07 -0700 Subject: [PATCH 017/150] update lmdeploy tp --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index ff58d93b06d7c..43d0b26e19f88 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -83,7 +83,7 @@ run_serving_tests() { echo "Running over qps list $qps_list" # check if there is enough GPU to run the test - tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + tp=$(echo "$server_params" | jq -r '.tp') if [[ $gpu_count -lt $tp ]]; then echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." continue From 832891e993b6aee0d40eb82ea6c5b2551e5e2126 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 01:13:11 -0700 Subject: [PATCH 018/150] bug fix --- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index b0b8f26cfee33..a6edb652619ca 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -87,7 +87,7 @@ run_serving_tests() { fi - server_command="./tgi-entrypoint.sh $server_args" + server_command="/tgi-entrypoint.sh $server_args" # run the server echo "Running test case $test_name" From 587780694b8bd17eede9033e930c82cf6b4d341d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 21:21:34 -0700 Subject: [PATCH 019/150] update tensorrt script --- .../scripts/run-lmdeploy-nightly.sh | 2 +- .../scripts/run-tgi-nightly.sh | 2 +- .../scripts/run-trt-nightly.sh | 244 ++++++++++++++++++ .../tests/nightly-tests.json | 18 ++ 4 files changed, 264 insertions(+), 2 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 43d0b26e19f88..0010fe8403974 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -85,7 +85,7 @@ run_serving_tests() { # check if there is enough GPU to run the test tp=$(echo "$server_params" | jq -r '.tp') if [[ $gpu_count -lt $tp ]]; then - echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index a6edb652619ca..f6de7728aea2f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -82,7 +82,7 @@ run_serving_tests() { # check if there is enough GPU to run the test tp=$(echo "$server_params" | jq -r '.num_shard') if [[ $gpu_count -lt $tp ]]; then - echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $testname." + echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh new file mode 100644 index 0000000000000..416839b7d32fb --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -0,0 +1,244 @@ +#!/bin/bash + +set -ex +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + pkill text-generation || true + # waiting for GPU processes to be fully killed + sleep 10 + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + timeout 1200 bash -c ' + until curl localhost:8000/generate_stream; do + sleep 1 + done' && return 0 || return 1 +} + + +run_trt_server() { + + params=$1 + + model_name=$(echo "$params" | jq -r '.model_name') + model_path=$(echo "$params" | jq -r '.model_path') + model_type=$(echo "$params" | jq -r '.model_type') + model_dtype=$(echo "$params" | jq -r '.model_dtype') + model_tp_size=$(echo "$params" | jq -r '.model_tp_size') + max_batch_size=$(echo "$params" | jq -r '.max_batch_size') + max_input_len=$(echo "$params" | jq -r '.max_input_len') + max_output_len=$(echo "$params" | jq -r '.max_output_len') + trt_llm_version=$(echo "$params" | jq -r '.trt_llm_version') + + cd ~ + rm -rf models + mkdir -p models + cd models + models_dir=$(pwd) + trt_model_path=${models_dir}/${model_name}-trt-ckpt + trt_engine_path=${models_dir}/${model_name}-trt-engine + + cd ~ + rm -rf tensorrt-demo + git clone https://github.com/neuralmagic/tensorrt-demo.git + cd tensorrt-demo + tensorrt_demo_dir=$(pwd) + + # make sure the parameter inside tensorrt_demo is consistent to envvar + sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt + sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt + sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt + sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt + sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt + sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt + + + cd / + rm -rf tensorrtllm_backend + git clone https://github.com/triton-inference-server/tensorrtllm_backend.git + git lfs install + cd tensorrtllm_backend + git checkout $trt_llm_version + tensorrtllm_backend_dir=$(pwd) + git submodule update --init --recursive + cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ + + cd /tensorrtllm_backend + cd ./tensorrt_llm/examples/${model_type} + + python3 convert_checkpoint.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + --tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} + + trtllm-build \ + --checkpoint_dir=${trt_model_path} \ + --gpt_attention_plugin=${model_dtype} \ + --gemm_plugin=${model_dtype} \ + --remove_input_padding=enable \ + --paged_kv_cache=enable \ + --tp_size=${model_tp_size} \ + --max_batch_size=${max_batch_size} \ + --max_input_len=${max_input_len} \ + --max_output_len=${max_output_len} \ + --max_num_tokens=${max_output_len} \ + --opt_num_tokens=${max_output_len} \ + --output_dir=${trt_engine_path} + + cd /tensorrtllm_backend/triton_model_repo + cp -r ${trt_engine_path}/* ./tensorrt_llm/1 + cd /tensorrtllm_backend + python3 scripts/launch_triton_server.py \ + --world_size=${model_tp_size} \ + --model_repo=/tensorrtllm_backend/triton_model_repo & +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + # append tgi to the test name + test_name=trt_$test_name + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.trt_server_parameters') + client_params=$(echo "$params" | jq -r '.trt_client_parameters') + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.model_tp_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + + # run the server + echo "Running test case $test_name" + run_trt_server $server_params + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "trt server is up and running." + else + echo "" + echo "trt failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend trt \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "tgi" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main() { + + check_gpus + + + # enter vllm directory + cd /vllm/benchmarks + + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + CURRENT_LLM_SERVING_ENGINE=trt python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index f393dee4ca7a2..88fa7a9fd5ea8 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -26,6 +26,24 @@ "num_prompts": 200, "port": 8000, "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_path": "meta-llama/llama-2-7b-chat-hf", + "model_name": "llama-2-7b-chat-hf", + "model_type": "llama", + "model_dtype": "float16", + "model_tp_size": 1, + "max_batch_size": 256, + "max_input_len": 10000, + "max_output_len": 10000 + }, + "trt_client_parameters": { + "model": "meta-llama/Llama-2-7b-hf", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200, + "port": 8000, + "endpoint": "/v2/models/ensemble/generate_stream" } } ] \ No newline at end of file From 9972abac594dc7f57754f20a27860e2a366493b9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 22:41:39 -0700 Subject: [PATCH 020/150] update nightly suite --- .buildkite/nightly-benchmarks/run-nightly-suite.sh | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 2e0e974873c83..2a3e0b81c981d 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -55,6 +55,12 @@ main() { bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh fi + # run trt + if which trtllm-build >/dev/null; then + echo "trtllm is available, redirect to run-trt-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh + fi + } main "$@" \ No newline at end of file From 6493679313dd3e882c80ae95f6257b6fff824790 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 22:47:40 -0700 Subject: [PATCH 021/150] add double quote --- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 416839b7d32fb..f4ebab70e18b1 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -167,7 +167,7 @@ run_serving_tests() { # run the server echo "Running test case $test_name" - run_trt_server $server_params + run_trt_server "$server_params" # wait until the server is alive wait_for_server From e62cae68a29a8037f2dd60d7be72715c45b30742 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 22:48:53 -0700 Subject: [PATCH 022/150] add trt llm version --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 88fa7a9fd5ea8..adb62761d36a8 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -35,7 +35,8 @@ "model_tp_size": 1, "max_batch_size": 256, "max_input_len": 10000, - "max_output_len": 10000 + "max_output_len": 10000, + "trt_llm_version": "r24.04" }, "trt_client_parameters": { "model": "meta-llama/Llama-2-7b-hf", From 9ce358963a3491f700435b0bcdb17c9591eb43ee Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 22:58:15 -0700 Subject: [PATCH 023/150] update trt --- .../scripts/run-trt-nightly.sh | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index f4ebab70e18b1..4e46064c69e2b 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -152,6 +152,7 @@ run_serving_tests() { # get client and server arguments server_params=$(echo "$params" | jq -r '.trt_server_parameters') client_params=$(echo "$params" | jq -r '.trt_client_parameters') + model=$(echo "$client_params" | jq -r '.model') client_args=$(json2args "$client_params") qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') @@ -165,6 +166,15 @@ run_serving_tests() { fi + # prepare tokenizer + cd /vllm/benchmarks + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + + # run the server echo "Running test case $test_name" run_trt_server "$server_params" @@ -179,6 +189,9 @@ run_serving_tests() { echo "trt failed to start within the timeout period." fi + # go back to vllm benchmarking directory + cd /vllm/benchmarks + # iterate over different QPS for qps in $qps_list; do # remove the surrounding single quote from qps @@ -191,7 +204,8 @@ run_serving_tests() { new_test_name=$test_name"_qps_"$qps client_command="python3 benchmark_serving.py \ - --backend trt \ + --backend tensorrt-llm \ + --tokenizer /tokenizer_cache \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ From f634dee89da1abd8ba9283e650ea7809eaa90f20 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 19 Jun 2024 23:13:23 -0700 Subject: [PATCH 024/150] update on how to kill the server --- .../nightly-benchmarks/run-tgi-benchmarks.sh | 72 ------------- .../nightly-benchmarks/run-trt-benchmarks.sh | 102 ------------------ .../scripts/run-trt-nightly.sh | 4 +- 3 files changed, 2 insertions(+), 176 deletions(-) delete mode 100644 .buildkite/nightly-benchmarks/run-tgi-benchmarks.sh delete mode 100644 .buildkite/nightly-benchmarks/run-trt-benchmarks.sh diff --git a/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh b/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh deleted file mode 100644 index 27f0fe57f3716..0000000000000 --- a/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh +++ /dev/null @@ -1,72 +0,0 @@ -#!/bin/bash - -# This script should be run inside the tgi container. Enter the latest tgi container by -# docker run -it --gpus all -e "HF_TOKEN=" --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0 -# (please modify `` to your own huggingface token in the above command -# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash. -# Benchmarking results will be inside /vllm/benchmarks/*.txt -# NOTE: this script gradually reduces the request rate from 20, to ensure all requests are successful. - -set -ex -set -o pipefail - -# install conda -(which wget && which curl) || (apt-get update && apt-get install -y wget curl) -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3 -~/miniconda3/bin/conda init bash -eval "$(cat ~/.bashrc | tail -n +15)" - -# create conda environment for vllm -conda create -n vllm python=3.9 -y -eval "$(conda shell.bash hook)" -conda activate vllm -pip install vllm - -# clone vllm repo -cd / -git clone https://github.com/vllm-project/vllm.git -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - -# launch TGI server -/tgi-entrypoint.sh --port 8000 --model-id meta-llama/Llama-2-7b-chat-hf & -tgi_pid=$! -timeout 600 bash -c 'until curl localhost:8000/generate_stream; do sleep 1; done' || exit 1 - -# gradually reduce the request rate from 20, untill all request successed -request_rate=20 -get_successful_requests() { - grep "Successful requests:" benchmark_serving.txt | awk '{print $3}' -} -while true; do - echo "Running benchmark with request rate $request_rate..." - python3 vllm/benchmarks/benchmark_serving.py --backend tgi --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --endpoint /generate_stream --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt - bench_serving_exit_code=$? - successful_requests=$(get_successful_requests) - echo "Successful requests: $successful_requests" - if [ "$successful_requests" -eq 1000 ]; then - echo "Reached 1000 successful requests with request rate $request_rate" - break - fi - request_rate=$((request_rate - 1)) - if [ "$request_rate" -lt 1 ]; then - echo "Request rate went below 1. Exiting." - break - fi -done -kill $tgi_pid - -echo "### TGI Serving Benchmarks" >>benchmark_results.md -sed -n '1p' benchmark_serving.txt >>benchmark_results.md -echo "" >>benchmark_results.md -echo '```' >>benchmark_results.md -tail -n 17 benchmark_serving.txt >>benchmark_results.md -echo '```' >>benchmark_results.md - -# if the agent binary is not found, skip uploading the results, exit 0 -if [ ! -f /workspace/buildkite-agent ]; then - exit 0 -fi - -# upload the results to buildkite -/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 --runtime=nvidia --gpus all --entrypoint /bin/bash nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 -# (please modify `` to your own huggingface token in the above command -# Then, copy-paste this file into the docker and execute it using bash. - -set -xe -TRT_LLM_VERSION=r24.04 -model_path=meta-llama/llama-2-7b-chat-hf -model_name=llama-2-7b-chat-hf -model_type=llama -model_dtype=float16 -model_tp_size=1 -max_batch_size=233 -max_input_len=15000 -max_output_len=15000 -cd ~ -mkdir models -cd models -models_dir=`pwd` -trt_model_path=${models_dir}/${model_name}-trt-ckpt -trt_engine_path=${models_dir}/${model_name}-trt-engine - - - -cd ~ -git clone https://github.com/neuralmagic/tensorrt-demo.git -cd tensorrt-demo -tensorrt_demo_dir=`pwd` - -# make sure the parameter inside tensorrt_demo is consistent to envvar -sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt -sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt -sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt - - -cd / -git clone https://github.com/triton-inference-server/tensorrtllm_backend.git -git lfs install -cd tensorrtllm_backend -git checkout $TRT_LLM_VERSION -tensorrtllm_backend_dir=`pwd` - -git submodule update --init --recursive -cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ - -cd /tensorrtllm_backend -cd ./tensorrt_llm/examples/${model_type} - -python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} - -trtllm-build \ - --checkpoint_dir=${trt_model_path} \ - --gpt_attention_plugin=${model_dtype} \ - --gemm_plugin=${model_dtype} \ - --remove_input_padding=enable \ - --paged_kv_cache=enable \ - --tp_size=${model_tp_size} \ - --max_batch_size=${max_batch_size} \ - --max_input_len=${max_input_len} \ - --max_output_len=${max_output_len} \ - --max_num_tokens=${max_output_len} \ - --opt_num_tokens=${max_output_len} \ - --output_dir=${trt_engine_path} - -cd /tensorrtllm_backend/triton_model_repo -cp -r ${trt_engine_path}/* ./tensorrt_llm/1 -cd /tensorrtllm_backend -python3 scripts/launch_triton_server.py --world_size=${model_tp_size} --model_repo=/tensorrtllm_backend/triton_model_repo & - - -# sleep for 20 seconds, to make sure the server is launched -sleep 30 - - -# install vllm inside conda, for benchmarking. -(which wget && which curl) || (apt-get update && apt-get install -y wget curl) -wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh -bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3 -~/miniconda3/bin/conda init bash -eval "$(cat ~/.bashrc | tail -n +15)" -conda create -n vllm python=3.9 -y -eval "$(conda shell.bash hook)" -conda activate vllm -pip install vllm - -# clone vllm's benchmark_serving script -cd ~ -git clone https://github.com/vllm-project/vllm.git -cd vllm/benchmarks/ - -export VLLM_HOST_IP=$(hostname -I | awk '{print $1}') -wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json -python benchmark_serving.py --backend tensorrt-llm --endpoint /v2/models/ensemble/generate_stream --port 8000 --model $model_path --save-result --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 2>&1 | tee benchmark_serving.txt diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 4e46064c69e2b..ea49df1d22f59 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -17,9 +17,9 @@ check_gpus() { } kill_gpu_processes() { - pkill text-generation || true + pkill tritonserver || true # waiting for GPU processes to be fully killed - sleep 10 + sleep 20 # Print the GPU memory usage # so that we know if all GPU processes are killed. gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) From b47e30be938f2ec86c8c8227ff8b478dd5fc23d0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 00:19:02 -0700 Subject: [PATCH 025/150] update vllm nightly test --- .../nightly-benchmarks/run-nightly-suite.sh | 6 + .../scripts/run-lmdeploy-nightly.sh | 7 +- .../scripts/run-trt-nightly.sh | 4 +- .../scripts/run-vllm-nightly.sh | 182 ++++++++++++++++++ .../tests/nightly-tests.json | 11 ++ 5 files changed, 206 insertions(+), 4 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 2a3e0b81c981d..3e938a87a1bfb 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -61,6 +61,12 @@ main() { bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh fi + # run vllm + if python3 -c "import vllm" &> /dev/null; then + echo "vllm is available, redirect to run-vllm-nightly.sh" + bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh + fi + } main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 0010fe8403974..42918e41946bb 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -63,14 +63,17 @@ run_serving_tests() { jq -c '.[]' "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') - # append lmdeploy to the test name - test_name=lmdeploy_$test_name # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." continue fi + + # append lmdeploy to the test name + test_name=lmdeploy_$test_name + + # get client and server arguments server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index ea49df1d22f59..3956db112023c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -140,7 +140,7 @@ run_serving_tests() { jq -c '.[]' "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') - # append tgi to the test name + # append trt to the test name test_name=trt_$test_name # if TEST_SELECTOR is set, only run the test cases that match the selector @@ -222,7 +222,7 @@ run_serving_tests() { --arg server "$server_command" \ --arg client "$client_command" \ --arg gpu "$gpu_type" \ - --arg engine "tgi" \ + --arg engine "trt" \ '{ server_command: $server, client_command: $client, diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh new file mode 100644 index 0000000000000..a06851e474638 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -0,0 +1,182 @@ +#!/bin/bash + +set -ex +set -o pipefail + +check_gpus() { + # check the number of GPUs and GPU type. + declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l) + if [[ $gpu_count -gt 0 ]]; then + echo "GPU found." + else + echo "Need at least 1 GPU to run benchmarking." + exit 1 + fi + declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}') + echo "GPU type is $gpu_type" +} + +kill_gpu_processes() { + pkill lmdeploy || true + # waiting for GPU processes to be fully killed + sleep 10 + # Print the GPU memory usage + # so that we know if all GPU processes are killed. + gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) + # The memory usage should be 0 MB. + echo "GPU 0 Memory Usage: $gpu_memory_usage MB" +} + +json2args() { + # transforms the JSON string to command line args, and '_' is replaced to '-' + # example: + # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 } + # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1 + local json_string=$1 + local args=$( + echo "$json_string" | jq -r ' + to_entries | + map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) | + join(" ") + ' + ) + echo "$args" +} + +wait_for_server() { + # wait for vllm server to start + # return 1 if vllm server crashes + timeout 1200 bash -c ' + until curl localhost:8000/v1/completions; do + sleep 1 + done' && return 0 || return 1 +} + +run_serving_tests() { + # run serving tests using `benchmark_serving.py` + # $1: a json file specifying serving test cases + + local serving_test_file + serving_test_file=$1 + + # Iterate over serving tests + jq -c '.[]' "$serving_test_file" | while read -r params; do + # get the test name, and append the GPU type back to it. + test_name=$(echo "$params" | jq -r '.test_name') + + # if TEST_SELECTOR is set, only run the test cases that match the selector + if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then + echo "Skip test case $test_name." + continue + fi + + # append vllm to the test name + test_name=vllm_$test_name + + + + # get client and server arguments + server_params=$(echo "$params" | jq -r '.vllm_server_parameters') + client_params=$(echo "$params" | jq -r '.vllm_client_parameters') + server_args=$(json2args "$server_params") + client_args=$(json2args "$client_params") + qps_list=$(echo "$params" | jq -r '.qps_list') + qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') + echo "Running over qps list $qps_list" + + # check if there is enough GPU to run the test + tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') + if [[ $gpu_count -lt $tp ]]; then + echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." + continue + fi + + # check if server model and client model is aligned + server_model=$(echo "$server_params" | jq -r '.model') + client_model=$(echo "$client_params" | jq -r '.model') + if [[ $server_model != "$client_model" ]]; then + echo "Server model and client model must be the same. Skip testcase $test_name." + continue + fi + + + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + $server_args" + + # run the server + echo "Running test case $test_name" + echo "Server command: $server_command" + eval "$server_command" & + + # wait until the server is alive + wait_for_server + if [ $? -eq 0 ]; then + echo "" + echo "vllm server is up and running." + else + echo "" + echo "vllm failed to start within the timeout period." + fi + + # iterate over different QPS + for qps in $qps_list; do + # remove the surrounding single quote from qps + if [[ "$qps" == *"inf"* ]]; then + echo "qps was $qps" + qps="inf" + echo "now qps is $qps" + fi + + new_test_name=$test_name"_qps_"$qps + + client_command="python3 benchmark_serving.py \ + --backend vllm \ + --save-result \ + --result-dir $RESULTS_FOLDER \ + --result-filename ${new_test_name}.json \ + --request-rate $qps \ + $client_args" + + echo "Running test case $test_name with qps $qps" + echo "Client command: $client_command" + + eval "$client_command" + + # record the benchmarking commands + jq_output=$(jq -n \ + --arg server "$server_command" \ + --arg client "$client_command" \ + --arg gpu "$gpu_type" \ + --arg engine "vllm" \ + '{ + server_command: $server, + client_command: $client, + gpu_type: $gpu, + engine: $engine + }') + echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands" + + done + + # clean up + kill_gpu_processes + done +} + +main() { + + check_gpus + # enter vllm directory + cd /vllm/benchmarks + + declare -g RESULTS_FOLDER=results/ + mkdir -p $RESULTS_FOLDER + BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + + run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json + CURRENT_LLM_SERVING_ENGINE=vllm python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + +} + +main "$@" diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index adb62761d36a8..1328f281536bf 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -45,6 +45,17 @@ "num_prompts": 200, "port": 8000, "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "model": "meta-llama/Llama-2-7b-hf", + "tensor_parallel_size": 1 + }, + "vllm_client_parameters": { + "model": "meta-llama/Llama-2-7b-hf", + "backend": "vllm", + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 200 } } ] \ No newline at end of file From ec8b29597e70bd1a53f8d55457d5a1add8058d6a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 00:22:21 -0700 Subject: [PATCH 026/150] disalbe vllm server log --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 1328f281536bf..c37ef39470a98 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -55,7 +55,9 @@ "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 + "num_prompts": 200, + "disable_log_stats": "", + "disable_log_requests": "" } } ] \ No newline at end of file From 792ef7f21ccedbd1a2dd27c1f94e8f3eb06671ca Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 00:28:33 -0700 Subject: [PATCH 027/150] adjust how to kill processes in vllm --- .../scripts/run-vllm-nightly.sh | 20 +++++++++++++++++-- 1 file changed, 18 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index a06851e474638..d0a145d8186f2 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -17,9 +17,25 @@ check_gpus() { } kill_gpu_processes() { - pkill lmdeploy || true + # kill all processes on GPU. + pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) + if [ -z "$pids" ]; then + echo "No GPU processes found." + else + for pid in $pids; do + kill -9 "$pid" + echo "Killed process with PID: $pid" + done + + echo "All GPU processes have been killed." + fi + # waiting for GPU processes to be fully killed sleep 10 + + # remove vllm config file + rm -rf ~/.config/vllm + # Print the GPU memory usage # so that we know if all GPU processes are killed. gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0) @@ -175,7 +191,7 @@ main() { BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - CURRENT_LLM_SERVING_ENGINE=vllm python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + CURRENT_LLM_SERVING_ENGINE=vllm python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py } From 4f67c960ccc3d30ae7e4a73b1a6ed835e0d6d61e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:07:10 -0700 Subject: [PATCH 028/150] update nightly tests --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index c37ef39470a98..bfff63ee1cb47 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -48,16 +48,16 @@ }, "vllm_server_parameters": { "model": "meta-llama/Llama-2-7b-hf", - "tensor_parallel_size": 1 + "tensor_parallel_size": 1, + "disable_log_stats": "", + "disable_log_requests": "" }, "vllm_client_parameters": { "model": "meta-llama/Llama-2-7b-hf", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "disable_log_stats": "", - "disable_log_requests": "" + "num_prompts": 200 } } ] \ No newline at end of file From f0fe30cc6464a4c816b0346e7de247f77631ff0d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:25:20 -0700 Subject: [PATCH 029/150] update summary results --- .../scripts/summary-nightly-results.py | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 6c2668ed2b3ec..3814f4f515114 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -3,6 +3,7 @@ from pathlib import Path import pandas as pd +from tabulate import tabulate results_folder = Path("results/") @@ -53,6 +54,10 @@ serving_results = pd.DataFrame.from_dict(serving_results) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) if not serving_results.empty: @@ -63,6 +68,12 @@ prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE") + # document benchmarking results in markdown + with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: + f.write(serving_md_table) + f.write('\n') + + # document benchmarking results in json with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: From d0978436d7f43a1e7ac65d9cdd7210ce94c270ea Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:26:51 -0700 Subject: [PATCH 030/150] update summary results --- .../scripts/summary-nightly-results.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 3814f4f515114..ef8239ed94472 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -54,10 +54,6 @@ serving_results = pd.DataFrame.from_dict(serving_results) - serving_md_table = tabulate(serving_results, - headers='keys', - tablefmt='pipe', - showindex=False) if not serving_results.empty: @@ -65,6 +61,11 @@ serving_column_mapping.keys())].rename( columns=serving_column_mapping) + serving_md_table = tabulate(serving_results, + headers='keys', + tablefmt='pipe', + showindex=False) + prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE") From 2c30f38b1520e77776af9796d6d80f861233e235 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:34:55 -0700 Subject: [PATCH 031/150] add upload_to_buildkite utility --- .../scripts/run-lmdeploy-nightly.sh | 19 ++++++++++- .../scripts/run-tgi-nightly.sh | 19 ++++++++++- .../scripts/run-trt-nightly.sh | 18 +++++++++- .../scripts/run-vllm-nightly.sh | 33 +++++++++++-------- 4 files changed, 72 insertions(+), 17 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 42918e41946bb..79cf30f924746 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -162,6 +162,20 @@ run_serving_tests() { done } + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + main() { check_gpus @@ -172,8 +186,11 @@ main() { mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + export CURRENT_LLM_SERVING_ENGINE=lmdeploy run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - CURRENT_LLM_SERVING_ENGINE=lmdeploy python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python -m pip install tabulate pandas + python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index f6de7728aea2f..398e2017f9b56 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -149,6 +149,20 @@ run_serving_tests() { done } + + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + main() { check_gpus @@ -159,8 +173,11 @@ main() { mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + export CURRENT_LLM_SERVING_ENGINE=tgi run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - CURRENT_LLM_SERVING_ENGINE=tgi python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python -m pip install tabulate pandas + python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 3956db112023c..71badb8ebf647 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -238,6 +238,19 @@ run_serving_tests() { done } +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + + main() { check_gpus @@ -250,8 +263,11 @@ main() { mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + export CURRENT_LLM_SERVING_ENGINE=trt run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - CURRENT_LLM_SERVING_ENGINE=trt python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + python -m pip install tabulate pandas + python $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index d0a145d8186f2..3ce8f7e3bb4d8 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -18,19 +18,7 @@ check_gpus() { kill_gpu_processes() { # kill all processes on GPU. - pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader) - if [ -z "$pids" ]; then - echo "No GPU processes found." - else - for pid in $pids; do - kill -9 "$pid" - echo "Killed process with PID: $pid" - done - - echo "All GPU processes have been killed." - fi - - # waiting for GPU processes to be fully killed + pkill pt_main_thread sleep 10 # remove vllm config file @@ -180,6 +168,19 @@ run_serving_tests() { done } + +upload_to_buildkite() { + # upload the benchmarking results to buildkite + + # if the agent binary is not found, skip uploading the results, exit 0 + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + fi + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" +} + main() { check_gpus @@ -190,8 +191,12 @@ main() { mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + export CURRENT_LLM_SERVING_ENGINE=vllm run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json - CURRENT_LLM_SERVING_ENGINE=vllm python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + + python3 -m pip install tabulate pandas + python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py + upload_to_buildkite } From 7304668667d035710d2eb2686c381b87aeaaa063 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:38:56 -0700 Subject: [PATCH 032/150] update kickoff pipeline to initiate nightly benchmark --- .../nightly-benchmarks/kickoff-pipeline.sh | 18 ++- .../nightly-benchmarks/nightly-pipeline.yaml | 126 ++++++++++++++++++ 2 files changed, 138 insertions(+), 6 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/nightly-pipeline.yaml diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index 15d411febcee1..12e63e9c9278e 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -10,18 +10,24 @@ apt install -y curl jq # Install minijinja for templating curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh source $HOME/.cargo/env +local target_yaml_file="" # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then - echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks." - else - echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks." - exit 0 + echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." + target_yaml_file=".buildkite/nightly-benchmarks/benchmark-pipeline.yaml" + fi + + if [[ $PR_LABELS == *"comp-benchmarks"* ]]; then + echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." + target_yaml_file=".buildkite/nightly-benchmarks/nightly-pipeline.yaml" fi fi -# Upload sample.yaml -buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml +if [ -n "$target_yaml_file" ]; then + # Upload sample.yaml + buildkite-agent pipeline upload $target_yaml_file +fi diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml new file mode 100644 index 0000000000000..79092eeed4bca --- /dev/null +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -0,0 +1,126 @@ +steps: + - label: "A100 trt benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + command: + - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - wait + - label: "A100 vllm benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: vllm/vllm-openai:latest + command: + - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - wait + - label: "A100 tgi benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: ghcr.io/huggingface/text-generation-inference:2.0 + command: + - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - wait + - label: "A100 lmdeploy benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + containers: + - image: openmmlab/lmdeploy:latest + command: + - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - wait + \ No newline at end of file From f811ef0832de5469ce04ca58392c52975fe8c918 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:39:47 -0700 Subject: [PATCH 033/150] update kickoff pipeline --- .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index 12e63e9c9278e..bf25aef70f5ac 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -10,7 +10,7 @@ apt install -y curl jq # Install minijinja for templating curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh source $HOME/.cargo/env -local target_yaml_file="" +target_yaml_file="" # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then From 1876048c51fb9d5a9ea515ba248b017d5471ee46 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 15:44:24 -0700 Subject: [PATCH 034/150] update the label name --- .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index bf25aef70f5ac..8a4f852477713 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -21,7 +21,7 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then target_yaml_file=".buildkite/nightly-benchmarks/benchmark-pipeline.yaml" fi - if [[ $PR_LABELS == *"comp-benchmarks"* ]]; then + if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." target_yaml_file=".buildkite/nightly-benchmarks/nightly-pipeline.yaml" fi From 0a4518dda8823e16a8889eca259ec386aa5ad860 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 22:21:18 -0700 Subject: [PATCH 035/150] bug fix: exit benchmarking script after finish benchmarking one application --- .buildkite/nightly-benchmarks/run-nightly-suite.sh | 6 +++++- .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +- 5 files changed, 9 insertions(+), 5 deletions(-) diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 3e938a87a1bfb..4f09c9e8423f8 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -47,24 +47,28 @@ main() { if which lmdeploy >/dev/null; then echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh" bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh + exit 0 fi # run tgi if [ -e /tgi-entrypoint.sh ]; then echo "tgi is available, redirect to run-tgi-nightly.sh" bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh + exit 0 fi # run trt if which trtllm-build >/dev/null; then echo "trtllm is available, redirect to run-trt-nightly.sh" bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh + exit 0 fi # run vllm - if python3 -c "import vllm" &> /dev/null; then + if [ -e /vllm-workspace ]; then echo "vllm is available, redirect to run-vllm-nightly.sh" bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh + exit 0 fi } diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 79cf30f924746..c95b8845be1c6 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -171,7 +171,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "lmdeploy-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 398e2017f9b56..23cbafa5c2edc 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -159,7 +159,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "tgi-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 71badb8ebf647..ec0d99570f08f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -246,7 +246,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "trt-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 3ce8f7e3bb4d8..021a8dc170d1c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -177,7 +177,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "vllm-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } From f47db88c31a2315080c918e2f8bdee343924c7a6 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 20 Jun 2024 22:22:55 -0700 Subject: [PATCH 036/150] make yapf, ruff and isort happy --- .../scripts/download-tokenizer.py | 19 +++++++++++++------ .../scripts/summary-nightly-results.py | 12 ++---------- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py index add331bfbd9f3..68ac5909e5951 100644 --- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py +++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py @@ -1,19 +1,26 @@ - import argparse -from pathlib import Path + from transformers import AutoTokenizer + def main(model, cachedir): # Load the tokenizer and save it to the specified directory tokenizer = AutoTokenizer.from_pretrained(model) tokenizer.save_pretrained(cachedir) print(f"Tokenizer saved to {cachedir}") + if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Download and save Hugging Face tokenizer") - parser.add_argument("--model", type=str, required=True, help="Name of the model") - parser.add_argument("--cachedir", type=str, required=True, help="Directory to save the tokenizer") + parser = argparse.ArgumentParser( + description="Download and save Hugging Face tokenizer") + parser.add_argument("--model", + type=str, + required=True, + help="Name of the model") + parser.add_argument("--cachedir", + type=str, + required=True, + help="Directory to save the tokenizer") args = parser.parse_args() main(args.model, args.cachedir) - \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index ef8239ed94472..ced57295f735e 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -28,9 +28,6 @@ "engine": "Engine", } - - - if __name__ == "__main__": # collect results @@ -39,7 +36,6 @@ with open(test_file, "r") as f: raw_result = json.loads(f.read()) - # attach the benchmarking command to raw_result with open(test_file.with_suffix(".commands"), "r") as f: command = json.loads(f.read()) @@ -52,28 +48,24 @@ serving_results.append(raw_result) continue - serving_results = pd.DataFrame.from_dict(serving_results) - if not serving_results.empty: serving_results = serving_results[list( serving_column_mapping.keys())].rename( columns=serving_column_mapping) - + serving_md_table = tabulate(serving_results, headers='keys', tablefmt='pipe', showindex=False) - - + prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE") # document benchmarking results in markdown with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: f.write(serving_md_table) f.write('\n') - # document benchmarking results in json with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: From 8f4da1b2a33217d9f43d48f65b09a988d0524087 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Fri, 21 Jun 2024 16:20:43 -0700 Subject: [PATCH 037/150] give nightly pipeline higher priority --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 79092eeed4bca..536ad4b0b5235 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -5,6 +5,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 command: @@ -36,6 +37,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: vllm/vllm-openai:latest command: @@ -67,6 +69,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: ghcr.io/huggingface/text-generation-inference:2.0 command: @@ -98,6 +101,7 @@ steps: plugins: - kubernetes: podSpec: + priorityClassName: perf-benchmark containers: - image: openmmlab/lmdeploy:latest command: From 2cbdac3af8603b4538b371d5a3826292e6f9aa60 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 22 Jun 2024 11:59:46 -0700 Subject: [PATCH 038/150] fix new bugs in latest lmdeploy docker --- .../nightly-benchmarks/scripts/get-lmdeploy-modelname.py | 6 ++++++ .../nightly-benchmarks/scripts/get_lmdeploy_modelname.py | 6 ++++++ .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 4 ++++ .buildkite/nightly-benchmarks/tests/nightly-tests.json | 1 - 4 files changed, 16 insertions(+), 1 deletion(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py create mode 100644 .buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py new file mode 100644 index 0000000000000..1f7ecb306c575 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -0,0 +1,6 @@ + +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient("http://localhost:8000") +model_name = api_client.available_models[0] + +print(model_name) \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py b/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py new file mode 100644 index 0000000000000..1f7ecb306c575 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py @@ -0,0 +1,6 @@ + +from lmdeploy.serve.openai.api_client import APIClient +api_client = APIClient("http://localhost:8000") +model_name = api_client.available_models[0] + +print(model_name) \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index c95b8845be1c6..559c92d4eebf4 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -116,6 +116,9 @@ run_serving_tests() { echo "vllm failed to start within the timeout period." fi + # get model name + model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py) + # iterate over different QPS for qps in $qps_list; do # remove the surrounding single quote from qps @@ -134,6 +137,7 @@ run_serving_tests() { --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ --request-rate $qps \ + --model \"$model_name\" \ $client_args" echo "Running test case $test_name with qps $qps" diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index bfff63ee1cb47..b12ac211a6c14 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -8,7 +8,6 @@ "server_port": 8000 }, "lmdeploy_client_parameters": { - "model": "llama2", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, From c24b963b8d1147817a6fc2b72ad22bec5c792edb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 23 Jun 2024 19:59:50 -0700 Subject: [PATCH 039/150] try llama 70B with tp 4 --- .../scripts/get-lmdeploy-modelname.py | 4 ++-- .../scripts/get_lmdeploy_modelname.py | 6 ----- .../tests/nightly-tests.json | 24 +++++++++---------- 3 files changed, 14 insertions(+), 20 deletions(-) delete mode 100644 .buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py index 1f7ecb306c575..18bcc3a8714c4 100644 --- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py +++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py @@ -1,6 +1,6 @@ - from lmdeploy.serve.openai.api_client import APIClient + api_client = APIClient("http://localhost:8000") model_name = api_client.available_models[0] -print(model_name) \ No newline at end of file +print(model_name) diff --git a/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py b/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py deleted file mode 100644 index 1f7ecb306c575..0000000000000 --- a/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py +++ /dev/null @@ -1,6 +0,0 @@ - -from lmdeploy.serve.openai.api_client import APIClient -api_client = APIClient("http://localhost:8000") -model_name = api_client.available_models[0] - -print(model_name) \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index b12ac211a6c14..64d29ffdc0d5e 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,10 +1,10 @@ [ { - "test_name": "llama8B_tp1_sharegpt", + "test_name": "llama70B_tp4_sharegpt", "qps_list": [1, 16], - "lmdeploy_server_model": "meta-llama/Llama-2-7b-hf", + "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct", "lmdeploy_server_parameters": { - "tp": 1, + "tp": 4, "server_port": 8000 }, "lmdeploy_client_parameters": { @@ -14,12 +14,12 @@ "port": 8000 }, "tgi_server_parameters": { - "model_id": "meta-llama/Llama-2-7b-hf", - "num_shard": 1, + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "num_shard": 4, "port": 8000 }, "tgi_client_parameters": { - "model": "meta-llama/Llama-2-7b-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -27,18 +27,18 @@ "endpoint": "/generate_stream" }, "trt_server_parameters": { - "model_path": "meta-llama/llama-2-7b-chat-hf", + "model_path": "meta-llama/Meta-Llama-3-70B-Instruct", "model_name": "llama-2-7b-chat-hf", "model_type": "llama", "model_dtype": "float16", - "model_tp_size": 1, + "model_tp_size": 4, "max_batch_size": 256, "max_input_len": 10000, "max_output_len": 10000, "trt_llm_version": "r24.04" }, "trt_client_parameters": { - "model": "meta-llama/Llama-2-7b-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -46,13 +46,13 @@ "endpoint": "/v2/models/ensemble/generate_stream" }, "vllm_server_parameters": { - "model": "meta-llama/Llama-2-7b-hf", - "tensor_parallel_size": 1, + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, "disable_log_stats": "", "disable_log_requests": "" }, "vllm_client_parameters": { - "model": "meta-llama/Llama-2-7b-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From f79b6c425a8526fdb7c3452d81adef6be42a0cb3 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 23 Jun 2024 21:59:17 -0700 Subject: [PATCH 040/150] rebuild --- .buildkite/nightly-benchmarks/README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md index 4036b32a46bf7..c84e150934306 100644 --- a/.buildkite/nightly-benchmarks/README.md +++ b/.buildkite/nightly-benchmarks/README.md @@ -1,5 +1,6 @@ # vLLM benchmark suite + ## Introduction This directory contains the performance benchmarking CI for vllm. From dfb77f436756b93bcb8952d8ed285dc694c644a2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 23 Jun 2024 22:32:16 -0700 Subject: [PATCH 041/150] use mixtral model to prevent disk quota exceeded --- .../tests/nightly-tests.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 64d29ffdc0d5e..98ea73fd87371 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,10 +1,10 @@ [ { - "test_name": "llama70B_tp4_sharegpt", + "test_name": "mixtral8x7B_tp2_sharegpt", "qps_list": [1, 16], - "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct", + "lmdeploy_server_model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "lmdeploy_server_parameters": { - "tp": 4, + "tp": 2, "server_port": 8000 }, "lmdeploy_client_parameters": { @@ -14,12 +14,12 @@ "port": 8000 }, "tgi_server_parameters": { - "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", - "num_shard": 4, + "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "num_shard": 2, "port": 8000 }, "tgi_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -27,18 +27,18 @@ "endpoint": "/generate_stream" }, "trt_server_parameters": { - "model_path": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", "model_name": "llama-2-7b-chat-hf", "model_type": "llama", "model_dtype": "float16", - "model_tp_size": 4, + "model_tp_size": 2, "max_batch_size": 256, "max_input_len": 10000, "max_output_len": 10000, "trt_llm_version": "r24.04" }, "trt_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -46,13 +46,13 @@ "endpoint": "/v2/models/ensemble/generate_stream" }, "vllm_server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "tensor_parallel_size": 4, + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tensor_parallel_size": 2, "disable_log_stats": "", "disable_log_requests": "" }, "vllm_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From d2e4171f2a343794a469b3cc0fd122b44378d92c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 23 Jun 2024 23:25:46 -0700 Subject: [PATCH 042/150] remove wait --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 4 ---- 1 file changed, 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 536ad4b0b5235..47505ff64e0fb 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -30,7 +30,6 @@ steps: - name: devshm emptyDir: medium: Memory - - wait - label: "A100 vllm benchmark" agents: queue: A100 @@ -62,7 +61,6 @@ steps: - name: devshm emptyDir: medium: Memory - - wait - label: "A100 tgi benchmark" agents: queue: A100 @@ -94,7 +92,6 @@ steps: - name: devshm emptyDir: medium: Memory - - wait - label: "A100 lmdeploy benchmark" agents: queue: A100 @@ -126,5 +123,4 @@ steps: - name: devshm emptyDir: medium: Memory - - wait \ No newline at end of file From dc5219567946942834ed3db15b55357fc30f91e0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 11:04:09 -0700 Subject: [PATCH 043/150] temporarily remove trt pipeline --- disk quota exceeded --- .../nightly-benchmarks/nightly-pipeline.yaml | 62 +++++++++---------- 1 file changed, 31 insertions(+), 31 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 47505ff64e0fb..8dace4b605889 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,35 +1,35 @@ steps: - - label: "A100 trt benchmark" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory + # - label: "A100 trt benchmark" + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + # command: + # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory - label: "A100 vllm benchmark" agents: queue: A100 From 8ffd8b15144e4a6b462c143d3436f1e96d07eac0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 12:13:51 -0700 Subject: [PATCH 044/150] fall back to 70B and test the storage required --- .../tests/nightly-tests.json | 24 +++++++++---------- 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 98ea73fd87371..aeeb0a609c73d 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,10 +1,10 @@ [ { - "test_name": "mixtral8x7B_tp2_sharegpt", + "test_name": "llama70B_tp4", "qps_list": [1, 16], - "lmdeploy_server_model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct", "lmdeploy_server_parameters": { - "tp": 2, + "tp": 4, "server_port": 8000 }, "lmdeploy_client_parameters": { @@ -14,12 +14,12 @@ "port": 8000 }, "tgi_server_parameters": { - "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "num_shard": 2, + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "num_shard": 4, "port": 8000 }, "tgi_client_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -27,18 +27,18 @@ "endpoint": "/generate_stream" }, "trt_server_parameters": { - "model_path": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model_path": "meta-llama/Meta-Llama-3-70B-Instruct", "model_name": "llama-2-7b-chat-hf", "model_type": "llama", "model_dtype": "float16", - "model_tp_size": 2, + "model_tp_size": 4, "max_batch_size": 256, "max_input_len": 10000, "max_output_len": 10000, "trt_llm_version": "r24.04" }, "trt_client_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -46,13 +46,13 @@ "endpoint": "/v2/models/ensemble/generate_stream" }, "vllm_server_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tensor_parallel_size": 2, + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tensor_parallel_size": 4, "disable_log_stats": "", "disable_log_requests": "" }, "vllm_client_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From 313f54f25f57d40bea5e05ed9f1e133c557f5ecc Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 12:15:39 -0700 Subject: [PATCH 045/150] use llama-2 as I do not have llama3 access...) --- .../nightly-benchmarks/tests/nightly-tests.json | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index aeeb0a609c73d..437451d88bc6b 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -2,7 +2,7 @@ { "test_name": "llama70B_tp4", "qps_list": [1, 16], - "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct", + "lmdeploy_server_model": "meta-llama/Llama-2-70b-chat-hf", "lmdeploy_server_parameters": { "tp": 4, "server_port": 8000 @@ -14,12 +14,12 @@ "port": 8000 }, "tgi_server_parameters": { - "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_id": "meta-llama/Llama-2-70b-chat-hf", "num_shard": 4, "port": 8000 }, "tgi_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Llama-2-70b-chat-hf", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -27,7 +27,7 @@ "endpoint": "/generate_stream" }, "trt_server_parameters": { - "model_path": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_path": "meta-llama/Llama-2-70b-chat-hf", "model_name": "llama-2-7b-chat-hf", "model_type": "llama", "model_dtype": "float16", @@ -38,7 +38,7 @@ "trt_llm_version": "r24.04" }, "trt_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Llama-2-70b-chat-hf", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -46,13 +46,13 @@ "endpoint": "/v2/models/ensemble/generate_stream" }, "vllm_server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Llama-2-70b-chat-hf", "tensor_parallel_size": 4, "disable_log_stats": "", "disable_log_requests": "" }, "vllm_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "model": "meta-llama/Llama-2-70b-chat-hf", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From 62b2407df2fc7565eae79078689477f8125441eb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 12:20:23 -0700 Subject: [PATCH 046/150] fix model name --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 437451d88bc6b..e0313a1e32afb 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -28,7 +28,7 @@ }, "trt_server_parameters": { "model_path": "meta-llama/Llama-2-70b-chat-hf", - "model_name": "llama-2-7b-chat-hf", + "model_name": "llama-2-70b-chat-hf", "model_type": "llama", "model_dtype": "float16", "model_tp_size": 4, From 785d246179e2c8a13221065ea1a1f52824959ee2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 17:56:21 -0700 Subject: [PATCH 047/150] try llama 70B --- .../nightly-benchmarks/nightly-pipeline.yaml | 62 +++++++++---------- .../tests/nightly-tests.json | 16 ++--- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 8dace4b605889..47505ff64e0fb 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,35 +1,35 @@ steps: - # - label: "A100 trt benchmark" - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - # command: - # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory + - label: "A100 trt benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + command: + - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory - label: "A100 vllm benchmark" agents: queue: A100 diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index e0313a1e32afb..6515e73de76e2 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -2,7 +2,7 @@ { "test_name": "llama70B_tp4", "qps_list": [1, 16], - "lmdeploy_server_model": "meta-llama/Llama-2-70b-chat-hf", + "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct", "lmdeploy_server_parameters": { "tp": 4, "server_port": 8000 @@ -14,12 +14,12 @@ "port": 8000 }, "tgi_server_parameters": { - "model_id": "meta-llama/Llama-2-70b-chat-hf", + "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", "num_shard": 4, "port": 8000 }, "tgi_client_parameters": { - "model": "meta-llama/Llama-2-70b-chat-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -27,8 +27,8 @@ "endpoint": "/generate_stream" }, "trt_server_parameters": { - "model_path": "meta-llama/Llama-2-70b-chat-hf", - "model_name": "llama-2-70b-chat-hf", + "model_path": "meta-llama/Meta-Llama-3-70B-Instruct", + "model_name": "Meta-Llama-3-70B-Instruct", "model_type": "llama", "model_dtype": "float16", "model_tp_size": 4, @@ -38,7 +38,7 @@ "trt_llm_version": "r24.04" }, "trt_client_parameters": { - "model": "meta-llama/Llama-2-70b-chat-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, @@ -46,13 +46,13 @@ "endpoint": "/v2/models/ensemble/generate_stream" }, "vllm_server_parameters": { - "model": "meta-llama/Llama-2-70b-chat-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "tensor_parallel_size": 4, "disable_log_stats": "", "disable_log_requests": "" }, "vllm_client_parameters": { - "model": "meta-llama/Llama-2-70b-chat-hf", + "model": "meta-llama/Meta-Llama-3-70B-Instruct", "backend": "vllm", "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From 7fd891ec72b7f6b97aa31b788eb8f4ba340670d6 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 18:37:47 -0700 Subject: [PATCH 048/150] check file system size --- .buildkite/nightly-benchmarks/run-nightly-suite.sh | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 4f09c9e8423f8..1eab836b5503c 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -34,6 +34,8 @@ main() { check_gpus check_hf_token + df -h + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) cd / From 52cf795b0505f3bdc23cf2ebaec84d3e23c1f4f2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 22:26:04 -0700 Subject: [PATCH 049/150] update code for removing cache --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 1 + .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 1 + .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 ++ .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 1 + 4 files changed, 5 insertions(+) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 559c92d4eebf4..7e9398c892c3d 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -163,6 +163,7 @@ run_serving_tests() { # clean up kill_gpu_processes + rm -rf /root/.cache/huggingface/* done } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 23cbafa5c2edc..e2a21a17fa56d 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -146,6 +146,7 @@ run_serving_tests() { # clean up kill_gpu_processes + rm -rf /root/.cache/huggingface/* done } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index ec0d99570f08f..f1092b2b3afbf 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -122,6 +122,7 @@ run_trt_server() { --output_dir=${trt_engine_path} cd /tensorrtllm_backend/triton_model_repo + rm -rf ./tensorrt_llm/1/* cp -r ${trt_engine_path}/* ./tensorrt_llm/1 cd /tensorrtllm_backend python3 scripts/launch_triton_server.py \ @@ -235,6 +236,7 @@ run_serving_tests() { # clean up kill_gpu_processes + rm -rf /root/.cache/huggingface/* done } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 021a8dc170d1c..492f03ddd1cb5 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -165,6 +165,7 @@ run_serving_tests() { # clean up kill_gpu_processes + rm -rf /root/.cache/huggingface/* done } From b8d1c9432daa00575816be5ce2aaebbddc7e7a8a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 23:03:49 -0700 Subject: [PATCH 050/150] merge common parameters --- .../scripts/run-lmdeploy-nightly.sh | 25 +++++++++--- .../scripts/run-tgi-nightly.sh | 28 +++++++++++-- .../scripts/run-trt-nightly.sh | 36 ++++++++++++----- .../scripts/run-vllm-nightly.sh | 28 ++++++++----- .../tests/nightly-tests.json | 40 +++++-------------- 5 files changed, 98 insertions(+), 59 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 7e9398c892c3d..b7224cd790d90 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -72,13 +72,21 @@ run_serving_tests() { # append lmdeploy to the test name test_name=lmdeploy_$test_name + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') # get client and server arguments server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters') client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters') - model=$(echo "$params" | jq -r '.lmdeploy_server_model') server_args=$(json2args "$server_params") client_args=$(json2args "$client_params") qps_list=$(echo "$params" | jq -r '.qps_list') @@ -86,7 +94,6 @@ run_serving_tests() { echo "Running over qps list $qps_list" # check if there is enough GPU to run the test - tp=$(echo "$server_params" | jq -r '.tp') if [[ $gpu_count -lt $tp ]]; then echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue @@ -99,7 +106,10 @@ run_serving_tests() { --model "$model" \ --cachedir /tokenizer_cache - server_command="lmdeploy serve api_server $model $server_args" + server_command="lmdeploy serve api_server $model \ + --tp $tp \ + --server-port $port \ + $server_args" # run the server echo "Running test case $test_name" @@ -110,10 +120,11 @@ run_serving_tests() { wait_for_server if [ $? -eq 0 ]; then echo "" - echo "vllm server is up and running." + echo "lmdeploy server is up and running." else echo "" - echo "vllm failed to start within the timeout period." + echo "lmdeploy failed to start within the timeout period." + continue fi # get model name @@ -133,6 +144,10 @@ run_serving_tests() { client_command="python3 benchmark_serving.py \ --backend lmdeploy \ --tokenizer /tokenizer_cache \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index e2a21a17fa56d..06ceb14b97fe6 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -61,8 +61,7 @@ run_serving_tests() { jq -c '.[]' "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') - # append tgi to the test name - test_name=tgi_$test_name + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then @@ -70,6 +69,18 @@ run_serving_tests() { continue fi + # append tgi to the test name + test_name=tgi_$test_name + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + # get client and server arguments server_params=$(echo "$params" | jq -r '.tgi_server_parameters') client_params=$(echo "$params" | jq -r '.tgi_client_parameters') @@ -80,14 +91,17 @@ run_serving_tests() { echo "Running over qps list $qps_list" # check if there is enough GPU to run the test - tp=$(echo "$server_params" | jq -r '.num_shard') if [[ $gpu_count -lt $tp ]]; then echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi - server_command="/tgi-entrypoint.sh $server_args" + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" # run the server echo "Running test case $test_name" @@ -102,6 +116,7 @@ run_serving_tests() { else echo "" echo "tgi failed to start within the timeout period." + continue fi # iterate over different QPS @@ -117,6 +132,11 @@ run_serving_tests() { client_command="python3 benchmark_serving.py \ --backend tgi \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index f1092b2b3afbf..742011cd587f7 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -54,12 +54,15 @@ wait_for_server() { run_trt_server() { params=$1 + common_params=$2 - model_name=$(echo "$params" | jq -r '.model_name') - model_path=$(echo "$params" | jq -r '.model_path') + + + model_path=$(echo "$common_params" | jq -r '.model') + model_name="${model_path#*/}" model_type=$(echo "$params" | jq -r '.model_type') model_dtype=$(echo "$params" | jq -r '.model_dtype') - model_tp_size=$(echo "$params" | jq -r '.model_tp_size') + model_tp_size=$(echo "$common_params" | jq -r '.tp') max_batch_size=$(echo "$params" | jq -r '.max_batch_size') max_input_len=$(echo "$params" | jq -r '.max_input_len') max_output_len=$(echo "$params" | jq -r '.max_output_len') @@ -141,26 +144,34 @@ run_serving_tests() { jq -c '.[]' "$serving_test_file" | while read -r params; do # get the test name, and append the GPU type back to it. test_name=$(echo "$params" | jq -r '.test_name') - # append trt to the test name - test_name=trt_$test_name - + # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." continue fi + # append trt to the test name + test_name=trt_$test_name + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') + # get client and server arguments server_params=$(echo "$params" | jq -r '.trt_server_parameters') client_params=$(echo "$params" | jq -r '.trt_client_parameters') - model=$(echo "$client_params" | jq -r '.model') client_args=$(json2args "$client_params") qps_list=$(echo "$params" | jq -r '.qps_list') qps_list=$(echo "$qps_list" | jq -r '.[] | @sh') echo "Running over qps list $qps_list" # check if there is enough GPU to run the test - tp=$(echo "$server_params" | jq -r '.model_tp_size') if [[ $gpu_count -lt $tp ]]; then echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue @@ -178,7 +189,7 @@ run_serving_tests() { # run the server echo "Running test case $test_name" - run_trt_server "$server_params" + run_trt_server "$server_params" "$common_params" # wait until the server is alive wait_for_server @@ -188,6 +199,7 @@ run_serving_tests() { else echo "" echo "trt failed to start within the timeout period." + continue fi # go back to vllm benchmarking directory @@ -207,6 +219,11 @@ run_serving_tests() { client_command="python3 benchmark_serving.py \ --backend tensorrt-llm \ --tokenizer /tokenizer_cache \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ @@ -218,6 +235,7 @@ run_serving_tests() { eval "$client_command" + server_command="" # record the benchmarking commands jq_output=$(jq -n \ --arg server "$server_command" \ diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 492f03ddd1cb5..970680d7293e1 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -77,7 +77,15 @@ run_serving_tests() { # append vllm to the test name test_name=vllm_$test_name - + + # get common parameters + common_params=$(echo "$params" | jq -r '.common_parameters') + model=$(echo "$common_params" | jq -r '.model') + tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + dataset_name=$(echo "$common_params" | jq -r '.dataset_name') + dataset_path=$(echo "$common_params" | jq -r '.dataset_path') + port=$(echo "$common_params" | jq -r '.port') + num_prompts=$(echo "$common_params" | jq -r '.num_prompts') # get client and server arguments server_params=$(echo "$params" | jq -r '.vllm_server_parameters') @@ -89,23 +97,17 @@ run_serving_tests() { echo "Running over qps list $qps_list" # check if there is enough GPU to run the test - tp=$(echo "$server_params" | jq -r '.tensor_parallel_size') if [[ $gpu_count -lt $tp ]]; then echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name." continue fi - # check if server model and client model is aligned - server_model=$(echo "$server_params" | jq -r '.model') - client_model=$(echo "$client_params" | jq -r '.model') - if [[ $server_model != "$client_model" ]]; then - echo "Server model and client model must be the same. Skip testcase $test_name." - continue - fi - server_command="python3 \ -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ $server_args" # run the server @@ -121,6 +123,7 @@ run_serving_tests() { else echo "" echo "vllm failed to start within the timeout period." + continue fi # iterate over different QPS @@ -136,6 +139,11 @@ run_serving_tests() { client_command="python3 benchmark_serving.py \ --backend vllm \ + --model $model \ + --dataset-name $dataset_name \ + --dataset-path $dataset_path \ + --num-prompts $num_prompts \ + --port $port \ --save-result \ --result-dir $RESULTS_FOLDER \ --result-filename ${new_test_name}.json \ diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 6515e73de76e2..0e0385bd01204 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,62 +1,40 @@ [ { - "test_name": "llama70B_tp4", - "qps_list": [1, 16], - "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct", - "lmdeploy_server_parameters": { - "tp": 4, - "server_port": 8000 - }, - "lmdeploy_client_parameters": { + "test_name": "llama8B_tp1", + "qps_list": [1, 8], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 200, "port": 8000 }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, "tgi_server_parameters": { - "model_id": "meta-llama/Meta-Llama-3-70B-Instruct", - "num_shard": 4, - "port": 8000 }, "tgi_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "port": 8000, "endpoint": "/generate_stream" }, "trt_server_parameters": { - "model_path": "meta-llama/Meta-Llama-3-70B-Instruct", - "model_name": "Meta-Llama-3-70B-Instruct", "model_type": "llama", "model_dtype": "float16", - "model_tp_size": 4, "max_batch_size": 256, "max_input_len": 10000, "max_output_len": 10000, "trt_llm_version": "r24.04" }, "trt_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, - "port": 8000, "endpoint": "/v2/models/ensemble/generate_stream" }, "vllm_server_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "tensor_parallel_size": 4, "disable_log_stats": "", "disable_log_requests": "" }, "vllm_client_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "backend": "vllm", - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200 } } ] \ No newline at end of file From 14fb6500b0190823a98c6e9e4d7e874ccdb7d9b9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 23:19:28 -0700 Subject: [PATCH 051/150] fix typo --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index b7224cd790d90..94cf83f2f4641 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -76,7 +76,7 @@ run_serving_tests() { # get common parameters common_params=$(echo "$params" | jq -r '.common_parameters') model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + tp=$(echo "$common_params" | jq -r '.tp') dataset_name=$(echo "$common_params" | jq -r '.dataset_name') dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 06ceb14b97fe6..a9e31219e1955 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -75,7 +75,7 @@ run_serving_tests() { # get common parameters common_params=$(echo "$params" | jq -r '.common_parameters') model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + tp=$(echo "$common_params" | jq -r '.tp') dataset_name=$(echo "$common_params" | jq -r '.dataset_name') dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 742011cd587f7..65b0706b6e426 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -157,7 +157,7 @@ run_serving_tests() { # get common parameters common_params=$(echo "$params" | jq -r '.common_parameters') model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + tp=$(echo "$common_params" | jq -r '.tp') dataset_name=$(echo "$common_params" | jq -r '.dataset_name') dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 970680d7293e1..232743b36b5b4 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -81,7 +81,7 @@ run_serving_tests() { # get common parameters common_params=$(echo "$params" | jq -r '.common_parameters') model=$(echo "$common_params" | jq -r '.model') - tp=$(echo "$common_params" | jq -r '.tensor_parallel_size') + tp=$(echo "$common_params" | jq -r '.tp') dataset_name=$(echo "$common_params" | jq -r '.dataset_name') dataset_path=$(echo "$common_params" | jq -r '.dataset_path') port=$(echo "$common_params" | jq -r '.port') From 3aba28a9823445d063f52b5cc08e2f7b395da861 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 24 Jun 2024 23:24:30 -0700 Subject: [PATCH 052/150] reduce qps to 8, just for testing --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 0e0385bd01204..7422eac46b482 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,7 +1,7 @@ [ { "test_name": "llama8B_tp1", - "qps_list": [1, 8], + "qps_list": [8], "common_parameters": { "model": "meta-llama/Meta-Llama-3-8B", "tp": 1, From 733ac33c60cb47e5f2f016c5a10e6e26d0ce1115 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 00:03:49 -0700 Subject: [PATCH 053/150] append to the same context --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 94cf83f2f4641..1ce24c389de70 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -191,7 +191,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "lmdeploy-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index a9e31219e1955..099aa94549cbd 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -180,7 +180,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "tgi-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 65b0706b6e426..77340a1e15098 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -266,7 +266,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "trt-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 232743b36b5b4..163f7e10b44e7 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -186,7 +186,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "vllm-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } From 5e1ec4b727628d89fab3928f744f5728d635638d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 00:16:50 -0700 Subject: [PATCH 054/150] optimize for buildkite annotation --- .../nightly-benchmarks/nightly-descriptions.md | 18 ++++++++++++++++++ .../nightly-benchmarks/run-nightly-suite.sh | 8 ++++++++ .../scripts/run-lmdeploy-nightly.sh | 2 +- .../scripts/run-tgi-nightly.sh | 2 +- .../scripts/run-trt-nightly.sh | 2 +- .../scripts/run-vllm-nightly.sh | 2 +- 6 files changed, 30 insertions(+), 4 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/nightly-descriptions.md diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md new file mode 100644 index 0000000000000..e382433be8488 --- /dev/null +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -0,0 +1,18 @@ + +# Nightly benchmark + +The main goal of this benchmarking is two-fold: +- Performance clarity: Provide clarity on which one (vllm, tensorrt-llm, lmdeploy and tgi) leads in performance in what workload. +- Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). + + +## Workload description + +We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: + +- Input length: randomly sample 1000 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 1000 prompts. +- Batch size: dynamically determined by vllm and the arrival pattern of the requests. +- Average QPS (query per second): 4, 8 for 8B model and 1, 4 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 1eab836b5503c..74f02e3035728 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -45,6 +45,14 @@ main() { cd benchmarks wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + else + /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md + fi + + # run lmdeploy if which lmdeploy >/dev/null; then echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh" diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 1ce24c389de70..7a3c06451ea7b 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -191,7 +191,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 099aa94549cbd..48aa56a2799c4 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -180,7 +180,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 77340a1e15098..b9db6ac4dff5b 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -266,7 +266,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 163f7e10b44e7..ab3df9def635f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -186,7 +186,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } From 24231061a00cda9eb76cc866741c545415345b83 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 00:17:05 -0700 Subject: [PATCH 055/150] add double enter for md table --- .../nightly-benchmarks/scripts/summary-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index ced57295f735e..c12ae985518fa 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -65,7 +65,7 @@ # document benchmarking results in markdown with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: f.write(serving_md_table) - f.write('\n') + f.write('\n\n') # document benchmarking results in json with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: From d3f970184dbd80e1f1f683c752d756fe68c138d9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 00:31:16 -0700 Subject: [PATCH 056/150] format adjust for markdown presentation --- .buildkite/nightly-benchmarks/nightly-results-header.md | 2 ++ .buildkite/nightly-benchmarks/run-nightly-suite.sh | 1 + .../scripts/summary-nightly-results.py | 9 ++++++--- 3 files changed, 9 insertions(+), 3 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/nightly-results-header.md diff --git a/.buildkite/nightly-benchmarks/nightly-results-header.md b/.buildkite/nightly-benchmarks/nightly-results-header.md new file mode 100644 index 0000000000000..fedbd9e29fce8 --- /dev/null +++ b/.buildkite/nightly-benchmarks/nightly-results-header.md @@ -0,0 +1,2 @@ +| Test name | GPU | Successful req. | Tput (req/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) | Engine | +|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------| diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 74f02e3035728..e07367ef9653e 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -50,6 +50,7 @@ main() { return 0 else /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < /vllm/.buildkite/nightly-benchmarks/nightly-results-header.md fi diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index c12ae985518fa..d25a97e47d409 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -55,17 +55,20 @@ serving_column_mapping.keys())].rename( columns=serving_column_mapping) - serving_md_table = tabulate(serving_results, + serving_md_table_with_headers = tabulate(serving_results, headers='keys', tablefmt='pipe', showindex=False) + # remove the first line of header + serving_md_table_lines = serving_md_table_with_headers.split('\n') + serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE") # document benchmarking results in markdown with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: - f.write(serving_md_table) - f.write('\n\n') + f.write(serving_md_table_without_header) + f.write('\n') # document benchmarking results in json with open(results_folder / f"{prefix}_nightly_results.json", "w") as f: From c4d651b654f27e52cd14aacf0f5660b2f700ea96 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 00:46:29 -0700 Subject: [PATCH 057/150] move header to the description file --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 6 ++++++ .buildkite/nightly-benchmarks/nightly-results-header.md | 2 -- .buildkite/nightly-benchmarks/run-nightly-suite.sh | 1 - 3 files changed, 6 insertions(+), 3 deletions(-) delete mode 100644 .buildkite/nightly-benchmarks/nightly-results-header.md diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index e382433be8488..edcbeb8db10c4 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -16,3 +16,9 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Average QPS (query per second): 4, 8 for 8B model and 1, 4 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). + + +## Results + +| Test name | GPU | Successful req. | Tput (req/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) | Engine | +|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------| diff --git a/.buildkite/nightly-benchmarks/nightly-results-header.md b/.buildkite/nightly-benchmarks/nightly-results-header.md deleted file mode 100644 index fedbd9e29fce8..0000000000000 --- a/.buildkite/nightly-benchmarks/nightly-results-header.md +++ /dev/null @@ -1,2 +0,0 @@ -| Test name | GPU | Successful req. | Tput (req/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) | Engine | -|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------| diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index e07367ef9653e..74f02e3035728 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -50,7 +50,6 @@ main() { return 0 else /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < /vllm/.buildkite/nightly-benchmarks/nightly-results-header.md fi From 25c5a2f694945c41aab0023e45b0fb2bed7f8743 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 01:08:08 -0700 Subject: [PATCH 058/150] separate annotation to a new step --- .../nightly-benchmarks/run-nightly-suite.sh | 7 ------ .../scripts/nightly-annotate.sh | 24 +++++++++++++++++++ 2 files changed, 24 insertions(+), 7 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index 74f02e3035728..e608211391b93 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -44,13 +44,6 @@ main() { git checkout kuntai-benchmark-dev cd benchmarks wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json - - if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - else - /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md - fi # run lmdeploy diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh new file mode 100644 index 0000000000000..78dc66a273ed7 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -0,0 +1,24 @@ +#!/bin/bash + +set -ex +set -o pipefail + +main() { + + (which wget && which curl) || (apt-get update && apt-get install -y wget curl) + (which jq) || (apt-get update && apt-get -y install jq) + cd / + git clone https://github.com/KuntaiDu/vllm.git + cd vllm + git checkout kuntai-benchmark-dev + + if [ ! -f /workspace/buildkite-agent ]; then + echo "buildkite-agent binary not found. Skip uploading the results." + return 0 + else + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md + fi + +} + +main "$@" \ No newline at end of file From 5183fea491cd9dbf9ffa24b355c594a8b2b30ab5 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 01:09:09 -0700 Subject: [PATCH 059/150] add extra step to annotate pipeline --- .../nightly-benchmarks/nightly-pipeline.yaml | 31 +++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 47505ff64e0fb..872b718f45361 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,4 +1,35 @@ steps: + - label: "Annotate" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: vllm/vllm-openai:latest + command: + - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh) && (bash nightly-annotate.sh) + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory - label: "A100 trt benchmark" agents: queue: A100 From f0684af2ebba21545cc6da099cfea6359843d503 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 01:09:52 -0700 Subject: [PATCH 060/150] add wait --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 872b718f45361..0f72c979bbf32 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -30,6 +30,7 @@ steps: - name: devshm emptyDir: medium: Memory + - wait - label: "A100 trt benchmark" agents: queue: A100 From 6c7ddf89db96cf78fbeabcf73c31151b3454dc27 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 01:30:49 -0700 Subject: [PATCH 061/150] bring back the full test case --- .../scripts/run-lmdeploy-nightly.sh | 2 +- .../scripts/run-tgi-nightly.sh | 4 +- .../scripts/run-trt-nightly.sh | 4 +- .../scripts/run-vllm-nightly.sh | 4 +- .../tests/nightly-tests.json | 82 ++++++++++++++++++- 5 files changed, 86 insertions(+), 10 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 7a3c06451ea7b..de5a4e1231070 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -124,7 +124,7 @@ run_serving_tests() { else echo "" echo "lmdeploy failed to start within the timeout period." - continue + exit 0 fi # get model name diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 48aa56a2799c4..38ca9e15b260c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -66,7 +66,7 @@ run_serving_tests() { # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." - continue + exit 0 fi # append tgi to the test name @@ -116,7 +116,7 @@ run_serving_tests() { else echo "" echo "tgi failed to start within the timeout period." - continue + exit 0 fi # iterate over different QPS diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index b9db6ac4dff5b..1542ac202bf59 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -148,7 +148,7 @@ run_serving_tests() { # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." - continue + exit 0 fi # append trt to the test name @@ -199,7 +199,7 @@ run_serving_tests() { else echo "" echo "trt failed to start within the timeout period." - continue + exit 0 fi # go back to vllm benchmarking directory diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index ab3df9def635f..86ab0105647e6 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -71,7 +71,7 @@ run_serving_tests() { # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." - continue + exit 0 fi # append vllm to the test name @@ -123,7 +123,7 @@ run_serving_tests() { else echo "" echo "vllm failed to start within the timeout period." - continue + exit 0 fi # iterate over different QPS diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 7422eac46b482..349ba6817452a 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,13 +1,13 @@ [ { "test_name": "llama8B_tp1", - "qps_list": [8], + "qps_list": [4,8], "common_parameters": { "model": "meta-llama/Meta-Llama-3-8B", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 200, + "num_prompts": 1000, "port": 8000 }, "lmdeploy_server_parameters": { @@ -36,5 +36,81 @@ }, "vllm_client_parameters": { } - } + }, + { + "test_name": "mixtral8x7B_tp2", + "qps_list": [2,4], + "common_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tp": 2, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "mixtral", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 10000, + "max_output_len": 10000, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4", + "qps_list": [2,4], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 1000, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 10000, + "max_output_len": 10000, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, ] \ No newline at end of file From 13f5d99cf9de90fcbd365cea588691551baad2bc Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 02:03:32 -0700 Subject: [PATCH 062/150] fix syntax error --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 349ba6817452a..573bd03114c64 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -112,5 +112,5 @@ }, "vllm_client_parameters": { } - }, + } ] \ No newline at end of file From 11079c74ba21138a5bfca127d54c40743ba12dda Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 11:33:07 -0700 Subject: [PATCH 063/150] break when the server failed to start --- so that the buildkite uploading still works --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 3 +-- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 3 +-- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 3 +-- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 3 +-- 4 files changed, 4 insertions(+), 8 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index de5a4e1231070..6606af030ee2f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -ex set -o pipefail check_gpus() { @@ -124,7 +123,7 @@ run_serving_tests() { else echo "" echo "lmdeploy failed to start within the timeout period." - exit 0 + break fi # get model name diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 38ca9e15b260c..edc5ea8573319 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -ex set -o pipefail check_gpus() { @@ -116,7 +115,7 @@ run_serving_tests() { else echo "" echo "tgi failed to start within the timeout period." - exit 0 + break fi # iterate over different QPS diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 1542ac202bf59..f6c8375e0ca76 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -ex set -o pipefail check_gpus() { @@ -199,7 +198,7 @@ run_serving_tests() { else echo "" echo "trt failed to start within the timeout period." - exit 0 + break fi # go back to vllm benchmarking directory diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 86ab0105647e6..3e79f4b24aa19 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -ex set -o pipefail check_gpus() { @@ -123,7 +122,7 @@ run_serving_tests() { else echo "" echo "vllm failed to start within the timeout period." - exit 0 + break fi # iterate over different QPS From 0ed8131d003a607a4c22daa4e6d2114fb141621b Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 11:34:28 -0700 Subject: [PATCH 064/150] make yapf happy --- .../nightly-benchmarks/scripts/summary-nightly-results.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index d25a97e47d409..640e0bfdaa1f7 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -56,9 +56,9 @@ columns=serving_column_mapping) serving_md_table_with_headers = tabulate(serving_results, - headers='keys', - tablefmt='pipe', - showindex=False) + headers='keys', + tablefmt='pipe', + showindex=False) # remove the first line of header serving_md_table_lines = serving_md_table_with_headers.split('\n') serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) From fae306ec7d64d146304987a7a53bf27c924a1be2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 18:44:31 -0700 Subject: [PATCH 065/150] test vllm code --- .../nightly-benchmarks/nightly-pipeline.yaml | 250 +++++++++--------- .../scripts/nightly-annotate.sh | 5 + 2 files changed, 130 insertions(+), 125 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 0f72c979bbf32..a424f2b99d1c9 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -30,129 +30,129 @@ steps: - name: devshm emptyDir: medium: Memory - - wait - - label: "A100 trt benchmark" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 vllm benchmark" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: vllm/vllm-openai:latest - command: - - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 tgi benchmark" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: ghcr.io/huggingface/text-generation-inference:2.0 - command: - - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 lmdeploy benchmark" - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: openmmlab/lmdeploy:latest - command: - - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory + # - wait + # - label: "A100 trt benchmark" + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + # command: + # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 vllm benchmark" + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: vllm/vllm-openai:latest + # command: + # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 tgi benchmark" + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.0 + # command: + # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 lmdeploy benchmark" + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: openmmlab/lmdeploy:latest + # command: + # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 78dc66a273ed7..b9ff5eb146bb9 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -7,6 +7,11 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) + cd /workspace + ls + cd ./vllm + ls + exit 0 cd / git clone https://github.com/KuntaiDu/vllm.git cd vllm From 5098e1079ceb1929e63b2cf70071b9e959468792 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 20:44:12 -0700 Subject: [PATCH 066/150] check if mounting is successfull --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 3 +++ .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 3 +-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index a424f2b99d1c9..51f95ab6a19d5 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -30,6 +30,9 @@ steps: - name: devshm emptyDir: medium: Memory + - name: nvme-raid + hostpath: + path: /mnt/fast-disks/nvme-raid # - wait # - label: "A100 trt benchmark" # agents: diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index b9ff5eb146bb9..50ed0931ef218 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -7,10 +7,9 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) + df -h cd /workspace ls - cd ./vllm - ls exit 0 cd / git clone https://github.com/KuntaiDu/vllm.git From b0e766712df5ec80cb2c78c11549de3a2e344327 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 21:41:40 -0700 Subject: [PATCH 067/150] add pwd --- .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 50ed0931ef218..44f9f996ab9c9 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -7,6 +7,7 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) + pwd df -h cd /workspace ls From 4034f5f9a41b7bf0e9e6683fbea5285b5628a766 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 21:51:08 -0700 Subject: [PATCH 068/150] add ls --- .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 44f9f996ab9c9..3a2704c8c8187 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -8,6 +8,7 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) pwd + ls df -h cd /workspace ls From 4ab2ecacaddb5c832d9a77d97fd9d61c793d53a5 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 25 Jun 2024 23:40:56 -0700 Subject: [PATCH 069/150] update to read code from the docker, instead of running wget --- .../nightly-benchmarks/nightly-pipeline.yaml | 265 +++++++++--------- .../nightly-benchmarks/run-nightly-suite.sh | 7 +- .../scripts/nightly-annotate.sh | 11 +- .../scripts/run-lmdeploy-nightly.sh | 2 +- .../scripts/run-tgi-nightly.sh | 3 +- .../scripts/run-trt-nightly.sh | 6 +- .../scripts/run-vllm-nightly.sh | 3 +- 7 files changed, 149 insertions(+), 148 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 51f95ab6a19d5..0ba924aa2e272 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -9,16 +9,20 @@ steps: containers: - image: vllm/vllm-openai:latest command: - - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh) && (bash nightly-annotate.sh) + - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh resources: limits: nvidia.com/gpu: 8 volumeMounts: - name: devshm mountPath: /dev/shm + - name: nvme-raid + mountPath: /mnt/fast-disks/nvme-raid env: - name: VLLM_USAGE_SOURCE value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark - name: HF_TOKEN valueFrom: secretKeyRef: @@ -33,129 +37,138 @@ steps: - name: nvme-raid hostpath: path: /mnt/fast-disks/nvme-raid - # - wait - # - label: "A100 trt benchmark" - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - # command: - # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 vllm benchmark" - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: vllm/vllm-openai:latest - # command: - # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 tgi benchmark" - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.0 - # command: - # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 lmdeploy benchmark" - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: openmmlab/lmdeploy:latest - # command: - # - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh) - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory + type: directory + - wait + - label: "A100 trt benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - label: "A100 vllm benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: vllm/vllm-openai:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - label: "A100 tgi benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: ghcr.io/huggingface/text-generation-inference:2.0 + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - label: "A100 lmdeploy benchmark" + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: openmmlab/lmdeploy:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index e608211391b93..a157074287083 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -38,11 +38,8 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) - cd / - git clone https://github.com/KuntaiDu/vllm.git - cd vllm - git checkout kuntai-benchmark-dev - cd benchmarks + + cd $VLLM_SOURCE_CODE_LOC/benchmarks wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 3a2704c8c8187..f8168c92d1cbc 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -3,20 +3,13 @@ set -ex set -o pipefail + main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) - pwd - ls + df -h - cd /workspace - ls - exit 0 - cd / - git clone https://github.com/KuntaiDu/vllm.git - cd vllm - git checkout kuntai-benchmark-dev if [ ! -f /workspace/buildkite-agent ]; then echo "buildkite-agent binary not found. Skip uploading the results." diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 6606af030ee2f..e9c29bbe7de47 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -199,7 +199,7 @@ main() { check_gpus # enter vllm directory - cd /vllm/benchmarks + cd $VLLM_SOURCE_CODE_LOC/benchmarks declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index edc5ea8573319..67f88eee653d9 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -187,8 +187,7 @@ main() { check_gpus # enter vllm directory - cd /vllm/benchmarks - + cd $VLLM_SOURCE_CODE_LOC/benchmarks declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index f6c8375e0ca76..d3abd53bf01ea 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -178,7 +178,7 @@ run_serving_tests() { # prepare tokenizer - cd /vllm/benchmarks + cd $VLLM_SOURCE_CODE_LOC/benchmarks rm -rf /tokenizer_cache mkdir /tokenizer_cache python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ @@ -202,7 +202,7 @@ run_serving_tests() { fi # go back to vllm benchmarking directory - cd /vllm/benchmarks + cd $VLLM_SOURCE_CODE_LOC/benchmarks # iterate over different QPS for qps in $qps_list; do @@ -276,7 +276,7 @@ main() { # enter vllm directory - cd /vllm/benchmarks + cd $VLLM_SOURCE_CODE_LOC/benchmarks declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 3e79f4b24aa19..774e6f3d5cb2c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -193,8 +193,7 @@ main() { check_gpus # enter vllm directory - cd /vllm/benchmarks - + cd $VLLM_SOURCE_CODE_LOC/benchmarks declare -g RESULTS_FOLDER=results/ mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ From d6a34d3c500c8957158d40078739ba43b5ba392c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 26 Jun 2024 00:10:52 -0700 Subject: [PATCH 070/150] remove raid --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 0ba924aa2e272..3d4187b5d8ef8 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -16,8 +16,8 @@ steps: volumeMounts: - name: devshm mountPath: /dev/shm - - name: nvme-raid - mountPath: /mnt/fast-disks/nvme-raid + # - name: nvme-raid + # mountPath: /mnt/fast-disks/nvme-raid env: - name: VLLM_USAGE_SOURCE value: ci-test @@ -34,10 +34,10 @@ steps: - name: devshm emptyDir: medium: Memory - - name: nvme-raid - hostpath: - path: /mnt/fast-disks/nvme-raid - type: directory + # - name: nvme-raid + # hostpath: + # path: /mnt/fast-disks/nvme-raid + # type: directory - wait - label: "A100 trt benchmark" agents: From c57ac0aab9964c975b6660d11af15b10c2fbe585 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 26 Jun 2024 09:54:35 -0700 Subject: [PATCH 071/150] try Roger's fix --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 12 ++++++------ .../nightly-benchmarks/scripts/nightly-annotate.sh | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 3d4187b5d8ef8..241b0dda145b8 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -16,8 +16,8 @@ steps: volumeMounts: - name: devshm mountPath: /dev/shm - # - name: nvme-raid - # mountPath: /mnt/fast-disks/nvme-raid + - name: nvme-raid + mountPath: /mnt/fast-disks/nvme-raid env: - name: VLLM_USAGE_SOURCE value: ci-test @@ -34,10 +34,10 @@ steps: - name: devshm emptyDir: medium: Memory - # - name: nvme-raid - # hostpath: - # path: /mnt/fast-disks/nvme-raid - # type: directory + - name: nvme-raid + hostpath: + path: /mnt/fast-disks/nvme-raid + type: Directory - wait - label: "A100 trt benchmark" agents: diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index f8168c92d1cbc..cee44c3d6eb92 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -15,7 +15,7 @@ main() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 else - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md + /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md fi } From d75d45b943fe2c576383b026baa271fec6aafc68 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 26 Jun 2024 20:04:06 -0700 Subject: [PATCH 072/150] remove nvme raid --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 241b0dda145b8..37845b2802cf1 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -16,8 +16,8 @@ steps: volumeMounts: - name: devshm mountPath: /dev/shm - - name: nvme-raid - mountPath: /mnt/fast-disks/nvme-raid + # - name: nvme-raid + # mountPath: /mnt/fast-disks/nvme-raid env: - name: VLLM_USAGE_SOURCE value: ci-test @@ -34,10 +34,10 @@ steps: - name: devshm emptyDir: medium: Memory - - name: nvme-raid - hostpath: - path: /mnt/fast-disks/nvme-raid - type: Directory + # - name: nvme-raid + # hostpath: + # path: /mnt/fast-disks/nvme-raid + # type: Directory - wait - label: "A100 trt benchmark" agents: From 5dc8c8cccec38f194be19ca43e13ac6c919d4108 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 27 Jun 2024 21:40:58 -0700 Subject: [PATCH 073/150] raise the priority of benchmarking development jobs --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 37845b2802cf1..71edb6d6257ee 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,5 +1,6 @@ steps: - label: "Annotate" + priority: 100 agents: queue: A100 plugins: @@ -16,8 +17,6 @@ steps: volumeMounts: - name: devshm mountPath: /dev/shm - # - name: nvme-raid - # mountPath: /mnt/fast-disks/nvme-raid env: - name: VLLM_USAGE_SOURCE value: ci-test @@ -34,12 +33,9 @@ steps: - name: devshm emptyDir: medium: Memory - # - name: nvme-raid - # hostpath: - # path: /mnt/fast-disks/nvme-raid - # type: Directory - wait - label: "A100 trt benchmark" + priority: 100 agents: queue: A100 plugins: @@ -73,6 +69,7 @@ steps: emptyDir: medium: Memory - label: "A100 vllm benchmark" + priority: 100 agents: queue: A100 plugins: @@ -106,6 +103,7 @@ steps: emptyDir: medium: Memory - label: "A100 tgi benchmark" + priority: 100 agents: queue: A100 plugins: @@ -139,6 +137,7 @@ steps: emptyDir: medium: Memory - label: "A100 lmdeploy benchmark" + priority: 100 agents: queue: A100 plugins: From 8b9192761005c649ae0ef152e439fbfad69bea61 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 27 Jun 2024 22:57:37 -0700 Subject: [PATCH 074/150] reduce the # of test from 1000 to 500, for faster testing --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 573bd03114c64..04f387a6eb4aa 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -7,7 +7,7 @@ "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 1000, + "num_prompts": 500, "port": 8000 }, "lmdeploy_server_parameters": { @@ -45,7 +45,7 @@ "tp": 2, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 1000, + "num_prompts": 500, "port": 8000 }, "lmdeploy_server_parameters": { @@ -83,7 +83,7 @@ "tp": 4, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 1000, + "num_prompts": 500, "port": 8000 }, "lmdeploy_server_parameters": { From 8539874030e657386a64b2bbf636c39cd9893d88 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 27 Jun 2024 23:15:52 -0700 Subject: [PATCH 075/150] trt won't run all the test. Just run llama-3 70B. Fix this bug tomorrow --- .../tests/nightly-tests.json | 76 ------------------- 1 file changed, 76 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 04f387a6eb4aa..31dea7a43c632 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,80 +1,4 @@ [ - { - "test_name": "llama8B_tp1", - "qps_list": [4,8], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", - "tp": 1, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000 - }, - "lmdeploy_server_parameters": { - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, - "max_input_len": 10000, - "max_output_len": 10000, - "trt_llm_version": "r24.04" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "" - }, - "vllm_client_parameters": { - } - }, - { - "test_name": "mixtral8x7B_tp2", - "qps_list": [2,4], - "common_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tp": 2, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000 - }, - "lmdeploy_server_parameters": { - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "mixtral", - "model_dtype": "float16", - "max_batch_size": 256, - "max_input_len": 10000, - "max_output_len": 10000, - "trt_llm_version": "r24.04" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "" - }, - "vllm_client_parameters": { - } - }, { "test_name": "llama70B_tp4", "qps_list": [2,4], From 144328b8c0613e71ddd52bb3347a92eaefe52e86 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 29 Jun 2024 21:23:25 -0700 Subject: [PATCH 076/150] debug tensorrt --- .../nightly-benchmarks/nightly-pipeline.yaml | 204 +++++++++--------- .../scripts/run-trt-nightly.sh | 1 + .../tests/nightly-tests.json | 46 +++- 3 files changed, 145 insertions(+), 106 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 71edb6d6257ee..19f1c6a355dbd 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -68,106 +68,106 @@ steps: - name: devshm emptyDir: medium: Memory - - label: "A100 vllm benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: vllm/vllm-openai:latest - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 tgi benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: ghcr.io/huggingface/text-generation-inference:2.0 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: openmmlab/lmdeploy:latest - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory + # - label: "A100 vllm benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: vllm/vllm-openai:latest + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 tgi benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.0 + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 lmdeploy benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: openmmlab/lmdeploy:latest + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index d3abd53bf01ea..4324d418de9e0 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -1,6 +1,7 @@ #!/bin/bash set -o pipefail +set -ex check_gpus() { # check the number of GPUs and GPU type. diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 31dea7a43c632..c55da0ae70b2f 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,10 +1,48 @@ [ { - "test_name": "llama70B_tp4", - "qps_list": [2,4], + "test_name": "llama8B_tp1", + "qps_list": [4,8], "common_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "tp": 4, + "model": "meta-llama/Meta-Llama-3-8B", + "tp": 1, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 10000, + "max_output_len": 10000, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "llama8B_tp1", + "qps_list": [4,8], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-8B", + "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, From 64e951891acb7e4277efdb995f3a073a7cc372b1 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 30 Jun 2024 23:29:21 -0700 Subject: [PATCH 077/150] bug fix: avoid reassigning params during the for loop --- .../nightly-benchmarks/scripts/run-trt-nightly.sh | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 4324d418de9e0..7c1149f86aa32 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -53,20 +53,20 @@ wait_for_server() { run_trt_server() { - params=$1 + server_params=$1 common_params=$2 model_path=$(echo "$common_params" | jq -r '.model') model_name="${model_path#*/}" - model_type=$(echo "$params" | jq -r '.model_type') - model_dtype=$(echo "$params" | jq -r '.model_dtype') + model_type=$(echo "$server_params" | jq -r '.model_type') + model_dtype=$(echo "$server_params" | jq -r '.model_dtype') model_tp_size=$(echo "$common_params" | jq -r '.tp') - max_batch_size=$(echo "$params" | jq -r '.max_batch_size') - max_input_len=$(echo "$params" | jq -r '.max_input_len') - max_output_len=$(echo "$params" | jq -r '.max_output_len') - trt_llm_version=$(echo "$params" | jq -r '.trt_llm_version') + max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') + max_input_len=$(echo "$server_params" | jq -r '.max_input_len') + max_output_len=$(echo "$server_params" | jq -r '.max_output_len') + trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') cd ~ rm -rf models From a94c1403d3998d036b5418c0b89e80dcced259e7 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sun, 30 Jun 2024 23:30:28 -0700 Subject: [PATCH 078/150] bring lmdeploy back for testing --- .../nightly-benchmarks/nightly-pipeline.yaml | 68 +++++++++---------- 1 file changed, 34 insertions(+), 34 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 19f1c6a355dbd..7dae9c9c59a95 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -136,38 +136,38 @@ steps: # - name: devshm # emptyDir: # medium: Memory - # - label: "A100 lmdeploy benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: openmmlab/lmdeploy:latest - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: openmmlab/lmdeploy:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory \ No newline at end of file From 1f0ccb05ded608be29b170587c741d0339dcdab8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 00:13:46 -0700 Subject: [PATCH 079/150] change test name --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index c55da0ae70b2f..c8e950570cf42 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -38,7 +38,7 @@ } }, { - "test_name": "llama8B_tp1", + "test_name": "llama8B_tp2", "qps_list": [4,8], "common_parameters": { "model": "meta-llama/Meta-Llama-3-8B", From 2f53b96bd44f405ebec098bc3260fb1ddbfead48 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 01:12:48 -0700 Subject: [PATCH 080/150] separating run server command from the bash file --- .../scripts/launch-trt-server.sh | 82 +++++++++++++++++++ .../scripts/run-trt-nightly.sh | 3 +- .../tests/nightly-tests.json | 2 +- 3 files changed, 85 insertions(+), 2 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/launch-trt-server.sh diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh new file mode 100644 index 0000000000000..251ab139c5729 --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -0,0 +1,82 @@ + + +set -x + +server_params=$1 +common_params=$2 + + + +model_path=$(echo "$common_params" | jq -r '.model') +model_name="${model_path#*/}" +model_type=$(echo "$server_params" | jq -r '.model_type') +model_dtype=$(echo "$server_params" | jq -r '.model_dtype') +model_tp_size=$(echo "$common_params" | jq -r '.tp') +max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') +max_input_len=$(echo "$server_params" | jq -r '.max_input_len') +max_output_len=$(echo "$server_params" | jq -r '.max_output_len') +trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') + +cd ~ +rm -rf models +mkdir -p models +cd models +models_dir=$(pwd) +trt_model_path=${models_dir}/${model_name}-trt-ckpt +trt_engine_path=${models_dir}/${model_name}-trt-engine + +cd ~ +rm -rf tensorrt-demo +git clone https://github.com/neuralmagic/tensorrt-demo.git +cd tensorrt-demo +tensorrt_demo_dir=$(pwd) + +# make sure the parameter inside tensorrt_demo is consistent to envvar +sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt +sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt +sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt + + +cd / +rm -rf tensorrtllm_backend +git clone https://github.com/triton-inference-server/tensorrtllm_backend.git +git lfs install +cd tensorrtllm_backend +git checkout $trt_llm_version +tensorrtllm_backend_dir=$(pwd) +git submodule update --init --recursive +cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ + +cd /tensorrtllm_backend +cd ./tensorrt_llm/examples/${model_type} + +python3 convert_checkpoint.py \ +--model_dir ${model_path} \ +--dtype ${model_dtype} \ +--tp_size ${model_tp_size} \ +--output_dir ${trt_model_path} + +trtllm-build \ +--checkpoint_dir=${trt_model_path} \ +--gpt_attention_plugin=${model_dtype} \ +--gemm_plugin=${model_dtype} \ +--remove_input_padding=enable \ +--paged_kv_cache=enable \ +--tp_size=${model_tp_size} \ +--max_batch_size=${max_batch_size} \ +--max_input_len=${max_input_len} \ +--max_output_len=${max_output_len} \ +--max_num_tokens=${max_output_len} \ +--opt_num_tokens=${max_output_len} \ +--output_dir=${trt_engine_path} + +cd /tensorrtllm_backend/triton_model_repo +rm -rf ./tensorrt_llm/1/* +cp -r ${trt_engine_path}/* ./tensorrt_llm/1 +cd /tensorrtllm_backend +python3 scripts/launch_triton_server.py \ +--world_size=${model_tp_size} \ +--model_repo=/tensorrtllm_backend/triton_model_repo & \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 7c1149f86aa32..58002651bd188 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -189,7 +189,8 @@ run_serving_tests() { # run the server echo "Running test case $test_name" - run_trt_server "$server_params" "$common_params" + # run_trt_server "$server_params" "$common_params" + bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" # wait until the server is alive wait_for_server diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index c8e950570cf42..14e1730fd4a90 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -3,7 +3,7 @@ "test_name": "llama8B_tp1", "qps_list": [4,8], "common_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/llama-2-7b-hf", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From 51d679e02876a08aaa713478eaa4b7d6fe0d4990 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 01:21:21 -0700 Subject: [PATCH 081/150] clean up --- .../scripts/launch-trt-server.sh | 2 +- .../scripts/run-trt-nightly.sh | 84 ------------------- 2 files changed, 1 insertion(+), 85 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh index 251ab139c5729..d9108ef3a3168 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -1,4 +1,4 @@ - +#!/bin/bash set -x diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 58002651bd188..22363dbc25dcb 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -50,89 +50,6 @@ wait_for_server() { done' && return 0 || return 1 } - -run_trt_server() { - - server_params=$1 - common_params=$2 - - - - model_path=$(echo "$common_params" | jq -r '.model') - model_name="${model_path#*/}" - model_type=$(echo "$server_params" | jq -r '.model_type') - model_dtype=$(echo "$server_params" | jq -r '.model_dtype') - model_tp_size=$(echo "$common_params" | jq -r '.tp') - max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size') - max_input_len=$(echo "$server_params" | jq -r '.max_input_len') - max_output_len=$(echo "$server_params" | jq -r '.max_output_len') - trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version') - - cd ~ - rm -rf models - mkdir -p models - cd models - models_dir=$(pwd) - trt_model_path=${models_dir}/${model_name}-trt-ckpt - trt_engine_path=${models_dir}/${model_name}-trt-engine - - cd ~ - rm -rf tensorrt-demo - git clone https://github.com/neuralmagic/tensorrt-demo.git - cd tensorrt-demo - tensorrt_demo_dir=$(pwd) - - # make sure the parameter inside tensorrt_demo is consistent to envvar - sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt - sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt - sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt - sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt - sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt - sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt - - - cd / - rm -rf tensorrtllm_backend - git clone https://github.com/triton-inference-server/tensorrtllm_backend.git - git lfs install - cd tensorrtllm_backend - git checkout $trt_llm_version - tensorrtllm_backend_dir=$(pwd) - git submodule update --init --recursive - cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ - - cd /tensorrtllm_backend - cd ./tensorrt_llm/examples/${model_type} - - python3 convert_checkpoint.py \ - --model_dir ${model_path} \ - --dtype ${model_dtype} \ - --tp_size ${model_tp_size} \ - --output_dir ${trt_model_path} - - trtllm-build \ - --checkpoint_dir=${trt_model_path} \ - --gpt_attention_plugin=${model_dtype} \ - --gemm_plugin=${model_dtype} \ - --remove_input_padding=enable \ - --paged_kv_cache=enable \ - --tp_size=${model_tp_size} \ - --max_batch_size=${max_batch_size} \ - --max_input_len=${max_input_len} \ - --max_output_len=${max_output_len} \ - --max_num_tokens=${max_output_len} \ - --opt_num_tokens=${max_output_len} \ - --output_dir=${trt_engine_path} - - cd /tensorrtllm_backend/triton_model_repo - rm -rf ./tensorrt_llm/1/* - cp -r ${trt_engine_path}/* ./tensorrt_llm/1 - cd /tensorrtllm_backend - python3 scripts/launch_triton_server.py \ - --world_size=${model_tp_size} \ - --model_repo=/tensorrtllm_backend/triton_model_repo & -} - run_serving_tests() { # run serving tests using `benchmark_serving.py` # $1: a json file specifying serving test cases @@ -189,7 +106,6 @@ run_serving_tests() { # run the server echo "Running test case $test_name" - # run_trt_server "$server_params" "$common_params" bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" # wait until the server is alive From 21c986d5eadff3b540fb8f2a44296e20ba6d799c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 01:24:21 -0700 Subject: [PATCH 082/150] run lmdeploy server in a separate process --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index e9c29bbe7de47..275725e5741c5 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -113,7 +113,7 @@ run_serving_tests() { # run the server echo "Running test case $test_name" echo "Server command: $server_command" - eval "$server_command" & + bash -c "$server_command" & # wait until the server is alive wait_for_server From 96bc2490c3b2d1461d7f5c6a1fd252e94c17fb24 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 01:25:19 -0700 Subject: [PATCH 083/150] bring back the full test suite --- .../tests/nightly-tests.json | 48 +++++++++++++++++-- 1 file changed, 43 insertions(+), 5 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 14e1730fd4a90..04f387a6eb4aa 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -3,7 +3,7 @@ "test_name": "llama8B_tp1", "qps_list": [4,8], "common_parameters": { - "model": "meta-llama/llama-2-7b-hf", + "model": "meta-llama/Meta-Llama-3-8B", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -38,11 +38,49 @@ } }, { - "test_name": "llama8B_tp2", - "qps_list": [4,8], + "test_name": "mixtral8x7B_tp2", + "qps_list": [2,4], "common_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", - "tp": 1, + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tp": 2, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "mixtral", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 10000, + "max_output_len": 10000, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4", + "qps_list": [2,4], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, From 6c566cbe7a17cb39ad2a9e230f5d6570a6714c5c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 11:15:26 -0700 Subject: [PATCH 084/150] bug fix: need to use llama checkpoint converter for mixtral model --- .../tests/nightly-tests.json | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 04f387a6eb4aa..9627eae547a7d 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -57,44 +57,6 @@ "tgi_client_parameters": { "endpoint": "/generate_stream" }, - "trt_server_parameters": { - "model_type": "mixtral", - "model_dtype": "float16", - "max_batch_size": 256, - "max_input_len": 10000, - "max_output_len": 10000, - "trt_llm_version": "r24.04" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "" - }, - "vllm_client_parameters": { - } - }, - { - "test_name": "llama70B_tp4", - "qps_list": [2,4], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "tp": 4, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000 - }, - "lmdeploy_server_parameters": { - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, "trt_server_parameters": { "model_type": "llama", "model_dtype": "float16", From 162700f100b7340cbff6e6f471cccb2763be0acc Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 22:05:41 -0700 Subject: [PATCH 085/150] reduce test case to only mixtral, debug lmdeploy + mixtral --- .../tests/nightly-tests.json | 38 ------------------- 1 file changed, 38 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 9627eae547a7d..bd881fdb831f9 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,42 +1,4 @@ [ - { - "test_name": "llama8B_tp1", - "qps_list": [4,8], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", - "tp": 1, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000 - }, - "lmdeploy_server_parameters": { - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, - "max_input_len": 10000, - "max_output_len": 10000, - "trt_llm_version": "r24.04" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "" - }, - "vllm_client_parameters": { - } - }, { "test_name": "mixtral8x7B_tp2", "qps_list": [2,4], From b0d74cdfef1293612cd34a247ddc9c29b37d227e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 23:13:50 -0700 Subject: [PATCH 086/150] developing fp8 + tensorrt-llm --- .../scripts/launch-trt-server.sh | 34 ++++++++++++++++--- .../tests/nightly-tests.json | 13 ++++--- 2 files changed, 37 insertions(+), 10 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh index d9108ef3a3168..2b877601da123 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -53,11 +53,35 @@ cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/ cd /tensorrtllm_backend cd ./tensorrt_llm/examples/${model_type} -python3 convert_checkpoint.py \ ---model_dir ${model_path} \ ---dtype ${model_dtype} \ ---tp_size ${model_tp_size} \ ---output_dir ${trt_model_path} + +if echo "$server_params" | jq -e 'has("qformat")' > /dev/null; then + + echo "Key 'qformat' exists in tensorrt server params. Use quantize.py" + echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md" + qformat=$(echo "$server_params" | jq -r '.qformat') + kv_cache_dtype=$(echo "$server_params" | jq -r '.kv_cache_dtype') + calib_size=$(echo "$server_params" | jq -r '.calib_size') + python ../quantization/quantize.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + --tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} \ + --qformat ${qformat} \ + --kv_cache_dtype ${kv_cache_dtype} \ + --calib_size ${calib_size} \ + +else + + echo "Key 'qformat' does not exist in tensorrt server params. Use convert_checkpoint.py" + python3 convert_checkpoint.py \ + --model_dir ${model_path} \ + --dtype ${model_dtype} \ + --tp_size ${model_tp_size} \ + --output_dir ${trt_model_path} + +fi + + trtllm-build \ --checkpoint_dir=${trt_model_path} \ diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index bd881fdb831f9..85abacac29802 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,10 +1,10 @@ [ { - "test_name": "mixtral8x7B_tp2", - "qps_list": [2,4], + "test_name": "llama8B_fp8_tp1", + "qps_list": [4,8], "common_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tp": 2, + "model": "meta-llama/Meta-Llama-3-8B", + "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, @@ -25,7 +25,10 @@ "max_batch_size": 256, "max_input_len": 10000, "max_output_len": 10000, - "trt_llm_version": "r24.04" + "trt_llm_version": "r24.04", + "qformat": "fp8", + "kv_cache_dtype": "fp8", + "calib_size": 512 }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" From f1a795557383da7e76680e4268cbe5e47f542ff1 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 23:16:43 -0700 Subject: [PATCH 087/150] move fp8 quantization to common parameters --- .../scripts/launch-trt-server.sh | 17 +++++++---------- .../nightly-benchmarks/tests/nightly-tests.json | 3 ++- 2 files changed, 9 insertions(+), 11 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh index 2b877601da123..ed6ed1aff722b 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -54,31 +54,28 @@ cd /tensorrtllm_backend cd ./tensorrt_llm/examples/${model_type} -if echo "$server_params" | jq -e 'has("qformat")' > /dev/null; then +if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - echo "Key 'qformat' exists in tensorrt server params. Use quantize.py" + echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py" echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md" - qformat=$(echo "$server_params" | jq -r '.qformat') - kv_cache_dtype=$(echo "$server_params" | jq -r '.kv_cache_dtype') - calib_size=$(echo "$server_params" | jq -r '.calib_size') python ../quantization/quantize.py \ --model_dir ${model_path} \ --dtype ${model_dtype} \ --tp_size ${model_tp_size} \ --output_dir ${trt_model_path} \ - --qformat ${qformat} \ - --kv_cache_dtype ${kv_cache_dtype} \ - --calib_size ${calib_size} \ + --qformat fp8 \ + --kv_cache_dtype fp8 \ + --calib_size 512 else - echo "Key 'qformat' does not exist in tensorrt server params. Use convert_checkpoint.py" + echo "Key 'fp8' exists in common params. Use convert_checkpoint.py" python3 convert_checkpoint.py \ --model_dir ${model_path} \ --dtype ${model_dtype} \ --tp_size ${model_tp_size} \ --output_dir ${trt_model_path} - + fi diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 85abacac29802..433772f3b6d2b 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -8,7 +8,8 @@ "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, - "port": 8000 + "port": 8000, + "fp8": true }, "lmdeploy_server_parameters": { }, From 459fb2f01dd0f410deabe3e916d70b23c6242222 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 23:37:42 -0700 Subject: [PATCH 088/150] add fp8 for vllm --- .../scripts/run-vllm-nightly.sh | 25 +++++++++++++------ 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 774e6f3d5cb2c..3560f93005eeb 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -101,13 +101,24 @@ run_serving_tests() { continue fi - - server_command="python3 \ - -m vllm.entrypoints.openai.api_server \ - -tp $tp \ - --model $model \ - --port $port \ - $server_args" + if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then + echo "Key 'fp8' exists in common params." + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + --quantization fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="python3 \ + -m vllm.entrypoints.openai.api_server \ + -tp $tp \ + --model $model \ + --port $port \ + $server_args" + fi # run the server echo "Running test case $test_name" From 79b295cd197086efbd955e13c56eaa523552faf8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 23:46:14 -0700 Subject: [PATCH 089/150] remove unused parameters --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 433772f3b6d2b..cf33c4fe898ec 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -26,10 +26,7 @@ "max_batch_size": 256, "max_input_len": 10000, "max_output_len": 10000, - "trt_llm_version": "r24.04", - "qformat": "fp8", - "kv_cache_dtype": "fp8", - "calib_size": 512 + "trt_llm_version": "r24.04" }, "trt_client_parameters": { "endpoint": "/v2/models/ensemble/generate_stream" From 019802a93e54fc633ab99e75f009186c062dc944 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Mon, 1 Jul 2024 23:56:22 -0700 Subject: [PATCH 090/150] use llama2 for local debugging --- .../scripts/run-tgi-nightly.sh | 23 +++++++++++++++---- .../tests/nightly-tests.json | 2 +- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 67f88eee653d9..36a9f434d4740 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -95,12 +95,25 @@ run_serving_tests() { continue fi + if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then + echo "Key 'fp8' exists in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + --quantize fp8 \ + $server_args" + else + echo "Key 'fp8' does not exist in common params." + server_command="/tgi-entrypoint.sh \ + --model-id $model \ + --num-shard $tp \ + --port $port \ + $server_args" + fi + - server_command="/tgi-entrypoint.sh \ - --model-id $model \ - --num-shard $tp \ - --port $port \ - $server_args" + # run the server echo "Running test case $test_name" diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index cf33c4fe898ec..33b88b811ec0b 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -3,7 +3,7 @@ "test_name": "llama8B_fp8_tp1", "qps_list": [4,8], "common_parameters": { - "model": "meta-llama/Meta-Llama-3-8B", + "model": "meta-llama/llama-2-7b-chat-hf", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", From 3d20f9235e4d90a5b9f3794cfc0a8da2afafb641 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 2 Jul 2024 14:02:28 -0700 Subject: [PATCH 091/150] move kv cache dtype inside vllm --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 33b88b811ec0b..d08dccf5455f2 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -3,7 +3,8 @@ "test_name": "llama8B_fp8_tp1", "qps_list": [4,8], "common_parameters": { - "model": "meta-llama/llama-2-7b-chat-hf", + "model": "meta-llama/Meta-Llama-3-8B-Instruct", + "neuralmagic_quantized_model": "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", @@ -33,7 +34,8 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "" + "disable_log_requests": "", + "kv_cache_dtype": "fp8" }, "vllm_client_parameters": { } From 44e2d971587bf455c30e1520575efd36366945cd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 2 Jul 2024 14:11:02 -0700 Subject: [PATCH 092/150] change model --- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 3560f93005eeb..f309e391d2cbe 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -102,13 +102,13 @@ run_serving_tests() { fi if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then - echo "Key 'fp8' exists in common params." + echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience." + model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model') server_command="python3 \ -m vllm.entrypoints.openai.api_server \ -tp $tp \ --model $model \ --port $port \ - --quantization fp8 \ $server_args" else echo "Key 'fp8' does not exist in common params." From b8dbd8ac9e268a01022e38d291ada164fa9740f2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 2 Jul 2024 23:35:36 -0700 Subject: [PATCH 093/150] test fp8 performance --- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 1 + 1 file changed, 1 insertion(+) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index d08dccf5455f2..ce98a7604fae8 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -13,6 +13,7 @@ "fp8": true }, "lmdeploy_server_parameters": { + "quant_policy": 8 }, "lmdeploy_client_parameters": { }, From 0313c19e8fc6a1ce09ab0f3f45a416716653dbd0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 00:01:03 -0700 Subject: [PATCH 094/150] reduce calib size --- .../nightly-benchmarks/nightly-pipeline.yaml | 140 +++++++++--------- .../scripts/launch-trt-server.sh | 2 +- 2 files changed, 71 insertions(+), 71 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 7dae9c9c59a95..7a20d526e06c2 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -34,6 +34,40 @@ steps: emptyDir: medium: Memory - wait + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: openmmlab/lmdeploy:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory - label: "A100 trt benchmark" priority: 100 agents: @@ -68,75 +102,7 @@ steps: - name: devshm emptyDir: medium: Memory - # - label: "A100 vllm benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: vllm/vllm-openai:latest - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 tgi benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.0 - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - - label: "A100 lmdeploy benchmark" + - label: "A100 vllm benchmark" priority: 100 agents: queue: A100 @@ -145,7 +111,41 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: openmmlab/lmdeploy:latest + - image: vllm/vllm-openai:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - label: "A100 tgi benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: ghcr.io/huggingface/text-generation-inference:2.1 command: - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh resources: diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh index ed6ed1aff722b..3a4c8e704abfb 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -65,7 +65,7 @@ if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then --output_dir ${trt_model_path} \ --qformat fp8 \ --kv_cache_dtype fp8 \ - --calib_size 512 + --calib_size 2 else From 7b483a128ceef770f472f3ff136576b8c7fe895a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 11:45:03 -0700 Subject: [PATCH 095/150] freeze fp16 benchmark --- .../tests/nightly-tests.json | 92 +++++++++++++++++-- 1 file changed, 82 insertions(+), 10 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index ce98a7604fae8..47d94a39f503a 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,19 +1,16 @@ [ { - "test_name": "llama8B_fp8_tp1", + "test_name": "llama8B_tp1", "qps_list": [4,8], "common_parameters": { - "model": "meta-llama/Meta-Llama-3-8B-Instruct", - "neuralmagic_quantized_model": "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV", + "model": "meta-llama/Meta-Llama-3-8B", "tp": 1, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, - "port": 8000, - "fp8": true + "port": 8000 }, "lmdeploy_server_parameters": { - "quant_policy": 8 }, "lmdeploy_client_parameters": { }, @@ -26,8 +23,8 @@ "model_type": "llama", "model_dtype": "float16", "max_batch_size": 256, - "max_input_len": 10000, - "max_output_len": 10000, + "max_input_len": 4096, + "max_output_len": 4096, "trt_llm_version": "r24.04" }, "trt_client_parameters": { @@ -35,8 +32,83 @@ }, "vllm_server_parameters": { "disable_log_stats": "", - "disable_log_requests": "", - "kv_cache_dtype": "fp8" + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "mixtral8x7B_tp2", + "qps_list": [2,4], + "common_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tp": 2, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4", + "qps_list": [2,4], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" }, "vllm_client_parameters": { } From c5e6662094ece8adcef11c2a74763551bf9654a0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 15:30:35 -0700 Subject: [PATCH 096/150] add standard deviation for each metric -- to plot confidence interval --- .../nightly-benchmarks/nightly-descriptions.md | 17 +++++++++++++++++ .../nightly-benchmarks/nightly-pipeline.yaml | 4 ++-- .../nightly-benchmarks/run-nightly-suite.sh | 1 - .../scripts/launch-trt-server.sh | 1 - benchmarks/benchmark_serving.py | 9 +++++++++ 5 files changed, 28 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index edcbeb8db10c4..7d42d11fea1f3 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -6,6 +6,17 @@ The main goal of this benchmarking is two-fold: - Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). +## Versions + +We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images: +- vllm/vllm-openai:v0.5.0.post1 +- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 +- openmmlab/lmdeploy:v0.5.0 +- ghcr.io/huggingface/text-generation-inference:2.1 + +Check `nightly-pipeline.yaml` artifact for more details. + + ## Workload description We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: @@ -18,6 +29,12 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). +## Known crashes + +- TGI v2.1 crashes when running mixtral model [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122) +- + + ## Results | Test name | GPU | Successful req. | Tput (req/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) | Engine | diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 7a20d526e06c2..c3dfc800f1fe7 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -8,7 +8,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: vllm/vllm-openai:latest + - image: vllm/vllm-openai:v0.5.0.post1 command: - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh resources: @@ -43,7 +43,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: openmmlab/lmdeploy:latest + - image: openmmlab/lmdeploy:v0.5.0 command: - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh resources: diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh index a157074287083..e50d2ba4b2e7a 100644 --- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh +++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -ex set -o pipefail check_gpus() { diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh index 3a4c8e704abfb..26d3ca610af81 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -1,6 +1,5 @@ #!/bin/bash -set -x server_params=$1 common_params=$2 diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py index 42867fc40edd2..99b2ac30c3516 100644 --- a/benchmarks/benchmark_serving.py +++ b/benchmarks/benchmark_serving.py @@ -60,12 +60,15 @@ class BenchmarkMetrics: output_throughput: float mean_ttft_ms: float median_ttft_ms: float + std_ttft_ms: float p99_ttft_ms: float mean_tpot_ms: float median_tpot_ms: float + std_tpot_ms: float p99_tpot_ms: float mean_itl_ms: float median_itl_ms: float + std_itl_ms: float p99_itl_ms: float @@ -249,12 +252,15 @@ def calculate_metrics( mean_ttft_ms=np.mean(ttfts or 0) * 1000, # ttfts is empty if streaming is not supported by backend median_ttft_ms=np.median(ttfts or 0) * 1000, + std_ttft_ms=np.std(ttfts or 0) * 1000, p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000, mean_tpot_ms=np.mean(tpots or 0) * 1000, median_tpot_ms=np.median(tpots or 0) * 1000, + std_tpot_ms=np.std(tpots or 0) * 1000, p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000, mean_itl_ms=np.mean(itls or 0) * 1000, median_itl_ms=np.median(itls or 0) * 1000, + std_itl_ms=np.std(itls or 0) * 1000, p99_itl_ms=np.percentile(itls or 0, 99) * 1000, ) @@ -371,12 +377,15 @@ async def benchmark( "output_throughput": metrics.output_throughput, "mean_ttft_ms": metrics.mean_ttft_ms, "median_ttft_ms": metrics.median_ttft_ms, + "std_ttft_ms": metrics.std_ttft_ms, "p99_ttft_ms": metrics.p99_ttft_ms, "mean_tpot_ms": metrics.mean_tpot_ms, "median_tpot_ms": metrics.median_tpot_ms, + "std_tpot_ms": metrics.std_tpot_ms, "p99_tpot_ms": metrics.p99_tpot_ms, "mean_itl_ms": metrics.mean_itl_ms, "median_itl_ms": metrics.median_itl_ms, + "std_itl_ms": metrics.std_itl_ms, "p99_itl_ms": metrics.p99_itl_ms, "input_lens": [output.prompt_len for output in outputs], "output_lens": actual_output_lens, From 22e78b5a7c8d21171b9ea1ea6e574ce0bb43afce Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 16:06:18 -0700 Subject: [PATCH 097/150] remove annotation inside the job --- run the annotation at the last. --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 275725e5741c5..c23438679578b 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -190,7 +190,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 36a9f434d4740..b805c52d3fa8c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -192,7 +192,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 22363dbc25dcb..ae6f4316eb4c2 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -183,7 +183,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index f309e391d2cbe..1e6d2893983bf 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -196,7 +196,7 @@ upload_to_buildkite() { echo "buildkite-agent binary not found. Skip uploading the results." return 0 fi - /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md + # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*" } From 59072ed19d3a0b7456b54f9f6f4c5a335c0d98fa Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 16:07:14 -0700 Subject: [PATCH 098/150] reduce nightly pipeline length --- .../nightly-benchmarks/nightly-pipeline.yaml | 254 ++++++++++-------- 1 file changed, 144 insertions(+), 110 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index c3dfc800f1fe7..ad1bd25f3b230 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,108 +1,142 @@ steps: - - label: "Annotate" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: vllm/vllm-openai:v0.5.0.post1 - command: - - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - wait - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: openmmlab/lmdeploy:v0.5.0 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 trt benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - label: "A100 vllm benchmark" + # - label: "Annotate" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: vllm/vllm-openai:v0.5.0.post1 + # command: + # - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - wait + # - label: "A100 lmdeploy benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: openmmlab/lmdeploy:v0.5.0 + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 trt benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - label: "A100 vllm benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: vllm/vllm-openai:latest + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + - label: "A100 tgi benchmark" priority: 100 agents: queue: A100 @@ -111,7 +145,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: vllm/vllm-openai:latest + - image: ghcr.io/huggingface/text-generation-inference:2.1 command: - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh resources: @@ -136,7 +170,8 @@ steps: - name: devshm emptyDir: medium: Memory - - label: "A100 tgi benchmark" + - wait + - label: "Plot" priority: 100 agents: queue: A100 @@ -145,9 +180,9 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: ghcr.io/huggingface/text-generation-inference:2.1 + - image: vllm/vllm-openai:v0.5.0.post1 command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh resources: limits: nvidia.com/gpu: 8 @@ -169,5 +204,4 @@ steps: volumes: - name: devshm emptyDir: - medium: Memory - \ No newline at end of file + medium: Memory \ No newline at end of file From a3e4355c24d5c75ff4a18056d0dc91451e8efcf9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 16:15:46 -0700 Subject: [PATCH 099/150] remove headers in result --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 7d42d11fea1f3..b0ae36953d7d6 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -37,5 +37,3 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: ## Results -| Test name | GPU | Successful req. | Tput (req/s) | Mean TTFT (ms) | Median TTFT (ms) | P99 TTFT (ms) | Mean ITL (ms) | Median ITL (ms) | P99 ITL (ms) | Engine | -|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------| From e27677ae4d88d4ebd421695820ebb58ffadf7eb9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 22:50:07 -0700 Subject: [PATCH 100/150] add visualization step --- .../nightly-descriptions.md | 9 +- .../scripts/nightly-annotate.sh | 20 ++++- .../scripts/plot-nightly-results.py | 83 +++++++++++++++++++ .../scripts/summary-nightly-results.py | 15 ++-- .../tests/nightly-tests.json | 76 ----------------- 5 files changed, 113 insertions(+), 90 deletions(-) create mode 100644 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index b0ae36953d7d6..3f792d788c273 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -32,8 +32,15 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: ## Known crashes - TGI v2.1 crashes when running mixtral model [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122) -- ## Results + + + +ITL: + + +Comparison table: +{nightly_results_benchmarking_table} \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index cee44c3d6eb92..83ad79674e7df 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -13,10 +13,24 @@ main() { if [ ! -f /workspace/buildkite-agent ]; then echo "buildkite-agent binary not found. Skip uploading the results." - return 0 - else - /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md + exit 0 fi + + # initial annotation + description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md" + + # download results + cd $VLLM_SOURCE_CODE_LOC/benchmarks + mkdir -p results/ + /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results + + # generate figures + python3 -m pip install tabulate pandas + python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py \ + --results-folder results \ + --description $description + + } diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py new file mode 100644 index 0000000000000..6fd2bcf631a7a --- /dev/null +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -0,0 +1,83 @@ + +import json +import os +from pathlib import Path +import argparse +import matplotlib.pyplot as plt + +import pandas as pd +from tabulate import tabulate + +def parse_arguments(): + parser = argparse.ArgumentParser(description='Parse command line arguments for summary-nightly-results script.') + parser.add_argument('--results-folder', type=str, required=True, help='The folder where the results are stored.') + parser.add_argument('--description', type=str, required=True, help='Description of the results.') + + args = parser.parse_args() + return args + + +def main(args): + results_folder = Path(args.results_folder) + + results = [] + + # collect results + for test_file in results_folder.glob("*.json"): + with open(test_file, "r") as f: + results = results + json.loads(f.read()) + + + # generate markdown table + df = pd.DataFrame.from_dict(results) + + md_table = tabulate(df, + headers='keys', + tablefmt='pipe', + showindex=False) + + with open(args.description, "r") as f: + description = f.read() + + description = description.format( + nightly_results_benchmarking_table=md_table + ) + + with open("nightly_results.md", "w") as f: + f.write(description) + + + # plot results + fig, axes = plt.subplots((3, 2), figsize=(16, 18)) + for i, model in enumerate(["llama8b", "llama70b", "mixtral8x7b"]): + for j, metric in enumerate(["TTFT", "ITL"]): + means, stds = [], [] + for method in ["vllm", "trt", "lmdeploy", "tgi"]: + target = df['Test name'].str.contains(model) + target = target & df['Test name'].str.contains(method) + filtered_df = df[target] + + if filtered_df.empty: + means.append(0.) + stds.append(0.) + else: + means.append(filtered_df[f"Mean {metric} (ms)"].values[0]) + stds.append(filtered_df[f"Std {metric} (ms)"].values[0]) + + ax = axes[i, j] + + ax.errorbar( + ["vllm", "trt", "lmdeploy", "tgi"], + means, + yerr=stds, + fmt='o', capsize=5) + + ax.set_xlabel("Method") + ax.set_ylabel(f"{metric} (ms)") + ax.set_title(f"{model} {metric} comparison") + + fig.savefig("nightly_results.jpg", bbox_inches='tight') + +if __name__ == '__main__': + args = parse_arguments() + main(args) \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index 640e0bfdaa1f7..d9fc46cb45c92 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -14,17 +14,10 @@ "gpu_type": "GPU", "completed": "Successful req.", "request_throughput": "Tput (req/s)", - # "input_throughput": "Input Tput (tok/s)", - # "output_throughput": "Output Tput (tok/s)", "mean_ttft_ms": "Mean TTFT (ms)", - "median_ttft_ms": "Median TTFT (ms)", - "p99_ttft_ms": "P99 TTFT (ms)", - # "mean_tpot_ms": "Mean TPOT (ms)", - # "median_tpot_ms": "Median", - # "p99_tpot_ms": "P99", + "std_ttft_ms": "Std TTFT (ms)", "mean_itl_ms": "Mean ITL (ms)", - "median_itl_ms": "Median ITL (ms)", - "p99_itl_ms": "P99 ITL (ms)", + "std_itl_ms": "Std ITL (ms)", "engine": "Engine", } @@ -67,7 +60,9 @@ # document benchmarking results in markdown with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: - f.write(serving_md_table_without_header) + # document results with header. + # for those who wants to reproduce our benchmark. + f.write(serving_md_table_with_headers) f.write('\n') # document benchmarking results in json diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 47d94a39f503a..57d462393eeaf 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -36,81 +36,5 @@ }, "vllm_client_parameters": { } - }, - { - "test_name": "mixtral8x7B_tp2", - "qps_list": [2,4], - "common_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tp": 2, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000 - }, - "lmdeploy_server_parameters": { - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, - "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "" - }, - "vllm_client_parameters": { - } - }, - { - "test_name": "llama70B_tp4", - "qps_list": [2,4], - "common_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "tp": 4, - "dataset_name": "sharegpt", - "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", - "num_prompts": 500, - "port": 8000 - }, - "lmdeploy_server_parameters": { - }, - "lmdeploy_client_parameters": { - }, - "tgi_server_parameters": { - }, - "tgi_client_parameters": { - "endpoint": "/generate_stream" - }, - "trt_server_parameters": { - "model_type": "llama", - "model_dtype": "float16", - "max_batch_size": 256, - "max_input_len": 4096, - "max_output_len": 4096, - "trt_llm_version": "r24.04" - }, - "trt_client_parameters": { - "endpoint": "/v2/models/ensemble/generate_stream" - }, - "vllm_server_parameters": { - "disable_log_stats": "", - "disable_log_requests": "" - }, - "vllm_client_parameters": { - } } ] \ No newline at end of file From 7c845ae8a5b07c08e0bbdbe14aac58561b6cf1a8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 23:31:45 -0700 Subject: [PATCH 101/150] support figure visualization --- .../nightly-descriptions.md | 7 ++---- .../scripts/nightly-annotate.sh | 21 +++++++++--------- .../scripts/plot-nightly-results.py | 22 ++++++++++++++----- .../results/trt_llama8B_tp1_qps_8.commands | 6 +++++ benchmarks/results/trt_nightly_results.md | 1 + 5 files changed, 37 insertions(+), 20 deletions(-) create mode 100644 benchmarks/results/trt_llama8B_tp1_qps_8.commands create mode 100644 benchmarks/results/trt_nightly_results.md diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 3f792d788c273..4445ecee72697 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -24,7 +24,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Input length: randomly sample 1000 prompts from ShareGPT dataset (with fixed random seed). - Output length: the corresponding output length of these 1000 prompts. - Batch size: dynamically determined by vllm and the arrival pattern of the requests. -- Average QPS (query per second): 4, 8 for 8B model and 1, 4 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Average QPS (query per second): 4 for 8B model and 2 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). @@ -36,11 +36,8 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: ## Results +![[Overall benchmarking results]([artifacts](artifact://nightly_results.png))](artifact://indy.png) -ITL: - - -Comparison table: {nightly_results_benchmarking_table} \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 83ad79674e7df..19f789702703f 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -8,11 +8,9 @@ main() { (which wget && which curl) || (apt-get update && apt-get install -y wget curl) (which jq) || (apt-get update && apt-get -y install jq) - - df -h if [ ! -f /workspace/buildkite-agent ]; then - echo "buildkite-agent binary not found. Skip uploading the results." + echo "buildkite-agent binary not found. Skip plotting the results." exit 0 fi @@ -22,16 +20,19 @@ main() { # download results cd $VLLM_SOURCE_CODE_LOC/benchmarks mkdir -p results/ - /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results + /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/ + ls + ls results/ # generate figures - python3 -m pip install tabulate pandas - python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py \ - --results-folder results \ - --description $description - - + python3 -m pip install tabulate pandas matplotlib + python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \ + --description $description \ + --results-folder results/ + # upload results and figures + /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md + /workspace/buildkite-agent artifact upload "nightly_results.png" } main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 6fd2bcf631a7a..f6ad25d8ca981 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -23,7 +23,7 @@ def main(args): results = [] # collect results - for test_file in results_folder.glob("*.json"): + for test_file in results_folder.glob("*_nightly_results.json"): with open(test_file, "r") as f: results = results + json.loads(f.read()) @@ -46,13 +46,16 @@ def main(args): with open("nightly_results.md", "w") as f: f.write(description) + + plt.rcParams.update({'font.size': 20}) # plot results - fig, axes = plt.subplots((3, 2), figsize=(16, 18)) - for i, model in enumerate(["llama8b", "llama70b", "mixtral8x7b"]): + fig, axes = plt.subplots(3, 2, figsize=(16, 18)) + methods = ["vllm", "trt", "lmdeploy", "tgi"] + for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): means, stds = [], [] - for method in ["vllm", "trt", "lmdeploy", "tgi"]: + for method in methods: target = df['Test name'].str.contains(model) target = target & df['Test name'].str.contains(method) filtered_df = df[target] @@ -71,12 +74,21 @@ def main(args): means, yerr=stds, fmt='o', capsize=5) + ax.set_ylim(bottom=0) + + for i, (method, mean, std) in enumerate(zip(method, means, stds)): + ax.text( + i - 0.2, mean, # Adjust position above the error bar + f'{mean:.0f}', + ha='center', + va='bottom' + ) ax.set_xlabel("Method") ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - fig.savefig("nightly_results.jpg", bbox_inches='tight') + fig.savefig("nightly_results.png", bbox_inches='tight') if __name__ == '__main__': args = parse_arguments() diff --git a/benchmarks/results/trt_llama8B_tp1_qps_8.commands b/benchmarks/results/trt_llama8B_tp1_qps_8.commands new file mode 100644 index 0000000000000..e0312b4e22dc2 --- /dev/null +++ b/benchmarks/results/trt_llama8B_tp1_qps_8.commands @@ -0,0 +1,6 @@ +{ + "server_command": "", + "client_command": "python3 benchmark_serving.py --backend tensorrt-llm --tokenizer /tokenizer_cache --model meta-llama/Meta-Llama-3-8B --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200 --port 8000 --save-result --result-dir results/ --result-filename trt_llama8B_tp1_qps_8.json --request-rate 8 --endpoint /v2/models/ensemble/generate_stream", + "gpu_type": "A100-SXM4-80GB", + "engine": "trt" +} diff --git a/benchmarks/results/trt_nightly_results.md b/benchmarks/results/trt_nightly_results.md new file mode 100644 index 0000000000000..3befc74903d68 --- /dev/null +++ b/benchmarks/results/trt_nightly_results.md @@ -0,0 +1 @@ +| trt_llama8B_tp1_qps_8 | A100-SXM4-80GB | 200 | 6.49609 | 60.3214 | 55.7321 | 119.186 | 15.6021 | 14.187 | 56.2925 | trt | From 3a70a60aa9245652d0086d99d70118689260f8fb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 3 Jul 2024 23:52:56 -0700 Subject: [PATCH 102/150] adjust visualization --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 6 ++++-- .../nightly-benchmarks/scripts/plot-nightly-results.py | 7 +++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 4445ecee72697..9bb965b2450a8 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -36,8 +36,10 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: ## Results -![[Overall benchmarking results]([artifacts](artifact://nightly_results.png))](artifact://indy.png) +{nightly_results_benchmarking_table} +## Plots +In the following plots, the error bar shows the standard error of the mean. -{nightly_results_benchmarking_table} \ No newline at end of file +Benchmarking results \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index f6ad25d8ca981..41089aae90405 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -4,6 +4,7 @@ from pathlib import Path import argparse import matplotlib.pyplot as plt +import math import pandas as pd from tabulate import tabulate @@ -65,7 +66,9 @@ def main(args): stds.append(0.) else: means.append(filtered_df[f"Mean {metric} (ms)"].values[0]) - stds.append(filtered_df[f"Std {metric} (ms)"].values[0]) + std = filtered_df[f"Std {metric} (ms)"].values[0] + success = filtered_df["Successful req."].values[0] + stds.append(std / math.sqrt(success)) ax = axes[i, j] @@ -88,7 +91,7 @@ def main(args): ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - fig.savefig("nightly_results.png", bbox_inches='tight') + fig.savefig("nightly_results.png") if __name__ == '__main__': args = parse_arguments() From 8260d3889d0b58cf70f5e22444794df9ce325dc9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:04:59 -0700 Subject: [PATCH 103/150] visual adjustment --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 8 +++++--- .../nightly-benchmarks/scripts/nightly-annotate.sh | 4 +++- .../nightly-benchmarks/scripts/plot-nightly-results.py | 9 +++++---- .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +- 4 files changed, 14 insertions(+), 9 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 9bb965b2450a8..98a593d455639 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -14,7 +14,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima - openmmlab/lmdeploy:v0.5.0 - ghcr.io/huggingface/text-generation-inference:2.1 -Check `nightly-pipeline.yaml` artifact for more details. +Check [nightly-pipeline.yaml](artifact://nightly-pipeline.yaml) artifact for more details. ## Workload description @@ -26,12 +26,14 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Batch size: dynamically determined by vllm and the arrival pattern of the requests. - Average QPS (query per second): 4 for 8B model and 2 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). - Models: llama-3 8B, llama-3 70B, mixtral 8x7B. -- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99). +- Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std). + +Check [nightly-tests.json](artifact://nightly-tests.json) artifact for more details. ## Known crashes -- TGI v2.1 crashes when running mixtral model [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122) +- TGI v2.1 crashes when running mixtral model, see [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122) ## Results diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 19f789702703f..99f1548039ab6 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -31,8 +31,10 @@ main() { --results-folder results/ # upload results and figures - /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md /workspace/buildkite-agent artifact upload "nightly_results.png" + /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml + /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json + /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md } main "$@" \ No newline at end of file diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 41089aae90405..962099a6b4c5f 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -51,14 +51,14 @@ def main(args): plt.rcParams.update({'font.size': 20}) # plot results - fig, axes = plt.subplots(3, 2, figsize=(16, 18)) + fig, axes = plt.subplots(3, 2, figsize=(14, 16)) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): means, stds = [], [] for method in methods: target = df['Test name'].str.contains(model) - target = target & df['Test name'].str.contains(method) + target = target & df['Engine'].str.contains(method) filtered_df = df[target] if filtered_df.empty: @@ -70,6 +70,8 @@ def main(args): success = filtered_df["Successful req."].values[0] stds.append(std / math.sqrt(success)) + print(means, stds) + ax = axes[i, j] ax.errorbar( @@ -87,11 +89,10 @@ def main(args): va='bottom' ) - ax.set_xlabel("Method") ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - fig.savefig("nightly_results.png") + fig.savefig("nightly_results.png", bbox_inches='tight') if __name__ == '__main__': args = parse_arguments() diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 57d462393eeaf..0f2ac3be3df81 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -1,7 +1,7 @@ [ { "test_name": "llama8B_tp1", - "qps_list": [4,8], + "qps_list": [4], "common_parameters": { "model": "meta-llama/Meta-Llama-3-8B", "tp": 1, From 464374916df8a477189fb936ba04e5510f0ce50e Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:06:27 -0700 Subject: [PATCH 104/150] remove text annotation --- .../scripts/plot-nightly-results.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 962099a6b4c5f..6ef5c3a2c0ae4 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -48,10 +48,10 @@ def main(args): f.write(description) - plt.rcParams.update({'font.size': 20}) + plt.rcParams.update({'font.size': 15}) # plot results - fig, axes = plt.subplots(3, 2, figsize=(14, 16)) + fig, axes = plt.subplots(3, 2, figsize=(10, 12)) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): @@ -81,13 +81,13 @@ def main(args): fmt='o', capsize=5) ax.set_ylim(bottom=0) - for i, (method, mean, std) in enumerate(zip(method, means, stds)): - ax.text( - i - 0.2, mean, # Adjust position above the error bar - f'{mean:.0f}', - ha='center', - va='bottom' - ) + # for i, (method, mean, std) in enumerate(zip(method, means, stds)): + # ax.text( + # i - 0.2, mean, # Adjust position above the error bar + # f'{mean:.0f}', + # ha='center', + # va='bottom' + # ) ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") From 3146a96df465e878af724fb92d17d461b9f2953f Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:16:21 -0700 Subject: [PATCH 105/150] add padding --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 6ef5c3a2c0ae4..16f37be40f3c6 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -91,7 +91,8 @@ def main(args): ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - + + fig.tight_layout(pad=0.1) fig.savefig("nightly_results.png", bbox_inches='tight') if __name__ == '__main__': From 6da59d16294dc9a232bab5b03178c66d37e21eee Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:18:54 -0700 Subject: [PATCH 106/150] add hyperlink --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 98a593d455639..8ad73491202a0 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -14,7 +14,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima - openmmlab/lmdeploy:v0.5.0 - ghcr.io/huggingface/text-generation-inference:2.1 -Check [nightly-pipeline.yaml](artifact://nightly-pipeline.yaml) artifact for more details. +Check nightly-pipeline.yaml artifact for more details. ## Workload description @@ -28,7 +28,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std). -Check [nightly-tests.json](artifact://nightly-tests.json) artifact for more details. +Check nightly-tests.json artifact for more details. ## Known crashes From 0802f9fdb6fc375e0305895d47c55cf3759fdce2 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:37:15 -0700 Subject: [PATCH 107/150] bring back the full suite of test --- .../nightly-descriptions.md | 4 +- .../tests/nightly-tests.json | 76 +++++++++++++++++++ 2 files changed, 78 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 8ad73491202a0..fc9431d33a5a5 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -14,7 +14,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima - openmmlab/lmdeploy:v0.5.0 - ghcr.io/huggingface/text-generation-inference:2.1 -Check nightly-pipeline.yaml artifact for more details. +Check nightly-pipeline.yaml artifact for more details. ## Workload description @@ -28,7 +28,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std). -Check nightly-tests.json artifact for more details. +Check nightly-tests.json artifact for more details. ## Known crashes diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 0f2ac3be3df81..89ef0b14e11b2 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -36,5 +36,81 @@ }, "vllm_client_parameters": { } + }, + { + "test_name": "mixtral8x7B_tp2", + "qps_list": [2], + "common_parameters": { + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tp": 2, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } + }, + { + "test_name": "llama70B_tp4", + "qps_list": [2], + "common_parameters": { + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, + "dataset_name": "sharegpt", + "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", + "num_prompts": 500, + "port": 8000 + }, + "lmdeploy_server_parameters": { + }, + "lmdeploy_client_parameters": { + }, + "tgi_server_parameters": { + }, + "tgi_client_parameters": { + "endpoint": "/generate_stream" + }, + "trt_server_parameters": { + "model_type": "llama", + "model_dtype": "float16", + "max_batch_size": 256, + "max_input_len": 4096, + "max_output_len": 4096, + "trt_llm_version": "r24.04" + }, + "trt_client_parameters": { + "endpoint": "/v2/models/ensemble/generate_stream" + }, + "vllm_server_parameters": { + "disable_log_stats": "", + "disable_log_requests": "" + }, + "vllm_client_parameters": { + } } ] \ No newline at end of file From 8e6fca22b2c2c40bbe9d06ce41b218b580777348 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:37:55 -0700 Subject: [PATCH 108/150] adjust test order --- .../nightly-benchmarks/tests/nightly-tests.json | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json index 89ef0b14e11b2..f250833c62710 100644 --- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json +++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json @@ -38,11 +38,11 @@ } }, { - "test_name": "mixtral8x7B_tp2", + "test_name": "llama70B_tp4", "qps_list": [2], "common_parameters": { - "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", - "tp": 2, + "model": "meta-llama/Meta-Llama-3-70B-Instruct", + "tp": 4, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, @@ -76,11 +76,11 @@ } }, { - "test_name": "llama70B_tp4", + "test_name": "mixtral8x7B_tp2", "qps_list": [2], "common_parameters": { - "model": "meta-llama/Meta-Llama-3-70B-Instruct", - "tp": 4, + "model": "mistralai/Mixtral-8x7B-Instruct-v0.1", + "tp": 2, "dataset_name": "sharegpt", "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json", "num_prompts": 500, From a77fcbdf87f7fb3bfdb4b6c636ecc75bf21ae65d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:38:56 -0700 Subject: [PATCH 109/150] bring back full benchmark suite --- .../nightly-benchmarks/nightly-pipeline.yaml | 239 ++++++++---------- 1 file changed, 102 insertions(+), 137 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index ad1bd25f3b230..c9f7740fef968 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,141 +1,106 @@ steps: - # - label: "Annotate" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: vllm/vllm-openai:v0.5.0.post1 - # command: - # - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - wait - # - label: "A100 lmdeploy benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: openmmlab/lmdeploy:v0.5.0 - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 trt benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 vllm benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: vllm/vllm-openai:latest - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: openmmlab/lmdeploy:v0.5.0 + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - label: "A100 trt benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - label: "A100 vllm benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: vllm/vllm-openai:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory - label: "A100 tgi benchmark" priority: 100 agents: From 8b51f458e2cf25f57cf8d415378b155a61e8a3bd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 00:39:38 -0700 Subject: [PATCH 110/150] add more pad --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 16f37be40f3c6..b40de8fd07481 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -92,7 +92,7 @@ def main(args): ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - fig.tight_layout(pad=0.1) + fig.tight_layout(pad=0.3) fig.savefig("nightly_results.png", bbox_inches='tight') if __name__ == '__main__': From 4427b06d1f6cb87010cbb2436834fee2b1ec6bb8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 10:58:31 -0700 Subject: [PATCH 111/150] mount huggingface cache --- .../nightly-descriptions.md | 2 +- .../nightly-benchmarks/nightly-pipeline.yaml | 212 +++++++++--------- .../nightly-benchmarks/run-nightly-suite.sh | 1 + 3 files changed, 112 insertions(+), 103 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index fc9431d33a5a5..5699c938eae42 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -33,7 +33,7 @@ Check Date: Thu, 4 Jul 2024 11:33:03 -0700 Subject: [PATCH 112/150] mount huggingface cache --- .../nightly-benchmarks/nightly-pipeline.yaml | 230 ++++++++++-------- 1 file changed, 127 insertions(+), 103 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 875b40ffb1aec..5616ce6af028d 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,38 +1,4 @@ -steps: - # - label: "A100 lmdeploy benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: openmmlab/lmdeploy:v0.5.0 - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory +steps: - label: "A100 trt benchmark" priority: 100 agents: @@ -75,74 +41,132 @@ steps: hostPath: path: /root/.cache/huggingface type: Directory - # - label: "A100 vllm benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: vllm/vllm-openai:latest - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory - # - label: "A100 tgi benchmark" - # priority: 100 - # agents: - # queue: A100 - # plugins: - # - kubernetes: - # podSpec: - # priorityClassName: perf-benchmark - # containers: - # - image: ghcr.io/huggingface/text-generation-inference:2.1 - # command: - # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - # resources: - # limits: - # nvidia.com/gpu: 8 - # volumeMounts: - # - name: devshm - # mountPath: /dev/shm - # env: - # - name: VLLM_USAGE_SOURCE - # value: ci-test - # - name: VLLM_SOURCE_CODE_LOC - # value: /workspace/build/buildkite/vllm/performance-benchmark - # - name: HF_TOKEN - # valueFrom: - # secretKeyRef: - # name: hf-token-secret - # key: token - # nodeSelector: - # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - # volumes: - # - name: devshm - # emptyDir: - # medium: Memory + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: openmmlab/lmdeploy:v0.5.0 + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + - label: "A100 vllm benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: vllm/vllm-openai:latest + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + - label: "A100 tgi benchmark" + priority: 100 + agents: + queue: A100 + plugins: + - kubernetes: + podSpec: + priorityClassName: perf-benchmark + containers: + - image: ghcr.io/huggingface/text-generation-inference:2.1 + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory - wait - label: "Plot" priority: 100 From a174d268ca405d15a090aa20a73662ebd518ca2c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 11:34:16 -0700 Subject: [PATCH 113/150] add even more padding --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index b40de8fd07481..71ad313372467 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -92,7 +92,7 @@ def main(args): ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - fig.tight_layout(pad=0.3) + fig.tight_layout(pad=0.6) fig.savefig("nightly_results.png", bbox_inches='tight') if __name__ == '__main__': From 3f49b0cf7fcf22dd7ce89ab1c88d22b007bdc7c8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 11:35:09 -0700 Subject: [PATCH 114/150] add illustration --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 5699c938eae42..84d304bebc0e1 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -36,12 +36,13 @@ Check -Benchmarking results \ No newline at end of file +## Results + +{nightly_results_benchmarking_table} From 5c3a7d083d8a229ebe34c8a63e5293766aecb476 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 11:36:00 -0700 Subject: [PATCH 115/150] make yapf and ruff happy --- .../scripts/plot-nightly-results.py | 78 ++++++++++--------- 1 file changed, 40 insertions(+), 38 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 71ad313372467..1641be259e06c 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -1,55 +1,56 @@ - -import json -import os -from pathlib import Path import argparse -import matplotlib.pyplot as plt +import json import math +from pathlib import Path +import matplotlib.pyplot as plt import pandas as pd from tabulate import tabulate + def parse_arguments(): - parser = argparse.ArgumentParser(description='Parse command line arguments for summary-nightly-results script.') - parser.add_argument('--results-folder', type=str, required=True, help='The folder where the results are stored.') - parser.add_argument('--description', type=str, required=True, help='Description of the results.') - + parser = argparse.ArgumentParser( + description= + 'Parse command line arguments for summary-nightly-results script.') + parser.add_argument('--results-folder', + type=str, + required=True, + help='The folder where the results are stored.') + parser.add_argument('--description', + type=str, + required=True, + help='Description of the results.') + args = parser.parse_args() return args - + def main(args): results_folder = Path(args.results_folder) - + results = [] # collect results for test_file in results_folder.glob("*_nightly_results.json"): with open(test_file, "r") as f: results = results + json.loads(f.read()) - - - # generate markdown table + + # generate markdown table df = pd.DataFrame.from_dict(results) - md_table = tabulate(df, - headers='keys', - tablefmt='pipe', - showindex=False) - + md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False) + with open(args.description, "r") as f: description = f.read() - + description = description.format( - nightly_results_benchmarking_table=md_table - ) - + nightly_results_benchmarking_table=md_table) + with open("nightly_results.md", "w") as f: f.write(description) - plt.rcParams.update({'font.size': 15}) - + # plot results fig, axes = plt.subplots(3, 2, figsize=(10, 12)) methods = ["vllm", "trt", "lmdeploy", "tgi"] @@ -60,7 +61,7 @@ def main(args): target = df['Test name'].str.contains(model) target = target & df['Engine'].str.contains(method) filtered_df = df[target] - + if filtered_df.empty: means.append(0.) stds.append(0.) @@ -69,32 +70,33 @@ def main(args): std = filtered_df[f"Std {metric} (ms)"].values[0] success = filtered_df["Successful req."].values[0] stds.append(std / math.sqrt(success)) - + print(means, stds) - + ax = axes[i, j] - - ax.errorbar( - ["vllm", "trt", "lmdeploy", "tgi"], - means, - yerr=stds, - fmt='o', capsize=5) + + ax.errorbar(["vllm", "trt", "lmdeploy", "tgi"], + means, + yerr=stds, + fmt='o', + capsize=5) ax.set_ylim(bottom=0) # for i, (method, mean, std) in enumerate(zip(method, means, stds)): # ax.text( # i - 0.2, mean, # Adjust position above the error bar - # f'{mean:.0f}', - # ha='center', + # f'{mean:.0f}', + # ha='center', # va='bottom' # ) - + ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") fig.tight_layout(pad=0.6) fig.savefig("nightly_results.png", bbox_inches='tight') + if __name__ == '__main__': args = parse_arguments() - main(args) \ No newline at end of file + main(args) From ec6f42de9ea32c1c3baa491e31cfc88a8d6bdab9 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 13:16:11 -0700 Subject: [PATCH 116/150] add datetime to filename and make yapf happy --- .../nightly-benchmarks/scripts/summary-nightly-results.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index d9fc46cb45c92..f7d765a9ac06e 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -1,3 +1,4 @@ +import datetime import json import os from pathlib import Path @@ -56,7 +57,8 @@ serving_md_table_lines = serving_md_table_with_headers.split('\n') serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:]) - prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE") + prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S") + prefix = prefix + os.environ.get("CURRENT_LLM_SERVING_ENGINE") # document benchmarking results in markdown with open(results_folder / f"{prefix}_nightly_results.md", "w") as f: From 00905776e217ba2b4e22723ff8ed95db177667d6 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 13:31:07 -0700 Subject: [PATCH 117/150] debug mixtral and llama70B --- .../nightly-benchmarks/nightly-pipeline.yaml | 265 ++++++++++++------ .../scripts/run-trt-nightly.sh | 18 +- 2 files changed, 189 insertions(+), 94 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 5616ce6af028d..9bea50673e81f 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,5 +1,5 @@ steps: - - label: "A100 trt benchmark" + - label: "A100 trt benchmark mixtral8x7B" priority: 100 agents: queue: A100 @@ -22,6 +22,8 @@ steps: env: - name: VLLM_USAGE_SOURCE value: ci-test + - name: TEST_SELECTOR + value: mixtral8x7B_tp2 - name: HF_HOME value: /root/.cache/huggingface - name: VLLM_SOURCE_CODE_LOC @@ -41,49 +43,8 @@ steps: hostPath: path: /root/.cache/huggingface type: Directory - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: openmmlab/lmdeploy:v0.5.0 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - - label: "A100 vllm benchmark" + - wait + - label: "A100 trt benchmark llama8B" priority: 100 agents: queue: A100 @@ -92,7 +53,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: vllm/vllm-openai:latest + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 command: - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh resources: @@ -106,6 +67,8 @@ steps: env: - name: VLLM_USAGE_SOURCE value: ci-test + - name: TEST_SELECTOR + value: llama8B_tp1 - name: HF_HOME value: /root/.cache/huggingface - name: VLLM_SOURCE_CODE_LOC @@ -125,48 +88,176 @@ steps: hostPath: path: /root/.cache/huggingface type: Directory - - label: "A100 tgi benchmark" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: ghcr.io/huggingface/text-generation-inference:2.1 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + # - label: "A100 trt benchmark llama70B" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # - name: hf-cache + # mountPath: /root/.cache/huggingface + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: TEST_SELECTOR + # value: llama70B_tp4 + # - name: HF_HOME + # value: /root/.cache/huggingface + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - name: hf-cache + # hostPath: + # path: /root/.cache/huggingface + # type: Directory + # - label: "A100 lmdeploy benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: openmmlab/lmdeploy:v0.5.0 + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # - name: hf-cache + # mountPath: /root/.cache/huggingface + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_HOME + # value: /root/.cache/huggingface + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - name: hf-cache + # hostPath: + # path: /root/.cache/huggingface + # type: Directory + # - label: "A100 vllm benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: vllm/vllm-openai:latest + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # - name: hf-cache + # mountPath: /root/.cache/huggingface + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_HOME + # value: /root/.cache/huggingface + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - name: hf-cache + # hostPath: + # path: /root/.cache/huggingface + # type: Directory + # - label: "A100 tgi benchmark" + # priority: 100 + # agents: + # queue: A100 + # plugins: + # - kubernetes: + # podSpec: + # priorityClassName: perf-benchmark + # containers: + # - image: ghcr.io/huggingface/text-generation-inference:2.1 + # command: + # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + # resources: + # limits: + # nvidia.com/gpu: 8 + # volumeMounts: + # - name: devshm + # mountPath: /dev/shm + # - name: hf-cache + # mountPath: /root/.cache/huggingface + # env: + # - name: VLLM_USAGE_SOURCE + # value: ci-test + # - name: HF_HOME + # value: /root/.cache/huggingface + # - name: VLLM_SOURCE_CODE_LOC + # value: /workspace/build/buildkite/vllm/performance-benchmark + # - name: HF_TOKEN + # valueFrom: + # secretKeyRef: + # name: hf-token-secret + # key: token + # nodeSelector: + # nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + # volumes: + # - name: devshm + # emptyDir: + # medium: Memory + # - name: hf-cache + # hostPath: + # path: /root/.cache/huggingface + # type: Directory - wait - label: "Plot" priority: 100 diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index ae6f4316eb4c2..a063a78e51d4c 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -95,13 +95,8 @@ run_serving_tests() { fi - # prepare tokenizer + cd $VLLM_SOURCE_CODE_LOC/benchmarks - rm -rf /tokenizer_cache - mkdir /tokenizer_cache - python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ - --model "$model" \ - --cachedir /tokenizer_cache # run the server @@ -119,8 +114,17 @@ run_serving_tests() { break fi - # go back to vllm benchmarking directory + # prepare tokenizer cd $VLLM_SOURCE_CODE_LOC/benchmarks + rm -rf /tokenizer_cache + mkdir /tokenizer_cache + # update transformers package, to make sure mixtral tokenizer is available + python -m pip install transformers -U + python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ + --model "$model" \ + --cachedir /tokenizer_cache + cd $VLLM_SOURCE_CODE_LOC/benchmarks + # iterate over different QPS for qps in $qps_list; do From f76a04a84bc6b6d4c72cbd23d5093549305bd2dd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 13:45:59 -0700 Subject: [PATCH 118/150] pin lmdeploy transformers to 4.41.2 --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 5 +++-- .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 ++ 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 84d304bebc0e1..eaa59309c0957 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -31,9 +31,10 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: Check nightly-tests.json artifact for more details. -## Known crashes +## Known issues -- TGI v2.1 crashes when running mixtral model, see [TGI Issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122) +- TGI v2.1 crashes when running mixtral model, see [tgi issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122) +- pin the transformers library to 4.41.2 to avoid lmdeploy missing cache_position error, see [lmdeploy issue 1885](https://github.com/InternLM/lmdeploy/issues/1885). diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index c23438679578b..0f6393b2f7784 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -205,6 +205,8 @@ main() { mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + python -m pip install transformers==4.41.2 + export CURRENT_LLM_SERVING_ENGINE=lmdeploy run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json python -m pip install tabulate pandas From 859d6f3e55071f3ae37b0fc5c32947fb6f296123 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 13:48:34 -0700 Subject: [PATCH 119/150] skip the test case instead of exit the whoel test suite --- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index b805c52d3fa8c..492fb5260ed70 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -65,7 +65,7 @@ run_serving_tests() { # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." - exit 0 + continue fi # append tgi to the test name diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index a063a78e51d4c..59e129f7b9f52 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -65,7 +65,7 @@ run_serving_tests() { # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." - exit 0 + continue fi # append trt to the test name diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index 1e6d2893983bf..abed7cbb67348 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -70,7 +70,7 @@ run_serving_tests() { # if TEST_SELECTOR is set, only run the test cases that match the selector if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then echo "Skip test case $test_name." - exit 0 + continue fi # append vllm to the test name From 3ce4f5fdb2f27d1859ecf4c93d3051227b44629c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 14:52:33 -0700 Subject: [PATCH 120/150] update transformers for mixtral model --- .../nightly-benchmarks/nightly-pipeline.yaml | 22 ++++++++++--------- .../scripts/run-trt-nightly.sh | 1 + 2 files changed, 13 insertions(+), 10 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 9bea50673e81f..bb6af945146c2 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,5 +1,5 @@ steps: - - label: "A100 trt benchmark mixtral8x7B" + - label: "A100 lmdeploy benchmark" priority: 100 agents: queue: A100 @@ -8,7 +8,7 @@ steps: podSpec: priorityClassName: perf-benchmark containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + - image: openmmlab/lmdeploy:v0.5.0 command: - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh resources: @@ -20,10 +20,10 @@ steps: - name: hf-cache mountPath: /root/.cache/huggingface env: - - name: VLLM_USAGE_SOURCE - value: ci-test - name: TEST_SELECTOR value: mixtral8x7B_tp2 + - name: VLLM_USAGE_SOURCE + value: ci-test - name: HF_HOME value: /root/.cache/huggingface - name: VLLM_SOURCE_CODE_LOC @@ -44,7 +44,7 @@ steps: path: /root/.cache/huggingface type: Directory - wait - - label: "A100 trt benchmark llama8B" + - label: "A100 trt benchmark mixtral8x7B" priority: 100 agents: queue: A100 @@ -68,7 +68,7 @@ steps: - name: VLLM_USAGE_SOURCE value: ci-test - name: TEST_SELECTOR - value: llama8B_tp1 + value: mixtral8x7B_tp2 - name: HF_HOME value: /root/.cache/huggingface - name: VLLM_SOURCE_CODE_LOC @@ -88,7 +88,7 @@ steps: hostPath: path: /root/.cache/huggingface type: Directory - # - label: "A100 trt benchmark llama70B" + # - label: "A100 trt benchmark llama8B" # priority: 100 # agents: # queue: A100 @@ -112,7 +112,7 @@ steps: # - name: VLLM_USAGE_SOURCE # value: ci-test # - name: TEST_SELECTOR - # value: llama70B_tp4 + # value: llama8B_tp1 # - name: HF_HOME # value: /root/.cache/huggingface # - name: VLLM_SOURCE_CODE_LOC @@ -132,7 +132,7 @@ steps: # hostPath: # path: /root/.cache/huggingface # type: Directory - # - label: "A100 lmdeploy benchmark" + # - label: "A100 trt benchmark llama70B" # priority: 100 # agents: # queue: A100 @@ -141,7 +141,7 @@ steps: # podSpec: # priorityClassName: perf-benchmark # containers: - # - image: openmmlab/lmdeploy:v0.5.0 + # - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 # command: # - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh # resources: @@ -155,6 +155,8 @@ steps: # env: # - name: VLLM_USAGE_SOURCE # value: ci-test + # - name: TEST_SELECTOR + # value: llama70B_tp4 # - name: HF_HOME # value: /root/.cache/huggingface # - name: VLLM_SOURCE_CODE_LOC diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 59e129f7b9f52..25b1bae78e6b6 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -100,6 +100,7 @@ run_serving_tests() { # run the server + python -m pip install transformers -U echo "Running test case $test_name" bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" From 646114d299827043451fbb4a11710a1bd0405261 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 14:53:38 -0700 Subject: [PATCH 121/150] move transformers update --- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index 25b1bae78e6b6..312126bb56b3f 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -99,8 +99,6 @@ run_serving_tests() { cd $VLLM_SOURCE_CODE_LOC/benchmarks - # run the server - python -m pip install transformers -U echo "Running test case $test_name" bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params" @@ -119,8 +117,6 @@ run_serving_tests() { cd $VLLM_SOURCE_CODE_LOC/benchmarks rm -rf /tokenizer_cache mkdir /tokenizer_cache - # update transformers package, to make sure mixtral tokenizer is available - python -m pip install transformers -U python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \ --model "$model" \ --cachedir /tokenizer_cache @@ -205,6 +201,9 @@ main() { mkdir -p $RESULTS_FOLDER BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/ + # update transformers package, to make sure mixtral tokenizer is available + python -m pip install transformers -U + export CURRENT_LLM_SERVING_ENGINE=trt run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json python -m pip install tabulate pandas From b6058aa91596c87fdb567d84f3d4f19d96251385 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 14:56:58 -0700 Subject: [PATCH 122/150] typo fix --- .buildkite/nightly-benchmarks/scripts/launch-trt-server.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh index 26d3ca610af81..f8262653a6628 100644 --- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh +++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh @@ -68,7 +68,7 @@ if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then else - echo "Key 'fp8' exists in common params. Use convert_checkpoint.py" + echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py" python3 convert_checkpoint.py \ --model_dir ${model_path} \ --dtype ${model_dtype} \ From ac4d13774b7f8623801c2e1d77b345dfec6569c6 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 16:14:46 -0700 Subject: [PATCH 123/150] bring back the full test suite --- .../nightly-descriptions.md | 2 +- .../nightly-benchmarks/nightly-pipeline.yaml | 351 +++++++++--------- .../scripts/run-trt-nightly.sh | 1 - .../scripts/summary-nightly-results.py | 2 +- 4 files changed, 176 insertions(+), 180 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index eaa59309c0957..58a3fb2c07833 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -34,7 +34,7 @@ Check Date: Thu, 4 Jul 2024 22:34:59 -0700 Subject: [PATCH 124/150] remove wrongfully-added results --- benchmarks/results/trt_llama8B_tp1_qps_8.commands | 6 ------ benchmarks/results/trt_nightly_results.md | 1 - 2 files changed, 7 deletions(-) delete mode 100644 benchmarks/results/trt_llama8B_tp1_qps_8.commands delete mode 100644 benchmarks/results/trt_nightly_results.md diff --git a/benchmarks/results/trt_llama8B_tp1_qps_8.commands b/benchmarks/results/trt_llama8B_tp1_qps_8.commands deleted file mode 100644 index e0312b4e22dc2..0000000000000 --- a/benchmarks/results/trt_llama8B_tp1_qps_8.commands +++ /dev/null @@ -1,6 +0,0 @@ -{ - "server_command": "", - "client_command": "python3 benchmark_serving.py --backend tensorrt-llm --tokenizer /tokenizer_cache --model meta-llama/Meta-Llama-3-8B --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200 --port 8000 --save-result --result-dir results/ --result-filename trt_llama8B_tp1_qps_8.json --request-rate 8 --endpoint /v2/models/ensemble/generate_stream", - "gpu_type": "A100-SXM4-80GB", - "engine": "trt" -} diff --git a/benchmarks/results/trt_nightly_results.md b/benchmarks/results/trt_nightly_results.md deleted file mode 100644 index 3befc74903d68..0000000000000 --- a/benchmarks/results/trt_nightly_results.md +++ /dev/null @@ -1 +0,0 @@ -| trt_llama8B_tp1_qps_8 | A100-SXM4-80GB | 200 | 6.49609 | 60.3214 | 55.7321 | 119.186 | 15.6021 | 14.187 | 56.2925 | trt | From b012f719f26e7b6f4bad17c2270c4007ad0adf84 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Thu, 4 Jul 2024 23:01:24 -0700 Subject: [PATCH 125/150] adjust plotting & provide more details in nightly description --- .../nightly-descriptions.md | 30 ++++++++----------- .../scripts/plot-nightly-results.py | 1 + README.md | 2 ++ 3 files changed, 16 insertions(+), 17 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 58a3fb2c07833..08e8b0c02543e 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -6,7 +6,7 @@ The main goal of this benchmarking is two-fold: - Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md](). -## Versions +## Docker images We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images: - vllm/vllm-openai:v0.5.0.post1 @@ -14,33 +14,29 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima - openmmlab/lmdeploy:v0.5.0 - ghcr.io/huggingface/text-generation-inference:2.1 -Check nightly-pipeline.yaml artifact for more details. + -## Workload description - -We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - -- Input length: randomly sample 1000 prompts from ShareGPT dataset (with fixed random seed). -- Output length: the corresponding output length of these 1000 prompts. -- Batch size: dynamically determined by vllm and the arrival pattern of the requests. -- Average QPS (query per second): 4 for 8B model and 2 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. -- Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std). +## Hardware -Check nightly-tests.json artifact for more details. +One AWS node with 8x NVIDIA A100 GPUs. -## Known issues +## Workload description -- TGI v2.1 crashes when running mixtral model, see [tgi issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122) -- Pin the transformers library to 4.41.2 to avoid lmdeploy missing cache_position error, see [lmdeploy issue 1885](https://github.com/InternLM/lmdeploy/issues/1885). +We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: +- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed). +- Output length: the corresponding output length of these 500 prompts. +- Models: llama-3 8B, llama-3 70B, mixtral 8x7B. +- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). +- Evaluation metrics: Throughput, TTFT (time to the first token), ITL (inter-token latency). + ## Plots -In the following plots, the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed. +In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed. Benchmarking results diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 1641be259e06c..9c93d654d1926 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -92,6 +92,7 @@ def main(args): ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") + ax.grid() fig.tight_layout(pad=0.6) fig.savefig("nightly_results.png", bbox_inches='tight') diff --git a/README.md b/README.md index 3e0da945d9be8..879d47fefe0f4 100644 --- a/README.md +++ b/README.md @@ -52,6 +52,8 @@ vLLM is fast with: - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache - Optimized CUDA kernels +**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)). + vLLM is flexible and easy to use with: - Seamless integration with popular Hugging Face models From 4def3026f196f351ac45a71a009666a48576ad7d Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:03:42 -0700 Subject: [PATCH 126/150] adjust figure -- add grid, bar plot, color, +throughput --- .../scripts/plot-nightly-results.py | 50 +++++++++++++------ .../scripts/summary-nightly-results.py | 2 + 2 files changed, 36 insertions(+), 16 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 9c93d654d1926..c67b1b8414b70 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -52,11 +52,11 @@ def main(args): plt.rcParams.update({'font.size': 15}) # plot results - fig, axes = plt.subplots(3, 2, figsize=(10, 12)) + fig, axes = plt.subplots(3, 3, figsize=(10, 12)) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): - means, stds = [], [] + means, stds = [], [], [] for method in methods: target = df['Test name'].str.contains(model) target = target & df['Engine'].str.contains(method) @@ -75,24 +75,42 @@ def main(args): ax = axes[i, j] - ax.errorbar(["vllm", "trt", "lmdeploy", "tgi"], - means, - yerr=stds, - fmt='o', - capsize=5) + ax.bar(["vllm", "trt", "lmdeploy", "tgi"], + means, + yerr=stds, + capsize=5, + colors=['#E69F00', '#56B4E9','#D55E00', '#009E73']) ax.set_ylim(bottom=0) - # for i, (method, mean, std) in enumerate(zip(method, means, stds)): - # ax.text( - # i - 0.2, mean, # Adjust position above the error bar - # f'{mean:.0f}', - # ha='center', - # va='bottom' - # ) - ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric} comparison") - ax.grid() + ax.grid(axis='y') + + metric = "Tput" + j = 2 + if True: + tputs = [] + for method in methods: + target = df['Test name'].str.contains(model) + target = target & df['Engine'].str.contains(method) + filtered_df = df[target] + + if filtered_df.empty: + tputs.append(0.) + else: + tputs.append(filtered_df["Input Tput (tok/s)"].values[0] + filtered_df["Output Tput (tok/s)"].values[0]) + + ax = axes[i, j] + + ax.bar(["vllm", "trt", "lmdeploy", "tgi"], + tputs, + colors=['#E69F00', '#56B4E9','#D55E00', '#009E73']) + ax.set_ylim(bottom=0) + + ax.set_ylabel(f"Tput (token/s)") + ax.set_title(f"{model} {metric} comparison") + ax.grid(axis='y') + fig.tight_layout(pad=0.6) fig.savefig("nightly_results.png", bbox_inches='tight') diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py index c78c831eaab7b..782d1ef9aab98 100644 --- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py @@ -19,6 +19,8 @@ "std_ttft_ms": "Std TTFT (ms)", "mean_itl_ms": "Mean ITL (ms)", "std_itl_ms": "Std ITL (ms)", + "input_throughput": "Input Tput (tok/s)", + "output_throughput": "Output Tput (tok/s)", "engine": "Engine", } From 6b77d2bf8a0eedcd0e5939eb1a4376bdc2111bef Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:12:37 -0700 Subject: [PATCH 127/150] typo fix --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index c67b1b8414b70..74f10c467c186 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -56,7 +56,7 @@ def main(args): methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): - means, stds = [], [], [] + means, stds = [], [] for method in methods: target = df['Test name'].str.contains(model) target = target & df['Engine'].str.contains(method) From 8c0259ccb5e2c3ada603ba3f53564b172657e61c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:19:22 -0700 Subject: [PATCH 128/150] bug fix: set color using attribute --- .../scripts/plot-nightly-results.py | 21 ++++++++++++------- 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 74f10c467c186..e2b12a948e074 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -26,6 +26,7 @@ def parse_arguments(): def main(args): + bar_colors = ['#E69F00', '#56B4E9','#D55E00', '#009E73'] results_folder = Path(args.results_folder) results = [] @@ -75,11 +76,13 @@ def main(args): ax = axes[i, j] - ax.bar(["vllm", "trt", "lmdeploy", "tgi"], - means, - yerr=stds, - capsize=5, - colors=['#E69F00', '#56B4E9','#D55E00', '#009E73']) + bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"], + means, + yerr=stds, + capsize=5, + ) + for idx, bar in enumerate(bars): + bar.set_color(bar_colors[idx]) ax.set_ylim(bottom=0) ax.set_ylabel(f"{metric} (ms)") @@ -102,9 +105,11 @@ def main(args): ax = axes[i, j] - ax.bar(["vllm", "trt", "lmdeploy", "tgi"], - tputs, - colors=['#E69F00', '#56B4E9','#D55E00', '#009E73']) + bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"], + tputs,) + for idx, bar in enumerate(bars): + bar.set_color(bar_colors[idx]) + ax.set_ylim(bottom=0) ax.set_ylabel(f"Tput (token/s)") From 2ee07df2f9d041a810630993ebaf6aa0e9f2114a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:25:27 -0700 Subject: [PATCH 129/150] mute curl output --- it's getting toooo long --- .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +- .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh index 0f6393b2f7784..d6f112aaa42fd 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh @@ -46,7 +46,7 @@ wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes timeout 1200 bash -c ' - until curl localhost:8000/v1/completions; do + until curl -s localhost:8000/v1/completions > /dev/null; do sleep 1 done' && return 0 || return 1 } diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh index 492fb5260ed70..fed03654f8b77 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh @@ -44,7 +44,7 @@ json2args() { wait_for_server() { timeout 1200 bash -c ' - until curl localhost:8000/generate_stream; do + until curl -s localhost:8000/generate_stream > /dev/null; do sleep 1 done' && return 0 || return 1 } diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh index bec59dad6e55c..4a82b9ec64d71 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh @@ -44,7 +44,7 @@ json2args() { wait_for_server() { timeout 1200 bash -c ' - until curl localhost:8000/generate_stream; do + until curl -s localhost:8000/generate_stream > /dev/null; do sleep 1 done' && return 0 || return 1 } diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh index abed7cbb67348..663045b8a9122 100644 --- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh +++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh @@ -50,7 +50,7 @@ wait_for_server() { # wait for vllm server to start # return 1 if vllm server crashes timeout 1200 bash -c ' - until curl localhost:8000/v1/completions; do + until curl -s localhost:8000/v1/completions > /dev/null; do sleep 1 done' && return 0 || return 1 } From 9547066fd3de7c2dbf3155a71733bf15360534ee Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:38:20 -0700 Subject: [PATCH 130/150] adjust coloring --- .../nightly-benchmarks/scripts/plot-nightly-results.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index e2b12a948e074..c7545cbc79ae8 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -26,7 +26,7 @@ def parse_arguments(): def main(args): - bar_colors = ['#E69F00', '#56B4E9','#D55E00', '#009E73'] + bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00'] results_folder = Path(args.results_folder) results = [] @@ -53,7 +53,7 @@ def main(args): plt.rcParams.update({'font.size': 15}) # plot results - fig, axes = plt.subplots(3, 3, figsize=(10, 12)) + fig, axes = plt.subplots(3, 3, figsize=(14, 14)) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): @@ -74,7 +74,7 @@ def main(args): print(means, stds) - ax = axes[i, j] + ax = axes[i, j+1] bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"], means, @@ -90,7 +90,7 @@ def main(args): ax.grid(axis='y') metric = "Tput" - j = 2 + j = 0 if True: tputs = [] for method in methods: From a3085a10f67817fbb2bbf6d197b5ae1e22664092 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:42:58 -0700 Subject: [PATCH 131/150] increase font size, adjust coloring --- .buildkite/nightly-benchmarks/nightly-descriptions.md | 2 +- .../nightly-benchmarks/scripts/plot-nightly-results.py | 5 +++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md index 08e8b0c02543e..c3d3cbf473968 100644 --- a/.buildkite/nightly-benchmarks/nightly-descriptions.md +++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md @@ -30,7 +30,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload: - Output length: the corresponding output length of these 500 prompts. - Models: llama-3 8B, llama-3 70B, mixtral 8x7B. - Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed). -- Evaluation metrics: Throughput, TTFT (time to the first token), ITL (inter-token latency). +- Evaluation metrics: Throughput (higher the better), TTFT (time to the first token, lower the better), ITL (inter-token latency, lower the better). diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index c7545cbc79ae8..3e5bd72f56de2 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -50,10 +50,11 @@ def main(args): with open("nightly_results.md", "w") as f: f.write(description) - plt.rcParams.update({'font.size': 15}) + plt.rcParams.update({'font.size': 20}) # plot results fig, axes = plt.subplots(3, 3, figsize=(14, 14)) + fig.subplots_adjust(hspace=0.5) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): @@ -117,7 +118,7 @@ def main(args): ax.grid(axis='y') - fig.tight_layout(pad=0.6) + fig.tight_layout() fig.savefig("nightly_results.png", bbox_inches='tight') From 0a554aef2e26914301a6fc3f309f10f1d9f7aae8 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:43:58 -0700 Subject: [PATCH 132/150] adjust font size --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 3e5bd72f56de2..cf9042a8c14d2 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -50,7 +50,7 @@ def main(args): with open("nightly_results.md", "w") as f: f.write(description) - plt.rcParams.update({'font.size': 20}) + plt.rcParams.update({'font.size': 18}) # plot results fig, axes = plt.subplots(3, 3, figsize=(14, 14)) From c6c9292003fa01a1a7ab13d45ce1318772993deb Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:45:35 -0700 Subject: [PATCH 133/150] adjust spacing --- .../nightly-benchmarks/scripts/plot-nightly-results.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index cf9042a8c14d2..7a860261593bf 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -53,8 +53,8 @@ def main(args): plt.rcParams.update({'font.size': 18}) # plot results - fig, axes = plt.subplots(3, 3, figsize=(14, 14)) - fig.subplots_adjust(hspace=0.5) + fig, axes = plt.subplots(3, 3, figsize=(15, 14)) + fig.subplots_adjust(hspace=1) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): for j, metric in enumerate(["TTFT", "ITL"]): @@ -73,6 +73,7 @@ def main(args): success = filtered_df["Successful req."].values[0] stds.append(std / math.sqrt(success)) + print(model, metric) print(means, stds) ax = axes[i, j+1] @@ -103,6 +104,9 @@ def main(args): tputs.append(0.) else: tputs.append(filtered_df["Input Tput (tok/s)"].values[0] + filtered_df["Output Tput (tok/s)"].values[0]) + + print(model, metric) + print(tputs) ax = axes[i, j] From 4788d27f4c90d37129614849a0ea67c56796cca3 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:46:55 -0700 Subject: [PATCH 134/150] increase font size --- .../nightly-benchmarks/scripts/plot-nightly-results.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 7a860261593bf..13f87ae5a222a 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -50,10 +50,10 @@ def main(args): with open("nightly_results.md", "w") as f: f.write(description) - plt.rcParams.update({'font.size': 18}) + plt.rcParams.update({'font.size': 20}) # plot results - fig, axes = plt.subplots(3, 3, figsize=(15, 14)) + fig, axes = plt.subplots(3, 3, figsize=(16, 14)) fig.subplots_adjust(hspace=1) methods = ["vllm", "trt", "lmdeploy", "tgi"] for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]): @@ -88,7 +88,7 @@ def main(args): ax.set_ylim(bottom=0) ax.set_ylabel(f"{metric} (ms)") - ax.set_title(f"{model} {metric} comparison") + ax.set_title(f"{model} {metric}") ax.grid(axis='y') metric = "Tput" @@ -118,7 +118,7 @@ def main(args): ax.set_ylim(bottom=0) ax.set_ylabel(f"Tput (token/s)") - ax.set_title(f"{model} {metric} comparison") + ax.set_title(f"{model} {metric}") ax.grid(axis='y') From ccc160ceaf9248d488fa860f8ce6d8b0d5763c43 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:47:37 -0700 Subject: [PATCH 135/150] increase cap size --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index 13f87ae5a222a..a53d9570dac23 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -81,7 +81,7 @@ def main(args): bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"], means, yerr=stds, - capsize=5, + capsize=10, ) for idx, bar in enumerate(bars): bar.set_color(bar_colors[idx]) From b6c557211f7c4eb3919a54f6a55f087ad11307a4 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Sat, 6 Jul 2024 23:51:24 -0700 Subject: [PATCH 136/150] make yapf and ruff happy --- .../scripts/plot-nightly-results.py | 30 +++++++++++-------- 1 file changed, 17 insertions(+), 13 deletions(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index a53d9570dac23..b57e2d384e744 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -76,12 +76,13 @@ def main(args): print(model, metric) print(means, stds) - ax = axes[i, j+1] + ax = axes[i, j + 1] - bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"], - means, - yerr=stds, - capsize=10, + bars = ax.bar( + ["vllm", "trt", "lmdeploy", "tgi"], + means, + yerr=stds, + capsize=10, ) for idx, bar in enumerate(bars): bar.set_color(bar_colors[idx]) @@ -90,7 +91,7 @@ def main(args): ax.set_ylabel(f"{metric} (ms)") ax.set_title(f"{model} {metric}") ax.grid(axis='y') - + metric = "Tput" j = 0 if True: @@ -103,24 +104,27 @@ def main(args): if filtered_df.empty: tputs.append(0.) else: - tputs.append(filtered_df["Input Tput (tok/s)"].values[0] + filtered_df["Output Tput (tok/s)"].values[0]) + input_tput = filtered_df["Input Tput (tok/s)"].values[0] + output_tput = filtered_df["Output Tput (tok/s)"].values[0] + tputs.append(input_tput + output_tput) print(model, metric) print(tputs) - + ax = axes[i, j] - bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"], - tputs,) + bars = ax.bar( + ["vllm", "trt", "lmdeploy", "tgi"], + tputs, + ) for idx, bar in enumerate(bars): bar.set_color(bar_colors[idx]) - + ax.set_ylim(bottom=0) - ax.set_ylabel(f"Tput (token/s)") + ax.set_ylabel("Tput (token/s)") ax.set_title(f"{model} {metric}") ax.grid(axis='y') - fig.tight_layout() fig.savefig("nightly_results.png", bbox_inches='tight') From 13d8c0479f218528dd149362d393deae869fb0bd Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 16:31:27 -0700 Subject: [PATCH 137/150] allow running performance benchmark & nightly benchmark simultaneously --- .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index 8a4f852477713..3c2c0770e52cf 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -10,7 +10,6 @@ apt install -y curl jq # Install minijinja for templating curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh source $HOME/.cargo/env -target_yaml_file="" # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then @@ -18,16 +17,11 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." - target_yaml_file=".buildkite/nightly-benchmarks/benchmark-pipeline.yaml" + buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml fi if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." - target_yaml_file=".buildkite/nightly-benchmarks/nightly-pipeline.yaml" + buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml fi fi - -if [ -n "$target_yaml_file" ]; then - # Upload sample.yaml - buildkite-agent pipeline upload $target_yaml_file -fi From 4d77e8f896f94a5c467a1596030807f06d419d7a Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 16:34:13 -0700 Subject: [PATCH 138/150] adjust the annotation context for nightly benchmark so that it does not overlap with performance benchmark --- .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh index 99f1548039ab6..1168912c6e229 100644 --- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh +++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh @@ -34,7 +34,7 @@ main() { /workspace/buildkite-agent artifact upload "nightly_results.png" /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json - /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md + /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md } main "$@" \ No newline at end of file From da41c537a5d20256e8c733f22b27acaca654f190 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 16:53:09 -0700 Subject: [PATCH 139/150] cut redundant lines in nightly-pipeline.yaml using yaml anchor --- .../nightly-benchmarks/nightly-pipeline.yaml | 272 ++++-------------- 1 file changed, 48 insertions(+), 224 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index c58f99cde40c6..75ea50d10ad95 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,136 +1,52 @@ +common: &common + priorityClassName: perf-benchmark + containers: + - command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + steps: - - label: "A100 trt benchmark llama8B" + - block: ":rocket: Ready for comparing vllm against alternatives?" + - label: "A100 trt benchmark" priority: 100 agents: queue: A100 plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark + <<: *common containers: - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: TEST_SELECTOR - value: llama8B_tp1 - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - - label: "A100 trt benchmark mixtral8x7B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: TEST_SELECTOR - value: mixtral8x7B_tp2 - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - - label: "A100 trt benchmark llama70B" - priority: 100 - agents: - queue: A100 - plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: TEST_SELECTOR - value: llama70B_tp4 - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + - label: "A100 lmdeploy benchmark" priority: 100 agents: @@ -138,41 +54,10 @@ steps: plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark + <<: *common containers: - image: openmmlab/lmdeploy:v0.5.0 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + - label: "A100 vllm benchmark" priority: 100 agents: @@ -180,41 +65,10 @@ steps: plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark + <<: *common containers: - image: vllm/vllm-openai:latest - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + - label: "A100 tgi benchmark" priority: 100 agents: @@ -222,42 +76,12 @@ steps: plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark + <<: *common containers: - image: ghcr.io/huggingface/text-generation-inference:2.1 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + - wait + - label: "Plot" priority: 100 agents: @@ -291,4 +115,4 @@ steps: volumes: - name: devshm emptyDir: - medium: Memory \ No newline at end of file + medium: Memory \ No newline at end of file From c1080840ebfe93c8abd707b8e944e388f387b9c5 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 16:56:08 -0700 Subject: [PATCH 140/150] add dpi=400 --- .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py index b57e2d384e744..e5cfcc64a9b2a 100644 --- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py +++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py @@ -127,7 +127,7 @@ def main(args): ax.grid(axis='y') fig.tight_layout() - fig.savefig("nightly_results.png", bbox_inches='tight') + fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400) if __name__ == '__main__': From 57e678327bbbb2bf0ae9b7d22aeaf8d01ed0837c Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 17:03:27 -0700 Subject: [PATCH 141/150] adjust pipeline upload order --- .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index 3c2c0770e52cf..29af45013aa87 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -15,13 +15,16 @@ source $HOME/.cargo/env if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') + if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then + echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." + buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml + fi + + # Run performance benchmark first by upload it at last + # See https://buildkite.com/docs/agent/v3/cli-pipeline if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml fi - if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then - echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." - buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml - fi fi From b057b4bb13c40be2b2258848149c9c724b2f2a13 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 17:38:12 -0700 Subject: [PATCH 142/150] merge two pipelines using yq --- .../nightly-benchmarks/kickoff-pipeline.sh | 20 ++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index 29af45013aa87..692506532a011 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -6,6 +6,8 @@ set -euo pipefail # Install system packages apt update apt install -y curl jq +# install yq +add-apt-repository ppa:rmescandon/yq -y && apt update && apt install yq -y # Install minijinja for templating curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh @@ -15,16 +17,20 @@ source $HOME/.cargo/env if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') - if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then - echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." - buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml - fi + touch final.yaml - # Run performance benchmark first by upload it at last - # See https://buildkite.com/docs/agent/v3/cli-pipeline if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." - buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml + # append benchmark-pipeline.yaml to the end of final.yaml + yq 'load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml") * load("final.yaml")' > final.yaml + fi + + if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then + echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." + # append nightly-pipeline.yaml to the end of final.yaml + yq 'load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml") * load("final.yaml")' > final.yaml fi + buildkite-agent pipeline upload final.yaml + fi From 1053900ed75b05a87bd767fcc5dc0dc978817b44 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 18:22:39 -0700 Subject: [PATCH 143/150] adjust merging logic --- .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index 692506532a011..d8edf39a5882b 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -21,16 +21,18 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." - # append benchmark-pipeline.yaml to the end of final.yaml - yq 'load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml") * load("final.yaml")' > final.yaml + yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml fi + cat final.yaml + if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." - # append nightly-pipeline.yaml to the end of final.yaml - yq 'load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml") * load("final.yaml")' > final.yaml + yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml fi + cat final.yaml + buildkite-agent pipeline upload final.yaml fi From 5ef7e8a974bdc93474436c1597c73c4531378950 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 18:23:54 -0700 Subject: [PATCH 144/150] put blocking step as the first step --- .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh index d8edf39a5882b..441ee58f082c2 100755 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh @@ -19,16 +19,17 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then touch final.yaml - if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then - echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." - yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml + # put blocking step (the nightly benchmark) as the first step + if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then + echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." + yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml fi cat final.yaml - if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then - echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." - yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml + if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then + echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." + yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml fi cat final.yaml From bbe115db7e796b83237ab6f2abbe49eca4b40483 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 18:51:58 -0700 Subject: [PATCH 145/150] this file has been moved to vllm-project/buildkite-ci. Remove it. --- .../nightly-benchmarks/kickoff-pipeline.sh | 39 ------------------- 1 file changed, 39 deletions(-) delete mode 100755 .buildkite/nightly-benchmarks/kickoff-pipeline.sh diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh deleted file mode 100755 index 441ee58f082c2..0000000000000 --- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh +++ /dev/null @@ -1,39 +0,0 @@ -#!/usr/bin/env bash - -# NOTE(simon): this script runs inside a buildkite agent with CPU only access. -set -euo pipefail - -# Install system packages -apt update -apt install -y curl jq -# install yq -add-apt-repository ppa:rmescandon/yq -y && apt update && apt install yq -y - -# Install minijinja for templating -curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh -source $HOME/.cargo/env - -# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq -if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then - PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name') - - touch final.yaml - - # put blocking step (the nightly benchmark) as the first step - if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then - echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks." - yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml - fi - - cat final.yaml - - if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then - echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks." - yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml - fi - - cat final.yaml - - buildkite-agent pipeline upload final.yaml - -fi From fb1e3926ed02306796e91cf26d0697d01402011b Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 18:53:13 -0700 Subject: [PATCH 146/150] add warning message --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 75ea50d10ad95..2d33579c37364 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -35,7 +35,7 @@ common: &common type: Directory steps: - - block: ":rocket: Ready for comparing vllm against alternatives?" + - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - label: "A100 trt benchmark" priority: 100 agents: From 50ed6b7a4a0a8451edcbd77cfc3a142cba4851d0 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Tue, 9 Jul 2024 18:57:11 -0700 Subject: [PATCH 147/150] add a wait at the end, essential when merging multiple yaml files --- .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index 2d33579c37364..de620b9f107a5 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -115,4 +115,6 @@ steps: volumes: - name: devshm emptyDir: - medium: Memory \ No newline at end of file + medium: Memory + + - wait \ No newline at end of file From 9758f94b07c39e3b0a716758dbb41596ef260781 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 10 Jul 2024 17:12:44 -0700 Subject: [PATCH 148/150] adjust pipeline.yaml --- .../nightly-benchmarks/nightly-pipeline.yaml | 178 +++++++++++++----- 1 file changed, 132 insertions(+), 46 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index de620b9f107a5..c12841b0c03cd 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,39 +1,3 @@ -common: &common - priorityClassName: perf-benchmark - containers: - - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - steps: - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - label: "A100 trt benchmark" @@ -43,10 +7,41 @@ steps: plugins: - kubernetes: podSpec: - <<: *common + priorityClassName: perf-benchmark containers: - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory - label: "A100 lmdeploy benchmark" priority: 100 agents: @@ -54,10 +49,41 @@ steps: plugins: - kubernetes: podSpec: - <<: *common + priorityClassName: perf-benchmark containers: - image: openmmlab/lmdeploy:v0.5.0 - + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory - label: "A100 vllm benchmark" priority: 100 agents: @@ -65,10 +91,41 @@ steps: plugins: - kubernetes: podSpec: - <<: *common + priorityClassName: perf-benchmark containers: - image: vllm/vllm-openai:latest - + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory - label: "A100 tgi benchmark" priority: 100 agents: @@ -76,12 +133,42 @@ steps: plugins: - kubernetes: podSpec: - <<: *common + priorityClassName: perf-benchmark containers: - image: ghcr.io/huggingface/text-generation-inference:2.1 - + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory - wait - - label: "Plot" priority: 100 agents: @@ -116,5 +203,4 @@ steps: - name: devshm emptyDir: medium: Memory - - wait \ No newline at end of file From 8608d17644bfb113912b696d86369415e75908ee Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 10 Jul 2024 17:19:17 -0700 Subject: [PATCH 149/150] adjust pipeline.yaml --- .../nightly-benchmarks/nightly-pipeline.yaml | 160 +++--------------- 1 file changed, 19 insertions(+), 141 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index c12841b0c03cd..d7e3254407a2f 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,6 +1,8 @@ -steps: +steps: - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - - label: "A100 trt benchmark" + + - &benchmark_template # Anchor for the repeated structure + label: "A100 trt benchmark" priority: 100 agents: queue: A100 @@ -42,165 +44,41 @@ steps: hostPath: path: /root/.cache/huggingface type: Directory - - label: "A100 lmdeploy benchmark" - priority: 100 - agents: - queue: A100 + + - <<: *benchmark_template # Using alias to repeat the structure + label: "A100 lmdeploy benchmark" plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark containers: - image: openmmlab/lmdeploy:v0.5.0 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - - label: "A100 vllm benchmark" - priority: 100 - agents: - queue: A100 + + - <<: *benchmark_template # Reuse the template + label: "A100 vllm benchmark" plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark containers: - image: vllm/vllm-openai:latest - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory - - label: "A100 tgi benchmark" - priority: 100 - agents: - queue: A100 + + - <<: *benchmark_template # Reuse the template + label: "A100 tgi benchmark" plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark containers: - image: ghcr.io/huggingface/text-generation-inference:2.1 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + - wait - - label: "Plot" - priority: 100 - agents: - queue: A100 + + - <<: *benchmark_template # Reuse the template for the plot + label: "Plot" plugins: - kubernetes: podSpec: - priorityClassName: perf-benchmark containers: - image: vllm/vllm-openai:v0.5.0.post1 command: - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - wait \ No newline at end of file + + - wait From 37c4c118ea79a911ad253f3cef386b6ae0859812 Mon Sep 17 00:00:00 2001 From: Kuntai Du Date: Wed, 10 Jul 2024 19:24:07 -0700 Subject: [PATCH 150/150] fix pipeline yaml --- .../nightly-benchmarks/nightly-pipeline.yaml | 172 +++++++++++------- 1 file changed, 104 insertions(+), 68 deletions(-) diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml index d7e3254407a2f..6e399bb936fbc 100644 --- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml +++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml @@ -1,84 +1,120 @@ +common_pod_spec: &common_pod_spec + priorityClassName: perf-benchmark + nodeSelector: + nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB + volumes: + - name: devshm + emptyDir: + medium: Memory + - name: hf-cache + hostPath: + path: /root/.cache/huggingface + type: Directory + +common_container_settings: &common_container_settings + command: + - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + - name: hf-cache + mountPath: /root/.cache/huggingface + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: HF_HOME + value: /root/.cache/huggingface + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token + steps: - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours." - - - &benchmark_template # Anchor for the repeated structure - label: "A100 trt benchmark" + - label: "A100 trt benchmark" priority: 100 agents: queue: A100 plugins: - - kubernetes: - podSpec: - priorityClassName: perf-benchmark - containers: - - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 - command: - - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh - resources: - limits: - nvidia.com/gpu: 8 - volumeMounts: - - name: devshm - mountPath: /dev/shm - - name: hf-cache - mountPath: /root/.cache/huggingface - env: - - name: VLLM_USAGE_SOURCE - value: ci-test - - name: HF_HOME - value: /root/.cache/huggingface - - name: VLLM_SOURCE_CODE_LOC - value: /workspace/build/buildkite/vllm/performance-benchmark - - name: HF_TOKEN - valueFrom: - secretKeyRef: - name: hf-token-secret - key: token - nodeSelector: - nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB - volumes: - - name: devshm - emptyDir: - medium: Memory - - name: hf-cache - hostPath: - path: /root/.cache/huggingface - type: Directory + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3 + <<: *common_container_settings - - <<: *benchmark_template # Using alias to repeat the structure - label: "A100 lmdeploy benchmark" + - label: "A100 lmdeploy benchmark" + priority: 100 + agents: + queue: A100 plugins: - - kubernetes: - podSpec: - containers: - - image: openmmlab/lmdeploy:v0.5.0 + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: openmmlab/lmdeploy:v0.5.0 + <<: *common_container_settings + - - <<: *benchmark_template # Reuse the template - label: "A100 vllm benchmark" + - label: "A100 vllm benchmark" + priority: 100 + agents: + queue: A100 plugins: - - kubernetes: - podSpec: - containers: - - image: vllm/vllm-openai:latest + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:latest + <<: *common_container_settings - - <<: *benchmark_template # Reuse the template - label: "A100 tgi benchmark" + - label: "A100 tgi benchmark" + priority: 100 + agents: + queue: A100 plugins: - - kubernetes: - podSpec: - containers: - - image: ghcr.io/huggingface/text-generation-inference:2.1 - + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: ghcr.io/huggingface/text-generation-inference:2.1 + <<: *common_container_settings + - wait - - <<: *benchmark_template # Reuse the template for the plot - label: "Plot" + - label: "Plot" + priority: 100 + agents: + queue: A100 plugins: - - kubernetes: - podSpec: - containers: - - image: vllm/vllm-openai:v0.5.0.post1 - command: - - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + - kubernetes: + podSpec: + <<: *common_pod_spec + containers: + - image: vllm/vllm-openai:v0.5.0.post1 + command: + - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh + resources: + limits: + nvidia.com/gpu: 8 + volumeMounts: + - name: devshm + mountPath: /dev/shm + env: + - name: VLLM_USAGE_SOURCE + value: ci-test + - name: VLLM_SOURCE_CODE_LOC + value: /workspace/build/buildkite/vllm/performance-benchmark + - name: HF_TOKEN + valueFrom: + secretKeyRef: + name: hf-token-secret + key: token - - wait + - wait \ No newline at end of file