From f813e2eaefb02ba9517f6d02435e50a83b7e51fd Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 9 Jun 2024 00:58:19 -0700
Subject: [PATCH 001/150] Kuntai: add tgi and trt benchmarking script (initial
 version)

---
 .../nightly-benchmarks/run-tgi-benchmarks.sh  |  72 +++++++++++++
 .../nightly-benchmarks/run-trt-benchmarks.sh  | 102 ++++++++++++++++++
 2 files changed, 174 insertions(+)
 create mode 100644 .buildkite/nightly-benchmarks/run-tgi-benchmarks.sh
 create mode 100644 .buildkite/nightly-benchmarks/run-trt-benchmarks.sh
diff --git a/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh b/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh
new file mode 100644
index 0000000000000..27f0fe57f3716
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh
@@ -0,0 +1,72 @@
+#!/bin/bash
+
+# This script should be run inside the tgi container. Enter the latest tgi container by
+# docker run -it --gpus all -e "HF_TOKEN=<your HF TOKEN>"   --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0
+# (please modify `<your HF TOKEN>` to your own huggingface token in the above command)
+# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash.
+# Benchmarking results are written to /benchmark_serving.txt and /benchmark_results.md (the script runs from /).
+# NOTE: this script gradually reduces the request rate from 20, to ensure all requests are successful.
+
+set -ex
+set -o pipefail
+
+# install conda
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3
+~/miniconda3/bin/conda init bash
+eval "$(cat ~/.bashrc | tail -n +15)"
+
+# create conda environment for vllm
+conda create -n vllm python=3.9 -y
+eval "$(conda shell.bash hook)"
+conda activate vllm
+pip install vllm
+
+# clone vllm repo
+cd /
+git clone https://github.com/vllm-project/vllm.git
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# launch TGI server
+/tgi-entrypoint.sh --port 8000 --model-id meta-llama/Llama-2-7b-chat-hf &
+tgi_pid=$!
+timeout 600 bash -c 'until curl localhost:8000/generate_stream; do sleep 1; done' || exit 1
+
+# gradually reduce the request rate from 20 until all requests succeed
+request_rate=20
+get_successful_requests() { # prints the last "Successful requests:" count from benchmark_serving.txt, or 0 if none was logged
+  awk '/Successful requests:/ {n = $3} END {print n + 0}' benchmark_serving.txt
+}
+while true; do
+  echo "Running benchmark with request rate $request_rate..."
+  # Catch a failing run with `||`: under `set -e` + `pipefail` it would otherwise abort here, skipping the retry and leaving the server running.
+  python3 vllm/benchmarks/benchmark_serving.py --backend tgi --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --endpoint /generate_stream --request-rate "$request_rate" --port 8000 --save-result 2>&1 | tee benchmark_serving.txt || echo "benchmark_serving.py pipeline exited with status $?"
+  successful_requests=$(get_successful_requests) || successful_requests=0 # no summary line counts as zero successes
+  echo "Successful requests: $successful_requests"
+  if [ "${successful_requests:-0}" -eq 1000 ]; then
+    echo "Reached 1000 successful requests with request rate $request_rate"
+    break
+  fi
+  request_rate=$((request_rate - 1))
+  if [ "$request_rate" -lt 1 ]; then
+    echo "Request rate went below 1. Exiting."
+    break
+  fi
+done
+kill $tgi_pid
+
+echo "### TGI Serving Benchmarks" >>benchmark_results.md
+sed -n '1p' benchmark_serving.txt >>benchmark_results.md
+echo "" >>benchmark_results.md
+echo '```' >>benchmark_results.md
+tail -n 17 benchmark_serving.txt >>benchmark_results.md
+echo '```' >>benchmark_results.md
+
+# if the agent binary is not found, skip uploading the results, exit 0
+if [ ! -f /workspace/buildkite-agent ]; then
+  exit 0
+fi
+
+# upload the results to buildkite
+/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" <benchmark_results.md
diff --git a/.buildkite/nightly-benchmarks/run-trt-benchmarks.sh b/.buildkite/nightly-benchmarks/run-trt-benchmarks.sh
new file mode 100644
index 0000000000000..6ab565890f9d8
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-trt-benchmarks.sh
@@ -0,0 +1,102 @@
+#!/bin/bash
+
+# This script should be run inside the trt-llm docker container, command:
+# docker run -it --net host -e HF_TOKEN=<your HF TOKEN> --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 --runtime=nvidia --gpus all --entrypoint /bin/bash nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+# (please modify `<your HF TOKEN>` to your own huggingface token in the above command)
+# Then, copy-paste this file into the docker and execute it using bash.
+
+set -xe
+TRT_LLM_VERSION=r24.04
+model_path=meta-llama/llama-2-7b-chat-hf
+model_name=llama-2-7b-chat-hf
+model_type=llama
+model_dtype=float16
+model_tp_size=1
+max_batch_size=233
+max_input_len=15000
+max_output_len=15000
+cd ~
+mkdir models
+cd models
+models_dir=`pwd`
+trt_model_path=${models_dir}/${model_name}-trt-ckpt
+trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+
+
+cd ~
+git clone https://github.com/neuralmagic/tensorrt-demo.git
+cd tensorrt-demo
+tensorrt_demo_dir=`pwd`
+
+# make sure the parameters inside tensorrt-demo's config files are consistent with the variables set above
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
+
+
+cd /
+git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+git lfs install
+cd tensorrtllm_backend
+git checkout $TRT_LLM_VERSION
+tensorrtllm_backend_dir=`pwd`
+
+git submodule update --init --recursive
+cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
+
+cd /tensorrtllm_backend
+cd ./tensorrt_llm/examples/${model_type}
+
+python3 convert_checkpoint.py \
+    --model_dir ${model_path} \
+    --dtype ${model_dtype} \
+    --tp_size ${model_tp_size} \
+    --output_dir ${trt_model_path}
+    
+trtllm-build \
+    --checkpoint_dir=${trt_model_path} \
+    --gpt_attention_plugin=${model_dtype} \
+    --gemm_plugin=${model_dtype} \
+    --remove_input_padding=enable \
+    --paged_kv_cache=enable \
+    --tp_size=${model_tp_size} \
+    --max_batch_size=${max_batch_size} \
+    --max_input_len=${max_input_len} \
+    --max_output_len=${max_output_len} \
+    --max_num_tokens=${max_output_len} \
+    --opt_num_tokens=${max_output_len} \
+    --output_dir=${trt_engine_path} 
+    
+cd /tensorrtllm_backend/triton_model_repo
+cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+cd /tensorrtllm_backend
+python3 scripts/launch_triton_server.py --world_size=${model_tp_size} --model_repo=/tensorrtllm_backend/triton_model_repo &
+
+
+# sleep for 30 seconds, to make sure the server is launched
+sleep 30
+
+
+# install vllm inside conda, for benchmarking.
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3
+~/miniconda3/bin/conda init bash
+eval "$(cat ~/.bashrc | tail -n +15)"
+conda create -n vllm python=3.9 -y
+eval "$(conda shell.bash hook)"
+conda activate vllm
+pip install vllm
+
+# clone vllm's benchmark_serving script
+cd ~
+git clone https://github.com/vllm-project/vllm.git
+cd vllm/benchmarks/
+
+export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+python benchmark_serving.py --backend tensorrt-llm --endpoint /v2/models/ensemble/generate_stream --port 8000 --model $model_path --save-result --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 2>&1 | tee benchmark_serving.txt

From d6cba4653b414192779861c717ddd565e51338e0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 13 Jun 2024 23:13:16 -0700
Subject: [PATCH 002/150] update initial benchmarking script for lmdeploy

---
 .../run-lmdeploy-benchmarks.sh                | 42 +++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 .buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh

diff --git a/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh b/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh
new file mode 100644
index 0000000000000..c1a579eaeefc4
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+
+# This script should be run inside the tgi container. Enter the latest tgi container by
+# docker run --gpus all -e "HF_TOKEN=<your HF TOKEN>" -v ~/.cache/huggingface:/root/.cache/huggingface --entrypoint /bin/bash openmmlab/lmdeploy:latest 
+# lmdeploy serve api_server internlm/internlm2-chat-7b
+# docker run -it --gpus all -e "HF_TOKEN=<your HF TOKEN>"   --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0
+# (please modify `<your HF TOKEN>` to your own huggingface token in the above command
+# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash.
+
+
+
+set -ex
+set -o pipefail
+
+# install conda
+(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
+bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3
+~/miniconda3/bin/conda init bash
+eval "$(cat ~/.bashrc | tail -n +15)"
+
+# create conda environment for vllm
+conda create -n vllm python=3.9 -y
+eval "$(conda shell.bash hook)"
+conda activate vllm
+pip install vllm
+
+# clone vllm repo
+cd /
+git clone https://github.com/vllm-project/vllm.git
+wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+# launch TGI server
+lmdeploy serve api_server meta-llama/Llama-2-7b-hf  --server-port 8000 &
+tgi_pid=$!
+timeout 600 bash -c 'until curl localhost:8000/v1/completion; do sleep 1; done' || exit 1
+
+# gradually reduce the request rate from 20, untill all request successed
+request_rate=20
+echo "Running benchmark with request rate $request_rate..."
+python3 vllm/benchmarks/benchmark_serving.py --backend lmdeploy --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200 --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt
+kill $tgi_pid

From 5d8292bf5a12ff86ac73cea8422b5ce4b292ce31 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 21:09:21 -0700
Subject: [PATCH 003/150] Add download tokenizer script for lmdeploy

---
 .../scripts/download-tokenizer.py              | 18 ++++++++++++++++++
 1 file changed, 18 insertions(+)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/download-tokenizer.py

diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
new file mode 100644
index 0000000000000..140233e5dad91
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -0,0 +1,18 @@
+
+import argparse
+from transformers import AutoTokenizer
+
+def main(model, cachedir):
+    # Load the tokenizer and save it to the specified directory
+    tokenizer = AutoTokenizer.from_pretrained(model)
+    tokenizer.save_pretrained(cachedir)
+    print(f"Tokenizer saved to {cachedir}")
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="Download and save Hugging Face tokenizer")
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument("--cachedir", type=str, required=True, help="Directory to save the tokenizer")
+
+    args = parser.parse_args()
+    main(args.model, args.cachedir)
+    
\ No newline at end of file

From a2dd7c9f3f86c82a0eb392bfb498fff80feb84fa Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 23:45:22 -0700
Subject: [PATCH 004/150] add one-click runnable script for lmdeploy, parse
 tests from json file

---
 .../nightly-benchmarks/run-nightly-suite.sh   |  53 +++++
 .../scripts/download-tokenizer.py             |   1 +
 .../scripts/run-lmdeploy-nightly.sh           | 192 ++++++++++++++++++
 3 files changed, 246 insertions(+)
 create mode 100644 .buildkite/nightly-benchmarks/run-nightly-suite.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
new file mode 100644
index 0000000000000..04cdd0d8322b7
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -0,0 +1,53 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+check_gpus() {
+    # check the number of GPUs and GPU type.
+    declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+    if [[ $gpu_count -gt 0 ]]; then
+        echo "GPU found."
+    else
+        echo "Need at least 1 GPU to run benchmarking."
+        exit 1
+    fi
+    declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+    echo "GPU type is $gpu_type"
+}
+
+check_hf_token() {
+    # check if HF_TOKEN is available and valid
+    if [[ -z "$HF_TOKEN" ]]; then
+        echo "Error: HF_TOKEN is not set."
+        exit 1
+    elif [[ ! "$HF_TOKEN" =~ ^hf_ ]]; then
+        echo "Error: HF_TOKEN does not start with 'hf_'."
+        exit 1
+    else
+        echo "HF_TOKEN is set and valid."
+    fi
+}
+
+main() {
+
+    check_gpus
+    check_hf_token
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    cd /
+    git clone https://github.com/KuntaiDu/vllm.git
+    cd vllm
+    git checkout kuntai-benchmark-dev
+    cd benchmarks
+    wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
+
+    # run lmdeploy
+    if which lmdeploy >/dev/null; then
+        echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+    fi
+
+}
+
+main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index 140233e5dad91..add331bfbd9f3 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,5 +1,6 @@
 
 import argparse
+from pathlib import Path
 from transformers import AutoTokenizer
 
 def main(model, cachedir):
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
new file mode 100644
index 0000000000000..495d3adf2ae14
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -0,0 +1,192 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+
+
+kill_gpu_processes() {
+  # kill all processes on GPU.
+  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
+  if [ -z "$pids" ]; then
+      echo "No GPU processes found."
+  else
+      for pid in $pids; do
+          kill -9 "$pid"
+          echo "Killed process with PID: $pid"
+      done
+
+      echo "All GPU processes have been killed."
+  fi
+
+  # waiting for GPU processes to be fully killed
+  sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name; the engine name is prefixed to it below.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    # append lmdeploy to the test name
+    test_name=lmdeploy_$test_name
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
+    client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
+    model=$(echo "$server_params" | jq -r '.model')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # prepare tokenizer
+    server_model=$(echo "$server_params" | jq -r '.model')
+    rm /tokenizer_cache/*
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$server_model" \
+      --cachedir /tokenizer_cache
+
+    server_command="lmdeploy server api_server $model $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive; test the call directly, because under `set -e` a bare failing call exits before `$?` can be checked
+    if wait_for_server; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+      exit 1
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend lmdeploy \
+        --tokenizer /tokenizer_cache \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "lmdeploy" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+  done
+}
+
+main () {
+
+    # create tokenizer directory
+    mkdir /tokenizer_cache
+    # enter vllm directory
+    cd /vllm/benchmarks
+
+    declare -g RESULTS_FOLDER=results/
+    mkdir -p $RESULTS_FOLDER
+    BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+    run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+
+}
+
+main "$@"
\ No newline at end of file

From 8416ce6a15773a1bce5ecd85feea4540d2d9d5eb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 23:50:24 -0700
Subject: [PATCH 005/150] add nightly test json file

---
 .../tests/nightly-tests.json                  | 25 +++++++++++++++++++
 1 file changed, 25 insertions(+)
 create mode 100644 .buildkite/nightly-benchmarks/tests/nightly-tests.json

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
new file mode 100644
index 0000000000000..b4290dc42dce3
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -0,0 +1,25 @@
+[
+    {
+        "test_name": "llama8B_tp1_sharegpt",
+        "qps_list": [1, 16],
+        "model": "meta-llama/Llama-8B-hf",
+        "lmdeploy_server_parameters": {
+            "tp": 1,
+            "server_port": 8000
+        },
+        "client_parameters": {
+            "model": "meta-llama/Llama-2-7b-hf",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "port": 8000
+        },
+        "lmdeploy_client_parameters": {
+            "model": "llama2",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "port": 8000
+        }
+    }
+]
\ No newline at end of file

From df4ba8f8d3476c9136d6b64fd2299692c8fe1158 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 23:53:27 -0700
Subject: [PATCH 006/150] bug fix on tokenizer directory

---
 .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh       | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 495d3adf2ae14..b957f506431f8 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -106,7 +106,8 @@ run_serving_tests() {
 
     # prepare tokenizer
     server_model=$(echo "$server_params" | jq -r '.model')
-    rm /tokenizer_cache/*
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
     python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
       --model "$server_model" \
       --cachedir /tokenizer_cache
@@ -176,8 +177,6 @@ run_serving_tests() {
 
 main () {
 
-    # create tokenizer directory
-    mkdir /tokenizer_cache
     # enter vllm directory
     cd /vllm/benchmarks
 

From b974495a0839189502ec6b1c0245e711ed800e50 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 23:55:27 -0700
Subject: [PATCH 007/150] bug fix on getting model

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index b957f506431f8..7580ecc68e9a9 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -90,7 +90,7 @@ run_serving_tests() {
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
     client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
-    model=$(echo "$server_params" | jq -r '.model')
+    model=$(echo "$params" | jq -r '.model')
     server_args=$(json2args "$server_params")
     client_args=$(json2args "$client_params")
     qps_list=$(echo "$params" | jq -r '.qps_list')

From 80d1c77fd110ab1e958707c59d9de0cb3b9f6e11 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 23:57:12 -0700
Subject: [PATCH 008/150] update test cases

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index b4290dc42dce3..d854e59342119 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -2,7 +2,7 @@
     {
         "test_name": "llama8B_tp1_sharegpt",
         "qps_list": [1, 16],
-        "model": "meta-llama/Llama-8B-hf",
+        "model": "meta-llama/Llama-2-7b-hf",
         "lmdeploy_server_parameters": {
             "tp": 1,
             "server_port": 8000

From b3f3b0e3ec5112bf00730f895ba423391bdd0ed0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 18 Jun 2024 23:58:59 -0700
Subject: [PATCH 009/150] update parameter name

---
 .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh       | 5 ++---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json       | 2 +-
 2 files changed, 3 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 7580ecc68e9a9..898c78ec86a20 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -90,7 +90,7 @@ run_serving_tests() {
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
     client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
-    model=$(echo "$params" | jq -r '.model')
+    model=$(echo "$params" | jq -r '.lmdeploy_server_model')
     server_args=$(json2args "$server_params")
     client_args=$(json2args "$client_params")
     qps_list=$(echo "$params" | jq -r '.qps_list')
@@ -105,11 +105,10 @@ run_serving_tests() {
     fi
 
     # prepare tokenizer
-    server_model=$(echo "$server_params" | jq -r '.model')
     rm -rf /tokenizer_cache
     mkdir /tokenizer_cache
     python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$server_model" \
+      --model "$model" \
       --cachedir /tokenizer_cache
 
     server_command="lmdeploy server api_server $model $server_args"
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index d854e59342119..a730c172089f8 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -2,7 +2,7 @@
     {
         "test_name": "llama8B_tp1_sharegpt",
         "qps_list": [1, 16],
-        "model": "meta-llama/Llama-2-7b-hf",
+        "lmdeploy_server_model": "meta-llama/Llama-2-7b-hf",
         "lmdeploy_server_parameters": {
             "tp": 1,
             "server_port": 8000

From 9483acf7f2c5d606b768f096143d1af26c4da264 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 00:00:58 -0700
Subject: [PATCH 010/150] typo fix

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 898c78ec86a20..626b23f1e1689 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -111,7 +111,7 @@ run_serving_tests() {
       --model "$model" \
       --cachedir /tokenizer_cache
 
-    server_command="lmdeploy server api_server $model $server_args"
+    server_command="lmdeploy serve api_server $model $server_args"
 
     # run the server
     echo "Running test case $test_name"

From d72ae51d7e0a0357bc131d12339d8931f20d1899 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 00:02:48 -0700
Subject: [PATCH 011/150] add wait_for_server

---
 .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh   | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 626b23f1e1689..306830d2651ee 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -64,6 +64,15 @@ json2args() {
   echo "$args"
 }
 
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
 
 
 run_serving_tests() {

From 0e819f034109e79984efec0a8c30e4bbd27369c4 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 00:22:57 -0700
Subject: [PATCH 012/150] update summarization script

---
 .../run-lmdeploy-benchmarks.sh                | 42 -----------
 .../scripts/run-lmdeploy-nightly.sh           | 40 +++++------
 .../scripts/summary-nightly-results.py        | 70 +++++++++++++++++++
 3 files changed, 87 insertions(+), 65 deletions(-)
 delete mode 100644 .buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh
 create mode 100644 .buildkite/nightly-benchmarks/scripts/summary-nightly-results.py

diff --git a/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh b/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh
deleted file mode 100644
index c1a579eaeefc4..0000000000000
--- a/.buildkite/nightly-benchmarks/run-lmdeploy-benchmarks.sh
+++ /dev/null
@@ -1,42 +0,0 @@
-#!/bin/bash
-
-# This script should be run inside the tgi container. Enter the latest tgi container by
-# docker run --gpus all -e "HF_TOKEN=<your HF TOKEN>" -v ~/.cache/huggingface:/root/.cache/huggingface --entrypoint /bin/bash openmmlab/lmdeploy:latest 
-# lmdeploy serve api_server internlm/internlm2-chat-7b
-# docker run -it --gpus all -e "HF_TOKEN=<your HF TOKEN>"   --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0
-# (please modify `<your HF TOKEN>` to your own huggingface token in the above command
-# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash.
-
-
-
-set -ex
-set -o pipefail
-
-# install conda
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3
-~/miniconda3/bin/conda init bash
-eval "$(cat ~/.bashrc | tail -n +15)"
-
-# create conda environment for vllm
-conda create -n vllm python=3.9 -y
-eval "$(conda shell.bash hook)"
-conda activate vllm
-pip install vllm
-
-# clone vllm repo
-cd /
-git clone https://github.com/vllm-project/vllm.git
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-# launch TGI server
-lmdeploy serve api_server meta-llama/Llama-2-7b-hf  --server-port 8000 &
-tgi_pid=$!
-timeout 600 bash -c 'until curl localhost:8000/v1/completion; do sleep 1; done' || exit 1
-
-# gradually reduce the request rate from 20, untill all request successed
-request_rate=20
-echo "Running benchmark with request rate $request_rate..."
-python3 vllm/benchmarks/benchmark_serving.py --backend lmdeploy --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 200 --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt
-kill $tgi_pid
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 306830d2651ee..82bca8d8ed344 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -3,7 +3,6 @@
 set -ex
 set -o pipefail
 
-
 check_gpus() {
   # check the number of GPUs and GPU type.
   declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
@@ -17,20 +16,18 @@ check_gpus() {
   echo "GPU type is $gpu_type"
 }
 
-
-
 kill_gpu_processes() {
   # kill all processes on GPU.
   pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
   if [ -z "$pids" ]; then
-      echo "No GPU processes found."
+    echo "No GPU processes found."
   else
-      for pid in $pids; do
-          kill -9 "$pid"
-          echo "Killed process with PID: $pid"
-      done
+    for pid in $pids; do
+      kill -9 "$pid"
+      echo "Killed process with PID: $pid"
+    done
 
-      echo "All GPU processes have been killed."
+    echo "All GPU processes have been killed."
   fi
 
   # waiting for GPU processes to be fully killed
@@ -46,8 +43,6 @@ kill_gpu_processes() {
   echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
 }
 
-
-
 json2args() {
   # transforms the JSON string to command line args, and '_' is replaced to '-'
   # example:
@@ -73,8 +68,6 @@ wait_for_server() {
     done' && return 0 || return 1
 }
 
-
-
 run_serving_tests() {
   # run serving tests using `benchmark_serving.py`
   # $1: a json file specifying serving test cases
@@ -95,7 +88,6 @@ run_serving_tests() {
       continue
     fi
 
-
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
     client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
@@ -174,7 +166,7 @@ run_serving_tests() {
           gpu_type: $gpu,
           engine: $engine
         }')
-      echo "$jq_output" > "$RESULTS_FOLDER/${new_test_name}.commands"
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
 
     done
 
@@ -183,17 +175,19 @@ run_serving_tests() {
   done
 }
 
-main () {
+main() {
 
-    # enter vllm directory
-    cd /vllm/benchmarks
+  check_gpus
+  # enter vllm directory
+  cd /vllm/benchmarks
 
-    declare -g RESULTS_FOLDER=results/
-    mkdir -p $RESULTS_FOLDER
-    BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
-    run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  CURRENT_LLM_SERVING_ENGINE=lmdeploy python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
 
 }
 
-main "$@"
\ No newline at end of file
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
new file mode 100644
index 0000000000000..6c2668ed2b3ec
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -0,0 +1,70 @@
+import json
+import os
+from pathlib import Path
+
+import pandas as pd
+
+results_folder = Path("results/")
+
+# serving results and the keys that will be printed into markdown
+serving_results = []
+serving_column_mapping = {
+    "test_name": "Test name",
+    "gpu_type": "GPU",
+    "completed": "Successful req.",
+    "request_throughput": "Tput (req/s)",
+    # "input_throughput": "Input Tput (tok/s)",
+    # "output_throughput": "Output Tput (tok/s)",
+    "mean_ttft_ms": "Mean TTFT (ms)",
+    "median_ttft_ms": "Median TTFT (ms)",
+    "p99_ttft_ms": "P99 TTFT (ms)",
+    # "mean_tpot_ms": "Mean TPOT (ms)",
+    # "median_tpot_ms": "Median",
+    # "p99_tpot_ms": "P99",
+    "mean_itl_ms": "Mean ITL (ms)",
+    "median_itl_ms": "Median ITL (ms)",
+    "p99_itl_ms": "P99 ITL (ms)",
+    "engine": "Engine",
+}
+
+
+
+
+if __name__ == "__main__":
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+
+        with open(test_file, "r") as f:
+            raw_result = json.loads(f.read())
+
+            
+        # attach the benchmarking command to raw_result
+        with open(test_file.with_suffix(".commands"), "r") as f:
+            command = json.loads(f.read())
+        raw_result.update(command)
+
+        # update the test name of this result
+        raw_result.update({"test_name": test_file.stem})
+
+        # add the result to raw_result
+        serving_results.append(raw_result)
+        continue
+
+
+    serving_results = pd.DataFrame.from_dict(serving_results)
+
+
+    if not serving_results.empty:
+        serving_results = serving_results[list(
+            serving_column_mapping.keys())].rename(
+                columns=serving_column_mapping)
+            
+            
+    prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+
+    # document benchmarking results in json
+    with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
+
+        results = serving_results.to_dict(orient='records')
+        f.write(json.dumps(results))

From 9181a1d27a56bb7054d3706343b657ca3e9b7283 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 00:37:33 -0700
Subject: [PATCH 013/150] use pkill to kill lmdeploy

---
 .../scripts/run-lmdeploy-nightly.sh            | 18 +-----------------
 1 file changed, 1 insertion(+), 17 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 82bca8d8ed344..ff58d93b06d7c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -17,25 +17,9 @@ check_gpus() {
 }
 
 kill_gpu_processes() {
-  # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-    echo "No GPU processes found."
-  else
-    for pid in $pids; do
-      kill -9 "$pid"
-      echo "Killed process with PID: $pid"
-    done
-
-    echo "All GPU processes have been killed."
-  fi
-
+  pkill lmdeploy || true
   # waiting for GPU processes to be fully killed
   sleep 10
-
-  # remove vllm config file
-  rm -rf ~/.config/vllm
-
   # Print the GPU memory usage
   # so that we know if all GPU processes are killed.
   gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)

From 6e1936c6625c57c707ea6c73914ac5d80f2cd330 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 01:04:59 -0700
Subject: [PATCH 014/150] update script for tgi

---
 .../nightly-benchmarks/run-nightly-suite.sh   |   6 +
 .../scripts/run-tgi-nightly.sh                | 167 ++++++++++++++++++
 .../tests/nightly-tests.json                  |  16 +-
 3 files changed, 184 insertions(+), 5 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 04cdd0d8322b7..058b84df80a86 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -48,6 +48,12 @@ main() {
         bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
     fi
 
+    # run tgi
+    if [ -e /tgi-entrypoint.sh ]; then
+        echo "tgi is available, redirect to run-tgi-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+    fi
+
 }
 
 main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
new file mode 100644
index 0000000000000..b0b8f26cfee33
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -0,0 +1,167 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill text-generation || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl localhost:8000/generate_stream; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    # append tgi to the test name
+    test_name=tgi_$test_name
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
+    client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.num_shard')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $testname."
+      continue
+    fi
+
+
+    server_command="./tgi-entrypoint.sh $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "tgi server is up and running."
+    else
+      echo ""
+      echo "tgi failed to start within the timeout period."
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend tgi \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "tgi" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+  done
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd /vllm/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  CURRENT_LLM_SERVING_ENGINE=tgi python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index a730c172089f8..29d800a215bda 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -7,19 +7,25 @@
             "tp": 1,
             "server_port": 8000
         },
-        "client_parameters": {
-            "model": "meta-llama/Llama-2-7b-hf",
+        "lmdeploy_client_parameters": {
+            "model": "llama2",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "port": 8000
         },
-        "lmdeploy_client_parameters": {
-            "model": "llama2",
+        "tgi_server_parameters": {
+            "model_id": "meta-llama/Llama-2-7b-hf",
+            "num_shard": 2,
+            "port": 8000
+        },
+        "tgi_client_parameters": {
+            "model": "meta-llama/Llama-2-7b-hf",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
-            "port": 8000
+            "port": 8000,
+            "endpoint": "/generate_stream"
         }
     }
 ]
\ No newline at end of file

From c6aded948727b1ec43ce8b21916331735eb71989 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 01:07:20 -0700
Subject: [PATCH 015/150] add install jq

---
 .buildkite/nightly-benchmarks/run-nightly-suite.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 058b84df80a86..2e0e974873c83 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -35,6 +35,7 @@ main() {
     check_hf_token
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
     cd /
     git clone https://github.com/KuntaiDu/vllm.git
     cd vllm

From ccbcd18a272c2b30551bfe8b93c0d3750ee79064 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 01:10:06 -0700
Subject: [PATCH 016/150] reduce 7b llama tp to 1

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 29d800a215bda..f393dee4ca7a2 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -16,7 +16,7 @@
         },
         "tgi_server_parameters": {
             "model_id": "meta-llama/Llama-2-7b-hf",
-            "num_shard": 2,
+            "num_shard": 1,
             "port": 8000
         },
         "tgi_client_parameters": {

From 38cc38a70c5934bd66b20bc9f3fdc8b09aab8e2e Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 01:11:07 -0700
Subject: [PATCH 017/150] update lmdeploy tp

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index ff58d93b06d7c..43d0b26e19f88 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -83,7 +83,7 @@ run_serving_tests() {
     echo "Running over qps list $qps_list"
 
     # check if there is enough GPU to run the test
-    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$server_params" | jq -r '.tp')
     if [[ $gpu_count -lt $tp ]]; then
       echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
       continue

From 832891e993b6aee0d40eb82ea6c5b2551e5e2126 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 01:13:11 -0700
Subject: [PATCH 018/150] bug fix

---
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index b0b8f26cfee33..a6edb652619ca 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -87,7 +87,7 @@ run_serving_tests() {
     fi
 
 
-    server_command="./tgi-entrypoint.sh $server_args"
+    server_command="/tgi-entrypoint.sh $server_args"
 
     # run the server
     echo "Running test case $test_name"

From 587780694b8bd17eede9033e930c82cf6b4d341d Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 21:21:34 -0700
Subject: [PATCH 019/150] update tensorrt script

---
 .../scripts/run-lmdeploy-nightly.sh           |   2 +-
 .../scripts/run-tgi-nightly.sh                |   2 +-
 .../scripts/run-trt-nightly.sh                | 244 ++++++++++++++++++
 .../tests/nightly-tests.json                  |  18 ++
 4 files changed, 264 insertions(+), 2 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 43d0b26e19f88..0010fe8403974 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -85,7 +85,7 @@ run_serving_tests() {
     # check if there is enough GPU to run the test
     tp=$(echo "$server_params" | jq -r '.tp')
     if [[ $gpu_count -lt $tp ]]; then
-      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $testname."
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
       continue
     fi
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index a6edb652619ca..f6de7728aea2f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -82,7 +82,7 @@ run_serving_tests() {
     # check if there is enough GPU to run the test
     tp=$(echo "$server_params" | jq -r '.num_shard')
     if [[ $gpu_count -lt $tp ]]; then
-      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $testname."
+      echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
       continue
     fi
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
new file mode 100644
index 0000000000000..416839b7d32fb
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -0,0 +1,244 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill text-generation || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  timeout 1200 bash -c '
+    until curl localhost:8000/generate_stream; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+
+run_trt_server() {
+
+  params=$1
+
+  model_name=$(echo "$params" | jq -r '.model_name')
+  model_path=$(echo "$params" | jq -r '.model_path')
+  model_type=$(echo "$params" | jq -r '.model_type')
+  model_dtype=$(echo "$params" | jq -r '.model_dtype')
+  model_tp_size=$(echo "$params" | jq -r '.model_tp_size')
+  max_batch_size=$(echo "$params" | jq -r '.max_batch_size')
+  max_input_len=$(echo "$params" | jq -r '.max_input_len')
+  max_output_len=$(echo "$params" | jq -r '.max_output_len')
+  trt_llm_version=$(echo "$params" | jq -r '.trt_llm_version')
+
+  cd ~
+  rm -rf models
+  mkdir -p models
+  cd models
+  models_dir=$(pwd)
+  trt_model_path=${models_dir}/${model_name}-trt-ckpt
+  trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+  cd ~
+  rm -rf tensorrt-demo
+  git clone https://github.com/neuralmagic/tensorrt-demo.git
+  cd tensorrt-demo
+  tensorrt_demo_dir=$(pwd)
+
+  # make sure the parameter inside tensorrt_demo is consistent to envvar
+  sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
+  sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
+  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
+  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
+  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
+  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
+
+
+  cd /
+  rm -rf tensorrtllm_backend
+  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+  git lfs install
+  cd tensorrtllm_backend
+  git checkout $trt_llm_version
+  tensorrtllm_backend_dir=$(pwd)
+  git submodule update --init --recursive
+  cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
+
+  cd /tensorrtllm_backend
+  cd ./tensorrt_llm/examples/${model_type}
+
+  python3 convert_checkpoint.py \
+    --model_dir ${model_path} \
+    --dtype ${model_dtype} \
+    --tp_size ${model_tp_size} \
+    --output_dir ${trt_model_path}
+
+  trtllm-build \
+    --checkpoint_dir=${trt_model_path} \
+    --gpt_attention_plugin=${model_dtype} \
+    --gemm_plugin=${model_dtype} \
+    --remove_input_padding=enable \
+    --paged_kv_cache=enable \
+    --tp_size=${model_tp_size} \
+    --max_batch_size=${max_batch_size} \
+    --max_input_len=${max_input_len} \
+    --max_output_len=${max_output_len} \
+    --max_num_tokens=${max_output_len} \
+    --opt_num_tokens=${max_output_len} \
+    --output_dir=${trt_engine_path} 
+
+  cd /tensorrtllm_backend/triton_model_repo
+  cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+  cd /tensorrtllm_backend
+  python3 scripts/launch_triton_server.py \
+    --world_size=${model_tp_size} \
+    --model_repo=/tensorrtllm_backend/triton_model_repo &
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+    # append tgi to the test name
+    test_name=trt_$test_name
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.trt_server_parameters')
+    client_params=$(echo "$params" | jq -r '.trt_client_parameters')
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.model_tp_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+
+    # run the server
+    echo "Running test case $test_name"
+    run_trt_server $server_params
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "trt server is up and running."
+    else
+      echo ""
+      echo "trt failed to start within the timeout period."
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend trt \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "tgi" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+  done
+}
+
+main() {
+
+  check_gpus
+
+
+  # enter vllm directory
+  cd /vllm/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  CURRENT_LLM_SERVING_ENGINE=trt python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index f393dee4ca7a2..88fa7a9fd5ea8 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -26,6 +26,24 @@
             "num_prompts": 200,
             "port": 8000,
             "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_path": "meta-llama/llama-2-7b-chat-hf",
+            "model_name": "llama-2-7b-chat-hf",
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "model_tp_size": 1,
+            "max_batch_size": 256,
+            "max_input_len": 10000,
+            "max_output_len": 10000
+        },
+        "trt_client_parameters": {
+            "model": "meta-llama/Llama-2-7b-hf",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200,
+            "port": 8000,
+            "endpoint": "/v2/models/ensemble/generate_stream"
         }
     }
 ]
\ No newline at end of file

From 9972abac594dc7f57754f20a27860e2a366493b9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 22:41:39 -0700
Subject: [PATCH 020/150] update nightly suite

---
 .buildkite/nightly-benchmarks/run-nightly-suite.sh | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 2e0e974873c83..2a3e0b81c981d 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -55,6 +55,12 @@ main() {
         bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
     fi
 
+    # run trt
+    if which trtllm-build >/dev/null; then
+        echo "trtllm is available, redirect to run-trt-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+    fi
+
 }
 
 main "$@"
\ No newline at end of file

From 6493679313dd3e882c80ae95f6257b6fff824790 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 22:47:40 -0700
Subject: [PATCH 021/150] add double quote

---
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 416839b7d32fb..f4ebab70e18b1 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -167,7 +167,7 @@ run_serving_tests() {
 
     # run the server
     echo "Running test case $test_name"
-    run_trt_server $server_params
+    run_trt_server "$server_params"
 
     # wait until the server is alive
     wait_for_server

From e62cae68a29a8037f2dd60d7be72715c45b30742 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 22:48:53 -0700
Subject: [PATCH 022/150] add trt llm version

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 88fa7a9fd5ea8..adb62761d36a8 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -35,7 +35,8 @@
             "model_tp_size": 1,
             "max_batch_size": 256,
             "max_input_len": 10000,
-            "max_output_len": 10000
+            "max_output_len": 10000,
+            "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
             "model": "meta-llama/Llama-2-7b-hf",

From 9ce358963a3491f700435b0bcdb17c9591eb43ee Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 22:58:15 -0700
Subject: [PATCH 023/150] update trt

---
 .../scripts/run-trt-nightly.sh                   | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index f4ebab70e18b1..4e46064c69e2b 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -152,6 +152,7 @@ run_serving_tests() {
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.trt_server_parameters')
     client_params=$(echo "$params" | jq -r '.trt_client_parameters')
+    model=$(echo "$client_params" | jq -r '.model')
     client_args=$(json2args "$client_params")
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
@@ -165,6 +166,15 @@ run_serving_tests() {
     fi
 
 
+    # prepare tokenizer
+    cd /vllm/benchmarks
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+
+
     # run the server
     echo "Running test case $test_name"
     run_trt_server "$server_params"
@@ -179,6 +189,9 @@ run_serving_tests() {
       echo "trt failed to start within the timeout period."
     fi
 
+    # go back to vllm benchmarking directory
+    cd /vllm/benchmarks
+
     # iterate over different QPS
     for qps in $qps_list; do
       # remove the surrounding single quote from qps
@@ -191,7 +204,8 @@ run_serving_tests() {
       new_test_name=$test_name"_qps_"$qps
 
       client_command="python3 benchmark_serving.py \
-        --backend trt \
+        --backend tensorrt-llm \
+        --tokenizer /tokenizer_cache \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \

From f634dee89da1abd8ba9283e650ea7809eaa90f20 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 19 Jun 2024 23:13:23 -0700
Subject: [PATCH 024/150] update on how to kill the server

---
 .../nightly-benchmarks/run-tgi-benchmarks.sh  |  72 -------------
 .../nightly-benchmarks/run-trt-benchmarks.sh  | 102 ------------------
 .../scripts/run-trt-nightly.sh                |   4 +-
 3 files changed, 2 insertions(+), 176 deletions(-)
 delete mode 100644 .buildkite/nightly-benchmarks/run-tgi-benchmarks.sh
 delete mode 100644 .buildkite/nightly-benchmarks/run-trt-benchmarks.sh

diff --git a/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh b/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh
deleted file mode 100644
index 27f0fe57f3716..0000000000000
--- a/.buildkite/nightly-benchmarks/run-tgi-benchmarks.sh
+++ /dev/null
@@ -1,72 +0,0 @@
-#!/bin/bash
-
-# This script should be run inside the tgi container. Enter the latest tgi container by
-# docker run -it --gpus all -e "HF_TOKEN=<your HF TOKEN>"   --shm-size 1g --entrypoint /bin/bash ghcr.io/huggingface/text-generation-inference:2.0
-# (please modify `<your HF TOKEN>` to your own huggingface token in the above command
-# Then, copy-paste this file into any directory you prefer in the docker and execute it using bash.
-# Benchmarking results will be inside /vllm/benchmarks/*.txt
-# NOTE: this script gradually reduces the request rate from 20, to ensure all requests are successful.
-
-set -ex
-set -o pipefail
-
-# install conda
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3
-~/miniconda3/bin/conda init bash
-eval "$(cat ~/.bashrc | tail -n +15)"
-
-# create conda environment for vllm
-conda create -n vllm python=3.9 -y
-eval "$(conda shell.bash hook)"
-conda activate vllm
-pip install vllm
-
-# clone vllm repo
-cd /
-git clone https://github.com/vllm-project/vllm.git
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-# launch TGI server
-/tgi-entrypoint.sh --port 8000 --model-id meta-llama/Llama-2-7b-chat-hf &
-tgi_pid=$!
-timeout 600 bash -c 'until curl localhost:8000/generate_stream; do sleep 1; done' || exit 1
-
-# gradually reduce the request rate from 20, untill all request successed
-request_rate=20
-get_successful_requests() {
-  grep "Successful requests:" benchmark_serving.txt | awk '{print $3}'
-}
-while true; do
-  echo "Running benchmark with request rate $request_rate..."
-  python3 vllm/benchmarks/benchmark_serving.py --backend tgi --model meta-llama/Llama-2-7b-chat-hf --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 --endpoint /generate_stream --request-rate $request_rate --port 8000 --save-result 2>&1 | tee benchmark_serving.txt
-  bench_serving_exit_code=$?
-  successful_requests=$(get_successful_requests)
-  echo "Successful requests: $successful_requests"
-  if [ "$successful_requests" -eq 1000 ]; then
-    echo "Reached 1000 successful requests with request rate $request_rate"
-    break
-  fi
-  request_rate=$((request_rate - 1))
-  if [ "$request_rate" -lt 1 ]; then
-    echo "Request rate went below 1. Exiting."
-    break
-  fi
-done
-kill $tgi_pid
-
-echo "### TGI Serving Benchmarks" >>benchmark_results.md
-sed -n '1p' benchmark_serving.txt >>benchmark_results.md
-echo "" >>benchmark_results.md
-echo '```' >>benchmark_results.md
-tail -n 17 benchmark_serving.txt >>benchmark_results.md
-echo '```' >>benchmark_results.md
-
-# if the agent binary is not found, skip uploading the results, exit 0
-if [ ! -f /workspace/buildkite-agent ]; then
-  exit 0
-fi
-
-# upload the results to buildkite
-/workspace/buildkite-agent annotate --style "info" --context "benchmark-results" <benchmark_results.md
diff --git a/.buildkite/nightly-benchmarks/run-trt-benchmarks.sh b/.buildkite/nightly-benchmarks/run-trt-benchmarks.sh
deleted file mode 100644
index 6ab565890f9d8..0000000000000
--- a/.buildkite/nightly-benchmarks/run-trt-benchmarks.sh
+++ /dev/null
@@ -1,102 +0,0 @@
-#!/bin/bash
-
-# This script should be run inside the trt-llm docker container, command:
-# docker run -it --net host -e HF_TOKEN=<your HF TOKEN> --shm-size=2g --ulimit memlock=-1 --ulimit stack=67108864 --runtime=nvidia --gpus all --entrypoint /bin/bash nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-# (please modify `<your HF TOKEN>` to your own huggingface token in the above command
-# Then, copy-paste this file into the docker and execute it using bash.
-
-set -xe
-TRT_LLM_VERSION=r24.04
-model_path=meta-llama/llama-2-7b-chat-hf
-model_name=llama-2-7b-chat-hf
-model_type=llama
-model_dtype=float16
-model_tp_size=1
-max_batch_size=233
-max_input_len=15000
-max_output_len=15000
-cd ~
-mkdir models
-cd models
-models_dir=`pwd`
-trt_model_path=${models_dir}/${model_name}-trt-ckpt
-trt_engine_path=${models_dir}/${model_name}-trt-engine
-
-
-
-cd ~
-git clone https://github.com/neuralmagic/tensorrt-demo.git
-cd tensorrt-demo
-tensorrt_demo_dir=`pwd`
-
-# make sure the parameter inside tensorrt_demo is consistent to envvar
-sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
-sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
-sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
-
-
-cd /
-git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
-git lfs install
-cd tensorrtllm_backend
-git checkout $TRT_LLM_VERSION
-tensorrtllm_backend_dir=`pwd`
-
-git submodule update --init --recursive
-cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
-
-cd /tensorrtllm_backend
-cd ./tensorrt_llm/examples/${model_type}
-
-python3 convert_checkpoint.py \
-    --model_dir ${model_path} \
-    --dtype ${model_dtype} \
-    --tp_size ${model_tp_size} \
-    --output_dir ${trt_model_path}
-    
-trtllm-build \
-    --checkpoint_dir=${trt_model_path} \
-    --gpt_attention_plugin=${model_dtype} \
-    --gemm_plugin=${model_dtype} \
-    --remove_input_padding=enable \
-    --paged_kv_cache=enable \
-    --tp_size=${model_tp_size} \
-    --max_batch_size=${max_batch_size} \
-    --max_input_len=${max_input_len} \
-    --max_output_len=${max_output_len} \
-    --max_num_tokens=${max_output_len} \
-    --opt_num_tokens=${max_output_len} \
-    --output_dir=${trt_engine_path} 
-    
-cd /tensorrtllm_backend/triton_model_repo
-cp -r ${trt_engine_path}/* ./tensorrt_llm/1
-cd /tensorrtllm_backend
-python3 scripts/launch_triton_server.py --world_size=${model_tp_size} --model_repo=/tensorrtllm_backend/triton_model_repo &
-
-
-# sleep for 20 seconds, to make sure the server is launched 
-sleep 30
-
-
-# install vllm inside conda, for benchmarking.
-(which wget && which curl) || (apt-get update && apt-get install -y wget curl)
-wget https://repo.anaconda.com/miniconda/Miniconda3-latest-Linux-x86_64.sh
-bash Miniconda3-latest-Linux-x86_64.sh -b -u -p ~/miniconda3
-~/miniconda3/bin/conda init bash
-eval "$(cat ~/.bashrc | tail -n +15)"
-conda create -n vllm python=3.9 -y
-eval "$(conda shell.bash hook)"
-conda activate vllm
-pip install vllm
-
-# clone vllm's benchmark_serving script
-cd ~
-git clone https://github.com/vllm-project/vllm.git
-cd vllm/benchmarks/
-
-export VLLM_HOST_IP=$(hostname -I | awk '{print $1}')
-wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-python benchmark_serving.py --backend tensorrt-llm --endpoint /v2/models/ensemble/generate_stream --port 8000 --model $model_path --save-result --dataset-name sharegpt --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json --num-prompts 1000 2>&1 | tee benchmark_serving.txt
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 4e46064c69e2b..ea49df1d22f59 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -17,9 +17,9 @@ check_gpus() {
 }
 
 kill_gpu_processes() {
-  pkill text-generation || true
+  pkill tritonserver || true
   # waiting for GPU processes to be fully killed
-  sleep 10
+  sleep 20
   # Print the GPU memory usage
   # so that we know if all GPU processes are killed.
   gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)

From b47e30be938f2ec86c8c8227ff8b478dd5fc23d0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 00:19:02 -0700
Subject: [PATCH 025/150] update vllm nightly test

---
 .../nightly-benchmarks/run-nightly-suite.sh   |   6 +
 .../scripts/run-lmdeploy-nightly.sh           |   7 +-
 .../scripts/run-trt-nightly.sh                |   4 +-
 .../scripts/run-vllm-nightly.sh               | 182 ++++++++++++++++++
 .../tests/nightly-tests.json                  |  11 ++
 5 files changed, 206 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 2a3e0b81c981d..3e938a87a1bfb 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -61,6 +61,12 @@ main() {
         bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
     fi
 
+    # run vllm
+    if python3 -c "import vllm" &> /dev/null; then
+        echo "vllm is available, redirect to run-vllm-nightly.sh"
+        bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+    fi
+
 }
 
 main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 0010fe8403974..42918e41946bb 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -63,14 +63,17 @@ run_serving_tests() {
   jq -c '.[]' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
-    # append lmdeploy to the test name
-    test_name=lmdeploy_$test_name
 
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
       continue
     fi
+    
+    # append lmdeploy to the test name
+    test_name=lmdeploy_$test_name
+
+    
 
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index ea49df1d22f59..3956db112023c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -140,7 +140,7 @@ run_serving_tests() {
   jq -c '.[]' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
-    # append tgi to the test name
+    # append trt to the test name
     test_name=trt_$test_name
 
     # if TEST_SELECTOR is set, only run the test cases that match the selector
@@ -222,7 +222,7 @@ run_serving_tests() {
         --arg server "$server_command" \
         --arg client "$client_command" \
         --arg gpu "$gpu_type" \
-        --arg engine "tgi" \
+        --arg engine "trt" \
         '{
           server_command: $server,
           client_command: $client,
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
new file mode 100644
index 0000000000000..a06851e474638
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -0,0 +1,182 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+check_gpus() {
+  # check the number of GPUs and GPU type.
+  declare -g gpu_count=$(nvidia-smi --list-gpus | wc -l)
+  if [[ $gpu_count -gt 0 ]]; then
+    echo "GPU found."
+  else
+    echo "Need at least 1 GPU to run benchmarking."
+    exit 1
+  fi
+  declare -g gpu_type=$(echo $(nvidia-smi --query-gpu=name --format=csv,noheader) | awk '{print $2}')
+  echo "GPU type is $gpu_type"
+}
+
+kill_gpu_processes() {
+  pkill lmdeploy || true
+  # waiting for GPU processes to be fully killed
+  sleep 10
+  # Print the GPU memory usage
+  # so that we know if all GPU processes are killed.
+  gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
+  # The memory usage should be 0 MB.
+  echo "GPU 0 Memory Usage: $gpu_memory_usage MB"
+}
+
+json2args() {
+  # transforms the JSON string to command line args, and '_' is replaced to '-'
+  # example:
+  # input: { "model": "meta-llama/Llama-2-7b-chat-hf", "tensor_parallel_size": 1 }
+  # output: --model meta-llama/Llama-2-7b-chat-hf --tensor-parallel-size 1
+  local json_string=$1
+  local args=$(
+    echo "$json_string" | jq -r '
+      to_entries |
+      map("--" + (.key | gsub("_"; "-")) + " " + (.value | tostring)) |
+      join(" ")
+    '
+  )
+  echo "$args"
+}
+
+wait_for_server() {
+  # wait for vllm server to start
+  # return 1 if vllm server crashes
+  timeout 1200 bash -c '
+    until curl localhost:8000/v1/completions; do
+      sleep 1
+    done' && return 0 || return 1
+}
+
+run_serving_tests() {
+  # run serving tests using `benchmark_serving.py`
+  # $1: a json file specifying serving test cases
+
+  local serving_test_file
+  serving_test_file=$1
+
+  # Iterate over serving tests
+  jq -c '.[]' "$serving_test_file" | while read -r params; do
+    # get the test name, and append the GPU type back to it.
+    test_name=$(echo "$params" | jq -r '.test_name')
+
+    # if TEST_SELECTOR is set, only run the test cases that match the selector
+    if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
+      echo "Skip test case $test_name."
+      continue
+    fi
+
+    # append vllm to the test name
+    test_name=vllm_$test_name
+
+    
+
+    # get client and server arguments
+    server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
+    client_params=$(echo "$params" | jq -r '.vllm_client_parameters')
+    server_args=$(json2args "$server_params")
+    client_args=$(json2args "$client_params")
+    qps_list=$(echo "$params" | jq -r '.qps_list')
+    qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
+    echo "Running over qps list $qps_list"
+
+    # check if there is enough GPU to run the test
+    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
+    if [[ $gpu_count -lt $tp ]]; then
+      echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
+      continue
+    fi
+
+    # check if server model and client model is aligned
+    server_model=$(echo "$server_params" | jq -r '.model')
+    client_model=$(echo "$client_params" | jq -r '.model')
+    if [[ $server_model != "$client_model" ]]; then
+      echo "Server model and client model must be the same. Skip testcase $test_name."
+      continue
+    fi
+
+
+    server_command="python3 \
+      -m vllm.entrypoints.openai.api_server \
+      $server_args"
+
+    # run the server
+    echo "Running test case $test_name"
+    echo "Server command: $server_command"
+    eval "$server_command" &
+
+    # wait until the server is alive
+    wait_for_server
+    if [ $? -eq 0 ]; then
+      echo ""
+      echo "vllm server is up and running."
+    else
+      echo ""
+      echo "vllm failed to start within the timeout period."
+    fi
+
+    # iterate over different QPS
+    for qps in $qps_list; do
+      # remove the surrounding single quote from qps
+      if [[ "$qps" == *"inf"* ]]; then
+        echo "qps was $qps"
+        qps="inf"
+        echo "now qps is $qps"
+      fi
+
+      new_test_name=$test_name"_qps_"$qps
+
+      client_command="python3 benchmark_serving.py \
+        --backend vllm \
+        --save-result \
+        --result-dir $RESULTS_FOLDER \
+        --result-filename ${new_test_name}.json \
+        --request-rate $qps \
+        $client_args"
+
+      echo "Running test case $test_name with qps $qps"
+      echo "Client command: $client_command"
+
+      eval "$client_command"
+
+      # record the benchmarking commands
+      jq_output=$(jq -n \
+        --arg server "$server_command" \
+        --arg client "$client_command" \
+        --arg gpu "$gpu_type" \
+        --arg engine "vllm" \
+        '{
+          server_command: $server,
+          client_command: $client,
+          gpu_type: $gpu,
+          engine: $engine
+        }')
+      echo "$jq_output" >"$RESULTS_FOLDER/${new_test_name}.commands"
+
+    done
+
+    # clean up
+    kill_gpu_processes
+  done
+}
+
+main() {
+
+  check_gpus
+  # enter vllm directory
+  cd /vllm/benchmarks
+
+  declare -g RESULTS_FOLDER=results/
+  mkdir -p $RESULTS_FOLDER
+  BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
+
+  run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
+  CURRENT_LLM_SERVING_ENGINE=vllm python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+
+}
+
+main "$@"
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index adb62761d36a8..1328f281536bf 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -45,6 +45,17 @@
             "num_prompts": 200,
             "port": 8000,
             "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "model": "meta-llama/Llama-2-7b-hf",
+            "tensor_parallel_size": 1
+        },
+        "vllm_client_parameters": {
+            "model": "meta-llama/Llama-2-7b-hf",
+            "backend": "vllm",
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 200
         }
     }
 ]
\ No newline at end of file

From ec8b29597e70bd1a53f8d55457d5a1add8058d6a Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 00:22:21 -0700
Subject: [PATCH 026/150] disable vllm server log

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 1328f281536bf..c37ef39470a98 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -55,7 +55,9 @@
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
+            "num_prompts": 200,
+            "disable_log_stats": "",
+            "disable_log_requests": ""
         }
     }
 ]
\ No newline at end of file

From 792ef7f21ccedbd1a2dd27c1f94e8f3eb06671ca Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 00:28:33 -0700
Subject: [PATCH 027/150] adjust how to kill processes in vllm

---
 .../scripts/run-vllm-nightly.sh               | 20 +++++++++++++++++--
 1 file changed, 18 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index a06851e474638..d0a145d8186f2 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -17,9 +17,25 @@ check_gpus() {
 }
 
 kill_gpu_processes() {
-  pkill lmdeploy || true
+  # kill all processes on GPU.
+  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
+  if [ -z "$pids" ]; then
+      echo "No GPU processes found."
+  else
+      for pid in $pids; do
+          kill -9 "$pid"
+          echo "Killed process with PID: $pid"
+      done
+
+      echo "All GPU processes have been killed."
+  fi
+
   # waiting for GPU processes to be fully killed
   sleep 10
+
+  # remove vllm config file
+  rm -rf ~/.config/vllm
+
   # Print the GPU memory usage
   # so that we know if all GPU processes are killed.
   gpu_memory_usage=$(nvidia-smi --query-gpu=memory.used --format=csv,noheader,nounits -i 0)
@@ -175,7 +191,7 @@ main() {
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  CURRENT_LLM_SERVING_ENGINE=vllm python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  CURRENT_LLM_SERVING_ENGINE=vllm python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
 
 }
 

From 4f67c960ccc3d30ae7e4a73b1a6ed835e0d6d61e Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:07:10 -0700
Subject: [PATCH 028/150] update nightly tests

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index c37ef39470a98..bfff63ee1cb47 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -48,16 +48,16 @@
         },
         "vllm_server_parameters": {
             "model": "meta-llama/Llama-2-7b-hf",
-            "tensor_parallel_size": 1
+            "tensor_parallel_size": 1,
+            "disable_log_stats": "",
+            "disable_log_requests": ""
         },
         "vllm_client_parameters": {
             "model": "meta-llama/Llama-2-7b-hf",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "disable_log_stats": "",
-            "disable_log_requests": ""
+            "num_prompts": 200
         }
     }
 ]
\ No newline at end of file

From f0fe30cc6464a4c816b0346e7de247f77631ff0d Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:25:20 -0700
Subject: [PATCH 029/150] update summary results

---
 .../scripts/summary-nightly-results.py                | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 6c2668ed2b3ec..3814f4f515114 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -3,6 +3,7 @@
 from pathlib import Path
 
 import pandas as pd
+from tabulate import tabulate
 
 results_folder = Path("results/")
 
@@ -53,6 +54,10 @@
 
 
     serving_results = pd.DataFrame.from_dict(serving_results)
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
 
 
     if not serving_results.empty:
@@ -63,6 +68,12 @@
             
     prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 
+    # document benchmarking results in markdown
+    with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
+        f.write(serving_md_table)
+        f.write('\n')
+        
+
     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
 

From d0978436d7f43a1e7ac65d9cdd7210ce94c270ea Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:26:51 -0700
Subject: [PATCH 030/150] update summary results

---
 .../scripts/summary-nightly-results.py                   | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 3814f4f515114..ef8239ed94472 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -54,10 +54,6 @@
 
 
     serving_results = pd.DataFrame.from_dict(serving_results)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
 
 
     if not serving_results.empty:
@@ -65,6 +61,11 @@
             serving_column_mapping.keys())].rename(
                 columns=serving_column_mapping)
             
+    serving_md_table = tabulate(serving_results,
+                                headers='keys',
+                                tablefmt='pipe',
+                                showindex=False)
+            
             
     prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 

From 2c30f38b1520e77776af9796d6d80f861233e235 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:34:55 -0700
Subject: [PATCH 031/150] add upload_to_buildkite utility

---
 .../scripts/run-lmdeploy-nightly.sh           | 19 ++++++++++-
 .../scripts/run-tgi-nightly.sh                | 19 ++++++++++-
 .../scripts/run-trt-nightly.sh                | 18 +++++++++-
 .../scripts/run-vllm-nightly.sh               | 33 +++++++++++--------
 4 files changed, 72 insertions(+), 17 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 42918e41946bb..79cf30f924746 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -162,6 +162,20 @@ run_serving_tests() {
   done
 }
 
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
 main() {
 
   check_gpus
@@ -172,8 +186,11 @@ main() {
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  export CURRENT_LLM_SERVING_ENGINE=lmdeploy
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  CURRENT_LLM_SERVING_ENGINE=lmdeploy python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
 
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index f6de7728aea2f..398e2017f9b56 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -149,6 +149,20 @@ run_serving_tests() {
   done
 }
 
+
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
 main() {
 
   check_gpus
@@ -159,8 +173,11 @@ main() {
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  export CURRENT_LLM_SERVING_ENGINE=tgi
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  CURRENT_LLM_SERVING_ENGINE=tgi python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
 
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 3956db112023c..71badb8ebf647 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -238,6 +238,19 @@ run_serving_tests() {
   done
 }
 
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
+
 main() {
 
   check_gpus
@@ -250,8 +263,11 @@ main() {
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  export CURRENT_LLM_SERVING_ENGINE=trt
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  CURRENT_LLM_SERVING_ENGINE=trt python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  python -m pip install tabulate pandas
+  python $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
 
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index d0a145d8186f2..3ce8f7e3bb4d8 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -18,19 +18,7 @@ check_gpus() {
 
 kill_gpu_processes() {
   # kill all processes on GPU.
-  pids=$(nvidia-smi --query-compute-apps=pid --format=csv,noheader)
-  if [ -z "$pids" ]; then
-      echo "No GPU processes found."
-  else
-      for pid in $pids; do
-          kill -9 "$pid"
-          echo "Killed process with PID: $pid"
-      done
-
-      echo "All GPU processes have been killed."
-  fi
-
-  # waiting for GPU processes to be fully killed
+  pkill pt_main_thread
   sleep 10
 
   # remove vllm config file
@@ -180,6 +168,19 @@ run_serving_tests() {
   done
 }
 
+
+upload_to_buildkite() {
+  # upload the benchmarking results to buildkite
+
+  # if the agent binary is not found, skip uploading the results, exit 0
+  if [ ! -f /workspace/buildkite-agent ]; then
+    echo "buildkite-agent binary not found. Skip uploading the results."
+    return 0
+  fi
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
+}
+
 main() {
 
   check_gpus
@@ -190,8 +191,12 @@ main() {
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  export CURRENT_LLM_SERVING_ENGINE=vllm
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
-  CURRENT_LLM_SERVING_ENGINE=vllm python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+
+  python3 -m pip install tabulate pandas
+  python3 $BENCHMARK_ROOT/scripts/summary-nightly-results.py
+  upload_to_buildkite
 
 }
 

From 7304668667d035710d2eb2686c381b87aeaaa063 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:38:56 -0700
Subject: [PATCH 032/150] update kickoff pipeline to initiate nightly benchmark

---
 .../nightly-benchmarks/kickoff-pipeline.sh    |  18 ++-
 .../nightly-benchmarks/nightly-pipeline.yaml  | 126 ++++++++++++++++++
 2 files changed, 138 insertions(+), 6 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/nightly-pipeline.yaml

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index 15d411febcee1..12e63e9c9278e 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -10,18 +10,24 @@ apt install -y curl jq
 # Install minijinja for templating
 curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
 source $HOME/.cargo/env
+local target_yaml_file=""
 
 # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
 if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
   PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
 
   if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
-    echo "This PR has the 'perf-benchmarks' label. Proceeding with the nightly benchmarks."
-  else
-    echo "This PR does not have the 'perf-benchmarks' label. Skipping the nightly benchmarks."
-    exit 0
+    echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
+    target_yaml_file=".buildkite/nightly-benchmarks/benchmark-pipeline.yaml"
+  fi
+
+  if [[ $PR_LABELS == *"comp-benchmarks"* ]]; then
+    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
+    target_yaml_file=".buildkite/nightly-benchmarks/nightly-pipeline.yaml"
   fi
 fi
 
-# Upload sample.yaml
-buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+if [ -n "$target_yaml_file" ]; then
+  # Upload sample.yaml
+  buildkite-agent pipeline upload $target_yaml_file
+fi
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
new file mode 100644
index 0000000000000..79092eeed4bca
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -0,0 +1,126 @@
+steps:
+  - label: "A100 trt benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+            command:
+            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - wait
+  - label: "A100 vllm benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: vllm/vllm-openai:latest
+            command:
+            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - wait
+  - label: "A100 tgi benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: ghcr.io/huggingface/text-generation-inference:2.0
+            command:
+            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - wait
+  - label: "A100 lmdeploy benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          containers:
+          - image: openmmlab/lmdeploy:latest
+            command:
+            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - wait
+  
\ No newline at end of file

From f811ef0832de5469ce04ca58392c52975fe8c918 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:39:47 -0700
Subject: [PATCH 033/150] update kickoff pipeline

---
 .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index 12e63e9c9278e..bf25aef70f5ac 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -10,7 +10,7 @@ apt install -y curl jq
 # Install minijinja for templating
 curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
 source $HOME/.cargo/env
-local target_yaml_file=""
+target_yaml_file=""
 
 # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
 if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then

From 1876048c51fb9d5a9ea515ba248b017d5471ee46 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 15:44:24 -0700
Subject: [PATCH 034/150] update the label name

---
 .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index bf25aef70f5ac..8a4f852477713 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -21,7 +21,7 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
     target_yaml_file=".buildkite/nightly-benchmarks/benchmark-pipeline.yaml"
   fi
 
-  if [[ $PR_LABELS == *"comp-benchmarks"* ]]; then
+  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
     echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
     target_yaml_file=".buildkite/nightly-benchmarks/nightly-pipeline.yaml"
   fi

From 0a4518dda8823e16a8889eca259ec386aa5ad860 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 22:21:18 -0700
Subject: [PATCH 035/150] bug fix: exit benchmarking script after finishing
 benchmarking one application

---
 .buildkite/nightly-benchmarks/run-nightly-suite.sh          | 6 +++++-
 .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh    | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh    | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh   | 2 +-
 5 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 3e938a87a1bfb..4f09c9e8423f8 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -47,24 +47,28 @@ main() {
     if which lmdeploy >/dev/null; then
         echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
         bash ../.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+        exit 0
     fi
 
     # run tgi
     if [ -e /tgi-entrypoint.sh ]; then
         echo "tgi is available, redirect to run-tgi-nightly.sh"
         bash ../.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+        exit 0
     fi
 
     # run trt
     if which trtllm-build >/dev/null; then
         echo "trtllm is available, redirect to run-trt-nightly.sh"
         bash ../.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+        exit 0
     fi
 
     # run vllm
-    if python3 -c "import vllm" &> /dev/null; then
+    if [ -e /vllm-workspace ]; then
         echo "vllm is available, redirect to run-vllm-nightly.sh"
         bash ../.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+        exit 0
     fi
 
 }
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 79cf30f924746..c95b8845be1c6 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -171,7 +171,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "lmdeploy-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 398e2017f9b56..23cbafa5c2edc 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -159,7 +159,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "tgi-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 71badb8ebf647..ec0d99570f08f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -246,7 +246,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "trt-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 3ce8f7e3bb4d8..021a8dc170d1c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -177,7 +177,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "vllm-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 

From f47db88c31a2315080c918e2f8bdee343924c7a6 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 20 Jun 2024 22:22:55 -0700
Subject: [PATCH 036/150] make yapf, ruff and isort happy

---
 .../scripts/download-tokenizer.py             | 19 +++++++++++++------
 .../scripts/summary-nightly-results.py        | 12 ++----------
 2 files changed, 15 insertions(+), 16 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index add331bfbd9f3..68ac5909e5951 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -1,19 +1,26 @@
-
 import argparse
-from pathlib import Path
+
 from transformers import AutoTokenizer
 
+
 def main(model, cachedir):
     # Load the tokenizer and save it to the specified directory
     tokenizer = AutoTokenizer.from_pretrained(model)
     tokenizer.save_pretrained(cachedir)
     print(f"Tokenizer saved to {cachedir}")
 
+
 if __name__ == "__main__":
-    parser = argparse.ArgumentParser(description="Download and save Hugging Face tokenizer")
-    parser.add_argument("--model", type=str, required=True, help="Name of the model")
-    parser.add_argument("--cachedir", type=str, required=True, help="Directory to save the tokenizer")
+    parser = argparse.ArgumentParser(
+        description="Download and save Hugging Face tokenizer")
+    parser.add_argument("--model",
+                        type=str,
+                        required=True,
+                        help="Name of the model")
+    parser.add_argument("--cachedir",
+                        type=str,
+                        required=True,
+                        help="Directory to save the tokenizer")
 
     args = parser.parse_args()
     main(args.model, args.cachedir)
-    
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index ef8239ed94472..ced57295f735e 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -28,9 +28,6 @@
     "engine": "Engine",
 }
 
-
-
-
 if __name__ == "__main__":
 
     # collect results
@@ -39,7 +36,6 @@
         with open(test_file, "r") as f:
             raw_result = json.loads(f.read())
 
-            
         # attach the benchmarking command to raw_result
         with open(test_file.with_suffix(".commands"), "r") as f:
             command = json.loads(f.read())
@@ -52,28 +48,24 @@
         serving_results.append(raw_result)
         continue
 
-
     serving_results = pd.DataFrame.from_dict(serving_results)
 
-
     if not serving_results.empty:
         serving_results = serving_results[list(
             serving_column_mapping.keys())].rename(
                 columns=serving_column_mapping)
-            
+
     serving_md_table = tabulate(serving_results,
                                 headers='keys',
                                 tablefmt='pipe',
                                 showindex=False)
-            
-            
+
     prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 
     # document benchmarking results in markdown
     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
         f.write(serving_md_table)
         f.write('\n')
-        
 
     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:

From 8f4da1b2a33217d9f43d48f65b09a988d0524087 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Fri, 21 Jun 2024 16:20:43 -0700
Subject: [PATCH 037/150] give nightly pipeline higher priority

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 79092eeed4bca..536ad4b0b5235 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -5,6 +5,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
             command:
@@ -36,6 +37,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: vllm/vllm-openai:latest
             command:
@@ -67,6 +69,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: ghcr.io/huggingface/text-generation-inference:2.0
             command:
@@ -98,6 +101,7 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
+          priorityClassName: perf-benchmark
           containers:
           - image: openmmlab/lmdeploy:latest
             command:

From 2cbdac3af8603b4538b371d5a3826292e6f9aa60 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 22 Jun 2024 11:59:46 -0700
Subject: [PATCH 038/150] fix new bugs in latest lmdeploy docker

---
 .../nightly-benchmarks/scripts/get-lmdeploy-modelname.py    | 6 ++++++
 .../nightly-benchmarks/scripts/get_lmdeploy_modelname.py    | 6 ++++++
 .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh      | 4 ++++
 .buildkite/nightly-benchmarks/tests/nightly-tests.json      | 1 -
 4 files changed, 16 insertions(+), 1 deletion(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
 create mode 100644 .buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py

diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
new file mode 100644
index 0000000000000..1f7ecb306c575
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -0,0 +1,6 @@
+
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py b/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py
new file mode 100644
index 0000000000000..1f7ecb306c575
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py
@@ -0,0 +1,6 @@
+
+from lmdeploy.serve.openai.api_client import APIClient
+api_client = APIClient("http://localhost:8000")
+model_name = api_client.available_models[0]
+
+print(model_name)
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index c95b8845be1c6..559c92d4eebf4 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -116,6 +116,9 @@ run_serving_tests() {
       echo "vllm failed to start within the timeout period."
     fi
 
+    # get model name
+    model_name=$(python ../.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py)
+
     # iterate over different QPS
     for qps in $qps_list; do
       # remove the surrounding single quote from qps
@@ -134,6 +137,7 @@ run_serving_tests() {
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
         --request-rate $qps \
+        --model \"$model_name\" \
         $client_args"
 
       echo "Running test case $test_name with qps $qps"
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index bfff63ee1cb47..b12ac211a6c14 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -8,7 +8,6 @@
             "server_port": 8000
         },
         "lmdeploy_client_parameters": {
-            "model": "llama2",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,

From c24b963b8d1147817a6fc2b72ad22bec5c792edb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 23 Jun 2024 19:59:50 -0700
Subject: [PATCH 039/150] try llama 70B with tp 4

---
 .../scripts/get-lmdeploy-modelname.py         |  4 ++--
 .../scripts/get_lmdeploy_modelname.py         |  6 -----
 .../tests/nightly-tests.json                  | 24 +++++++++----------
 3 files changed, 14 insertions(+), 20 deletions(-)
 delete mode 100644 .buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py

diff --git a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
index 1f7ecb306c575..18bcc3a8714c4 100644
--- a/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
+++ b/.buildkite/nightly-benchmarks/scripts/get-lmdeploy-modelname.py
@@ -1,6 +1,6 @@
-
 from lmdeploy.serve.openai.api_client import APIClient
+
 api_client = APIClient("http://localhost:8000")
 model_name = api_client.available_models[0]
 
-print(model_name)
\ No newline at end of file
+print(model_name)
diff --git a/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py b/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py
deleted file mode 100644
index 1f7ecb306c575..0000000000000
--- a/.buildkite/nightly-benchmarks/scripts/get_lmdeploy_modelname.py
+++ /dev/null
@@ -1,6 +0,0 @@
-
-from lmdeploy.serve.openai.api_client import APIClient
-api_client = APIClient("http://localhost:8000")
-model_name = api_client.available_models[0]
-
-print(model_name)
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index b12ac211a6c14..64d29ffdc0d5e 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,10 +1,10 @@
 [
     {
-        "test_name": "llama8B_tp1_sharegpt",
+        "test_name": "llama70B_tp4_sharegpt",
         "qps_list": [1, 16],
-        "lmdeploy_server_model": "meta-llama/Llama-2-7b-hf",
+        "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct",
         "lmdeploy_server_parameters": {
-            "tp": 1,
+            "tp": 4,
             "server_port": 8000
         },
         "lmdeploy_client_parameters": {
@@ -14,12 +14,12 @@
             "port": 8000
         },
         "tgi_server_parameters": {
-            "model_id": "meta-llama/Llama-2-7b-hf",
-            "num_shard": 1,
+            "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "num_shard": 4,
             "port": 8000
         },
         "tgi_client_parameters": {
-            "model": "meta-llama/Llama-2-7b-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -27,18 +27,18 @@
             "endpoint": "/generate_stream"
         },
         "trt_server_parameters": {
-            "model_path": "meta-llama/llama-2-7b-chat-hf",
+            "model_path": "meta-llama/Meta-Llama-3-70B-Instruct",
             "model_name": "llama-2-7b-chat-hf",
             "model_type": "llama",
             "model_dtype": "float16",
-            "model_tp_size": 1,
+            "model_tp_size": 4,
             "max_batch_size": 256,
             "max_input_len": 10000,
             "max_output_len": 10000,
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
-            "model": "meta-llama/Llama-2-7b-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -46,13 +46,13 @@
             "endpoint": "/v2/models/ensemble/generate_stream"
         },
         "vllm_server_parameters": {
-            "model": "meta-llama/Llama-2-7b-hf",
-            "tensor_parallel_size": 1,
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
             "disable_log_stats": "",
             "disable_log_requests": ""
         },
         "vllm_client_parameters": {
-            "model": "meta-llama/Llama-2-7b-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From f79b6c425a8526fdb7c3452d81adef6be42a0cb3 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 23 Jun 2024 21:59:17 -0700
Subject: [PATCH 040/150] rebuild

---
 .buildkite/nightly-benchmarks/README.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/nightly-benchmarks/README.md b/.buildkite/nightly-benchmarks/README.md
index 4036b32a46bf7..c84e150934306 100644
--- a/.buildkite/nightly-benchmarks/README.md
+++ b/.buildkite/nightly-benchmarks/README.md
@@ -1,5 +1,6 @@
 # vLLM benchmark suite
 
+
 ## Introduction
 
 This directory contains the performance benchmarking CI for vllm.

From dfb77f436756b93bcb8952d8ed285dc694c644a2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 23 Jun 2024 22:32:16 -0700
Subject: [PATCH 041/150] use mixtral model to avoid exceeding the disk quota

---
 .../tests/nightly-tests.json                  | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 64d29ffdc0d5e..98ea73fd87371 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,10 +1,10 @@
 [
     {
-        "test_name": "llama70B_tp4_sharegpt",
+        "test_name": "mixtral8x7B_tp2_sharegpt",
         "qps_list": [1, 16],
-        "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct",
+        "lmdeploy_server_model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
         "lmdeploy_server_parameters": {
-            "tp": 4,
+            "tp": 2,
             "server_port": 8000
         },
         "lmdeploy_client_parameters": {
@@ -14,12 +14,12 @@
             "port": 8000
         },
         "tgi_server_parameters": {
-            "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "num_shard": 4,
+            "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "num_shard": 2,
             "port": 8000
         },
         "tgi_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -27,18 +27,18 @@
             "endpoint": "/generate_stream"
         },
         "trt_server_parameters": {
-            "model_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "model_name": "llama-2-7b-chat-hf",
             "model_type": "llama",
             "model_dtype": "float16",
-            "model_tp_size": 4,
+            "model_tp_size": 2,
             "max_batch_size": 256,
             "max_input_len": 10000,
             "max_output_len": 10000,
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -46,13 +46,13 @@
             "endpoint": "/v2/models/ensemble/generate_stream"
         },
         "vllm_server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tensor_parallel_size": 4,
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tensor_parallel_size": 2,
             "disable_log_stats": "",
             "disable_log_requests": ""
         },
         "vllm_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From d2e4171f2a343794a469b3cc0fd122b44378d92c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 23 Jun 2024 23:25:46 -0700
Subject: [PATCH 042/150] remove wait steps from nightly pipeline

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 536ad4b0b5235..47505ff64e0fb 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -30,7 +30,6 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - wait
   - label: "A100 vllm benchmark"
     agents:
       queue: A100
@@ -62,7 +61,6 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - wait
   - label: "A100 tgi benchmark"
     agents:
       queue: A100
@@ -94,7 +92,6 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - wait
   - label: "A100 lmdeploy benchmark"
     agents:
       queue: A100
@@ -126,5 +123,4 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - wait
   
\ No newline at end of file

From dc5219567946942834ed3db15b55357fc30f91e0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 11:04:09 -0700
Subject: [PATCH 043/150] temporarily remove trt pipeline --- disk quota
 exceeded

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 62 +++++++++----------
 1 file changed, 31 insertions(+), 31 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 47505ff64e0fb..8dace4b605889 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,35 +1,35 @@
 steps:
-  - label: "A100 trt benchmark"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
+  # - label: "A100 trt benchmark"
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+  #           command:
+  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
   - label: "A100 vllm benchmark"
     agents:
       queue: A100

From 8ffd8b15144e4a6b462c143d3436f1e96d07eac0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 12:13:51 -0700
Subject: [PATCH 044/150] fall back to 70B and test the storage required

---
 .../tests/nightly-tests.json                  | 24 +++++++++----------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 98ea73fd87371..aeeb0a609c73d 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,10 +1,10 @@
 [
     {
-        "test_name": "mixtral8x7B_tp2_sharegpt",
+        "test_name": "llama70B_tp4",
         "qps_list": [1, 16],
-        "lmdeploy_server_model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+        "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct",
         "lmdeploy_server_parameters": {
-            "tp": 2,
+            "tp": 4,
             "server_port": 8000
         },
         "lmdeploy_client_parameters": {
@@ -14,12 +14,12 @@
             "port": 8000
         },
         "tgi_server_parameters": {
-            "model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "num_shard": 2,
+            "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "num_shard": 4,
             "port": 8000
         },
         "tgi_client_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -27,18 +27,18 @@
             "endpoint": "/generate_stream"
         },
         "trt_server_parameters": {
-            "model_path": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "model_path": "meta-llama/Meta-Llama-3-70B-Instruct",
             "model_name": "llama-2-7b-chat-hf",
             "model_type": "llama",
             "model_dtype": "float16",
-            "model_tp_size": 2,
+            "model_tp_size": 4,
             "max_batch_size": 256,
             "max_input_len": 10000,
             "max_output_len": 10000,
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -46,13 +46,13 @@
             "endpoint": "/v2/models/ensemble/generate_stream"
         },
         "vllm_server_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tensor_parallel_size": 2,
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tensor_parallel_size": 4,
             "disable_log_stats": "",
             "disable_log_requests": ""
         },
         "vllm_client_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From 313f54f25f57d40bea5e05ed9f1e133c557f5ecc Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 12:15:39 -0700
Subject: [PATCH 045/150] use llama-2 as I do not have llama3 access

---
 .../nightly-benchmarks/tests/nightly-tests.json    | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index aeeb0a609c73d..437451d88bc6b 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -2,7 +2,7 @@
     {
         "test_name": "llama70B_tp4",
         "qps_list": [1, 16],
-        "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct",
+        "lmdeploy_server_model": "meta-llama/Llama-2-70b-chat-hf",
         "lmdeploy_server_parameters": {
             "tp": 4,
             "server_port": 8000
@@ -14,12 +14,12 @@
             "port": 8000
         },
         "tgi_server_parameters": {
-            "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model_id": "meta-llama/Llama-2-70b-chat-hf",
             "num_shard": 4,
             "port": 8000
         },
         "tgi_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Llama-2-70b-chat-hf",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -27,7 +27,7 @@
             "endpoint": "/generate_stream"
         },
         "trt_server_parameters": {
-            "model_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model_path": "meta-llama/Llama-2-70b-chat-hf",
             "model_name": "llama-2-7b-chat-hf",
             "model_type": "llama",
             "model_dtype": "float16",
@@ -38,7 +38,7 @@
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Llama-2-70b-chat-hf",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -46,13 +46,13 @@
             "endpoint": "/v2/models/ensemble/generate_stream"
         },
         "vllm_server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Llama-2-70b-chat-hf",
             "tensor_parallel_size": 4,
             "disable_log_stats": "",
             "disable_log_requests": ""
         },
         "vllm_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model": "meta-llama/Llama-2-70b-chat-hf",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From 62b2407df2fc7565eae79078689477f8125441eb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 12:20:23 -0700
Subject: [PATCH 046/150] fix trt model name (7b -> 70b)

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 437451d88bc6b..e0313a1e32afb 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -28,7 +28,7 @@
         },
         "trt_server_parameters": {
             "model_path": "meta-llama/Llama-2-70b-chat-hf",
-            "model_name": "llama-2-7b-chat-hf",
+            "model_name": "llama-2-70b-chat-hf",
             "model_type": "llama",
             "model_dtype": "float16",
             "model_tp_size": 4,

From 785d246179e2c8a13221065ea1a1f52824959ee2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 17:56:21 -0700
Subject: [PATCH 047/150] try llama 3 70B and re-enable trt pipeline

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 62 +++++++++----------
 .../tests/nightly-tests.json                  | 16 ++---
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 8dace4b605889..47505ff64e0fb 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,35 +1,35 @@
 steps:
-  # - label: "A100 trt benchmark"
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-  #           command:
-  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
+  - label: "A100 trt benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+            command:
+            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
   - label: "A100 vllm benchmark"
     agents:
       queue: A100
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index e0313a1e32afb..6515e73de76e2 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -2,7 +2,7 @@
     {
         "test_name": "llama70B_tp4",
         "qps_list": [1, 16],
-        "lmdeploy_server_model": "meta-llama/Llama-2-70b-chat-hf",
+        "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct",
         "lmdeploy_server_parameters": {
             "tp": 4,
             "server_port": 8000
@@ -14,12 +14,12 @@
             "port": 8000
         },
         "tgi_server_parameters": {
-            "model_id": "meta-llama/Llama-2-70b-chat-hf",
+            "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
             "num_shard": 4,
             "port": 8000
         },
         "tgi_client_parameters": {
-            "model": "meta-llama/Llama-2-70b-chat-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -27,8 +27,8 @@
             "endpoint": "/generate_stream"
         },
         "trt_server_parameters": {
-            "model_path": "meta-llama/Llama-2-70b-chat-hf",
-            "model_name": "llama-2-70b-chat-hf",
+            "model_path": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "model_name": "Meta-Llama-3-70B-Instruct",
             "model_type": "llama",
             "model_dtype": "float16",
             "model_tp_size": 4,
@@ -38,7 +38,7 @@
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
-            "model": "meta-llama/Llama-2-70b-chat-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
@@ -46,13 +46,13 @@
             "endpoint": "/v2/models/ensemble/generate_stream"
         },
         "vllm_server_parameters": {
-            "model": "meta-llama/Llama-2-70b-chat-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "tensor_parallel_size": 4,
             "disable_log_stats": "",
             "disable_log_requests": ""
         },
         "vllm_client_parameters": {
-            "model": "meta-llama/Llama-2-70b-chat-hf",
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
             "backend": "vllm",
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From 7fd891ec72b7f6b97aa31b788eb8f4ba340670d6 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 18:37:47 -0700
Subject: [PATCH 048/150] check file system size

---
 .buildkite/nightly-benchmarks/run-nightly-suite.sh | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 4f09c9e8423f8..1eab836b5503c 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -34,6 +34,8 @@ main() {
     check_gpus
     check_hf_token
 
+    df -h
+
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
     cd /

From 52cf795b0505f3bdc23cf2ebaec84d3e23c1f4f2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 22:26:04 -0700
Subject: [PATCH 049/150] update code for removing cache

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 1 +
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh      | 1 +
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh      | 2 ++
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh     | 1 +
 4 files changed, 5 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 559c92d4eebf4..7e9398c892c3d 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -163,6 +163,7 @@ run_serving_tests() {
 
     # clean up
     kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
   done
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 23cbafa5c2edc..e2a21a17fa56d 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -146,6 +146,7 @@ run_serving_tests() {
 
     # clean up
     kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
   done
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index ec0d99570f08f..f1092b2b3afbf 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -122,6 +122,7 @@ run_trt_server() {
     --output_dir=${trt_engine_path} 
 
   cd /tensorrtllm_backend/triton_model_repo
+  rm -rf ./tensorrt_llm/1/*
   cp -r ${trt_engine_path}/* ./tensorrt_llm/1
   cd /tensorrtllm_backend
   python3 scripts/launch_triton_server.py \
@@ -235,6 +236,7 @@ run_serving_tests() {
 
     # clean up
     kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
   done
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 021a8dc170d1c..492f03ddd1cb5 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -165,6 +165,7 @@ run_serving_tests() {
 
     # clean up
     kill_gpu_processes
+    rm -rf /root/.cache/huggingface/*
   done
 }
 

From b8d1c9432daa00575816be5ce2aaebbddc7e7a8a Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 23:03:49 -0700
Subject: [PATCH 050/150] merge common parameters

---
 .../scripts/run-lmdeploy-nightly.sh           | 25 +++++++++---
 .../scripts/run-tgi-nightly.sh                | 28 +++++++++++--
 .../scripts/run-trt-nightly.sh                | 36 ++++++++++++-----
 .../scripts/run-vllm-nightly.sh               | 28 ++++++++-----
 .../tests/nightly-tests.json                  | 40 +++++--------------
 5 files changed, 98 insertions(+), 59 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 7e9398c892c3d..b7224cd790d90 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -72,13 +72,21 @@ run_serving_tests() {
     
     # append lmdeploy to the test name
     test_name=lmdeploy_$test_name
+    
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
 
     
 
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.lmdeploy_server_parameters')
     client_params=$(echo "$params" | jq -r '.lmdeploy_client_parameters')
-    model=$(echo "$params" | jq -r '.lmdeploy_server_model')
     server_args=$(json2args "$server_params")
     client_args=$(json2args "$client_params")
     qps_list=$(echo "$params" | jq -r '.qps_list')
@@ -86,7 +94,6 @@ run_serving_tests() {
     echo "Running over qps list $qps_list"
 
     # check if there is enough GPU to run the test
-    tp=$(echo "$server_params" | jq -r '.tp')
     if [[ $gpu_count -lt $tp ]]; then
       echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
       continue
@@ -99,7 +106,10 @@ run_serving_tests() {
       --model "$model" \
       --cachedir /tokenizer_cache
 
-    server_command="lmdeploy serve api_server $model $server_args"
+    server_command="lmdeploy serve api_server $model \
+      --tp $tp \
+      --server-port $port \
+      $server_args"
 
     # run the server
     echo "Running test case $test_name"
@@ -110,10 +120,11 @@ run_serving_tests() {
     wait_for_server
     if [ $? -eq 0 ]; then
       echo ""
-      echo "vllm server is up and running."
+      echo "lmdeploy server is up and running."
     else
       echo ""
-      echo "vllm failed to start within the timeout period."
+      echo "lmdeploy failed to start within the timeout period."
+      continue
     fi
 
     # get model name
@@ -133,6 +144,10 @@ run_serving_tests() {
       client_command="python3 benchmark_serving.py \
         --backend lmdeploy \
         --tokenizer /tokenizer_cache \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index e2a21a17fa56d..06ceb14b97fe6 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -61,8 +61,7 @@ run_serving_tests() {
   jq -c '.[]' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
-    # append tgi to the test name
-    test_name=tgi_$test_name
+    
 
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
@@ -70,6 +69,18 @@ run_serving_tests() {
       continue
     fi
 
+    # append tgi to the test name
+    test_name=tgi_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.tgi_server_parameters')
     client_params=$(echo "$params" | jq -r '.tgi_client_parameters')
@@ -80,14 +91,17 @@ run_serving_tests() {
     echo "Running over qps list $qps_list"
 
     # check if there is enough GPU to run the test
-    tp=$(echo "$server_params" | jq -r '.num_shard')
     if [[ $gpu_count -lt $tp ]]; then
       echo "Required num-shard $tp but only $gpu_count GPU found. Skip testcase $test_name."
       continue
     fi
 
 
-    server_command="/tgi-entrypoint.sh $server_args"
+    server_command="/tgi-entrypoint.sh \
+      --model-id $model \
+      --num-shard $tp \
+      --port $port \
+      $server_args"
 
     # run the server
     echo "Running test case $test_name"
@@ -102,6 +116,7 @@ run_serving_tests() {
     else
       echo ""
       echo "tgi failed to start within the timeout period."
+      continue
     fi
 
     # iterate over different QPS
@@ -117,6 +132,11 @@ run_serving_tests() {
 
       client_command="python3 benchmark_serving.py \
         --backend tgi \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index f1092b2b3afbf..742011cd587f7 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -54,12 +54,15 @@ wait_for_server() {
 run_trt_server() {
 
   params=$1
+  common_params=$2
 
-  model_name=$(echo "$params" | jq -r '.model_name')
-  model_path=$(echo "$params" | jq -r '.model_path')
+
+
+  model_path=$(echo "$common_params" | jq -r '.model')
+  model_name="${model_path#*/}"
   model_type=$(echo "$params" | jq -r '.model_type')
   model_dtype=$(echo "$params" | jq -r '.model_dtype')
-  model_tp_size=$(echo "$params" | jq -r '.model_tp_size')
+  model_tp_size=$(echo "$common_params" | jq -r '.tp')
   max_batch_size=$(echo "$params" | jq -r '.max_batch_size')
   max_input_len=$(echo "$params" | jq -r '.max_input_len')
   max_output_len=$(echo "$params" | jq -r '.max_output_len')
@@ -141,26 +144,34 @@ run_serving_tests() {
   jq -c '.[]' "$serving_test_file" | while read -r params; do
     # get the test name, and append the GPU type back to it.
     test_name=$(echo "$params" | jq -r '.test_name')
-    # append trt to the test name
-    test_name=trt_$test_name
-
+    
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
       continue
     fi
 
+    # append trt to the test name
+    test_name=trt_$test_name
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
+
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.trt_server_parameters')
     client_params=$(echo "$params" | jq -r '.trt_client_parameters')
-    model=$(echo "$client_params" | jq -r '.model')
     client_args=$(json2args "$client_params")
     qps_list=$(echo "$params" | jq -r '.qps_list')
     qps_list=$(echo "$qps_list" | jq -r '.[] | @sh')
     echo "Running over qps list $qps_list"
 
     # check if there is enough GPU to run the test
-    tp=$(echo "$server_params" | jq -r '.model_tp_size')
     if [[ $gpu_count -lt $tp ]]; then
       echo "Required model_tp_size $tp but only $gpu_count GPU found. Skip testcase $test_name."
       continue
@@ -178,7 +189,7 @@ run_serving_tests() {
 
     # run the server
     echo "Running test case $test_name"
-    run_trt_server "$server_params"
+    run_trt_server "$server_params" "$common_params"
 
     # wait until the server is alive
     wait_for_server
@@ -188,6 +199,7 @@ run_serving_tests() {
     else
       echo ""
       echo "trt failed to start within the timeout period."
+      continue
     fi
 
     # go back to vllm benchmarking directory
@@ -207,6 +219,11 @@ run_serving_tests() {
       client_command="python3 benchmark_serving.py \
         --backend tensorrt-llm \
         --tokenizer /tokenizer_cache \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
@@ -218,6 +235,7 @@ run_serving_tests() {
 
       eval "$client_command"
 
+      server_command=""
       # record the benchmarking commands
       jq_output=$(jq -n \
         --arg server "$server_command" \
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 492f03ddd1cb5..970680d7293e1 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -77,7 +77,15 @@ run_serving_tests() {
     # append vllm to the test name
     test_name=vllm_$test_name
 
-    
+
+    # get common parameters
+    common_params=$(echo "$params" | jq -r '.common_parameters')
+    model=$(echo "$common_params" | jq -r '.model')
+    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
+    dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
+    port=$(echo "$common_params" | jq -r '.port')
+    num_prompts=$(echo "$common_params" | jq -r '.num_prompts')
 
     # get client and server arguments
     server_params=$(echo "$params" | jq -r '.vllm_server_parameters')
@@ -89,23 +97,17 @@ run_serving_tests() {
     echo "Running over qps list $qps_list"
 
     # check if there is enough GPU to run the test
-    tp=$(echo "$server_params" | jq -r '.tensor_parallel_size')
     if [[ $gpu_count -lt $tp ]]; then
       echo "Required tensor-parallel-size $tp but only $gpu_count GPU found. Skip testcase $test_name."
       continue
     fi
 
-    # check if server model and client model is aligned
-    server_model=$(echo "$server_params" | jq -r '.model')
-    client_model=$(echo "$client_params" | jq -r '.model')
-    if [[ $server_model != "$client_model" ]]; then
-      echo "Server model and client model must be the same. Skip testcase $test_name."
-      continue
-    fi
-
 
     server_command="python3 \
       -m vllm.entrypoints.openai.api_server \
+      -tp $tp \
+      --model $model \
+      --port $port \
       $server_args"
 
     # run the server
@@ -121,6 +123,7 @@ run_serving_tests() {
     else
       echo ""
       echo "vllm failed to start within the timeout period."
+      continue
     fi
 
     # iterate over different QPS
@@ -136,6 +139,11 @@ run_serving_tests() {
 
       client_command="python3 benchmark_serving.py \
         --backend vllm \
+        --model $model \
+        --dataset-name $dataset_name \
+        --dataset-path $dataset_path \
+        --num-prompts $num_prompts \
+        --port $port \
         --save-result \
         --result-dir $RESULTS_FOLDER \
         --result-filename ${new_test_name}.json \
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 6515e73de76e2..0e0385bd01204 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,62 +1,40 @@
 [
     {
-        "test_name": "llama70B_tp4",
-        "qps_list": [1, 16],
-        "lmdeploy_server_model": "meta-llama/Meta-Llama-3-70B-Instruct",
-        "lmdeploy_server_parameters": {
-            "tp": 4,
-            "server_port": 8000
-        },
-        "lmdeploy_client_parameters": {
+        "test_name": "llama8B_tp1",
+        "qps_list": [1, 8],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 200,
             "port": 8000
         },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
         "tgi_server_parameters": {
-            "model_id": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "num_shard": 4,
-            "port": 8000
         },
         "tgi_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "port": 8000,
             "endpoint": "/generate_stream"
         },
         "trt_server_parameters": {
-            "model_path": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "model_name": "Meta-Llama-3-70B-Instruct",
             "model_type": "llama",
             "model_dtype": "float16",
-            "model_tp_size": 4,
             "max_batch_size": 256,
             "max_input_len": 10000,
             "max_output_len": 10000,
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
-            "port": 8000,
             "endpoint": "/v2/models/ensemble/generate_stream"
         },
         "vllm_server_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tensor_parallel_size": 4,
             "disable_log_stats": "",
             "disable_log_requests": ""
         },
         "vllm_client_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "backend": "vllm",
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200
         }
     }
 ]
\ No newline at end of file

From 14fb6500b0190823a98c6e9e4d7e874ccdb7d9b9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 23:19:28 -0700
Subject: [PATCH 051/150] fix typo

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index b7224cd790d90..94cf83f2f4641 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -76,7 +76,7 @@ run_serving_tests() {
     # get common parameters
     common_params=$(echo "$params" | jq -r '.common_parameters')
     model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$common_params" | jq -r '.tp')
     dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
     dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
     port=$(echo "$common_params" | jq -r '.port')
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 06ceb14b97fe6..a9e31219e1955 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -75,7 +75,7 @@ run_serving_tests() {
     # get common parameters
     common_params=$(echo "$params" | jq -r '.common_parameters')
     model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$common_params" | jq -r '.tp')
     dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
     dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
     port=$(echo "$common_params" | jq -r '.port')
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 742011cd587f7..65b0706b6e426 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -157,7 +157,7 @@ run_serving_tests() {
     # get common parameters
     common_params=$(echo "$params" | jq -r '.common_parameters')
     model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$common_params" | jq -r '.tp')
     dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
     dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
     port=$(echo "$common_params" | jq -r '.port')
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 970680d7293e1..232743b36b5b4 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -81,7 +81,7 @@ run_serving_tests() {
     # get common parameters
     common_params=$(echo "$params" | jq -r '.common_parameters')
     model=$(echo "$common_params" | jq -r '.model')
-    tp=$(echo "$common_params" | jq -r '.tensor_parallel_size')
+    tp=$(echo "$common_params" | jq -r '.tp')
     dataset_name=$(echo "$common_params" | jq -r '.dataset_name')
     dataset_path=$(echo "$common_params" | jq -r '.dataset_path')
     port=$(echo "$common_params" | jq -r '.port')

From 3aba28a9823445d063f52b5cc08e2f7b395da861 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 24 Jun 2024 23:24:30 -0700
Subject: [PATCH 052/150] reduce qps to 8, just for testing

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 0e0385bd01204..7422eac46b482 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,7 +1,7 @@
 [
     {
         "test_name": "llama8B_tp1",
-        "qps_list": [1, 8],
+        "qps_list": [8],
         "common_parameters": {
             "model": "meta-llama/Meta-Llama-3-8B",
             "tp": 1,

From 733ac33c60cb47e5f2f016c5a10e6e26d0ce1115 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 00:03:49 -0700
Subject: [PATCH 053/150] append to the same context

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 94cf83f2f4641..1ce24c389de70 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -191,7 +191,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "lmdeploy-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index a9e31219e1955..099aa94549cbd 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -180,7 +180,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "tgi-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 65b0706b6e426..77340a1e15098 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -266,7 +266,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "trt-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 232743b36b5b4..163f7e10b44e7 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -186,7 +186,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "vllm-benchmark-results" < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 

From 5e1ec4b727628d89fab3928f744f5728d635638d Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 00:16:50 -0700
Subject: [PATCH 054/150] optimize for buildkite annotation

---
 .../nightly-benchmarks/nightly-descriptions.md | 18 ++++++++++++++++++
 .../nightly-benchmarks/run-nightly-suite.sh    |  8 ++++++++
 .../scripts/run-lmdeploy-nightly.sh            |  2 +-
 .../scripts/run-tgi-nightly.sh                 |  2 +-
 .../scripts/run-trt-nightly.sh                 |  2 +-
 .../scripts/run-vllm-nightly.sh                |  2 +-
 6 files changed, 30 insertions(+), 4 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/nightly-descriptions.md

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
new file mode 100644
index 0000000000000..e382433be8488
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -0,0 +1,18 @@
+
+# Nightly benchmark
+
+The goal of this benchmark is two-fold:
+- Performance clarity: show which engine (vllm, tensorrt-llm, lmdeploy or tgi) leads in performance under which workload.
+- Reproducibility: anyone can run the exact same set of benchmarking commands inside the exact same docker container by following the reproduction instructions in [reproduce.md]().
+
+
+## Workload description
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
+
+- Input length: the input lengths of 1000 prompts randomly sampled from the ShareGPT dataset (with a fixed random seed).
+- Output length: the corresponding output lengths of these 1000 prompts.
+- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
+- Average QPS (query per second): 4, 8 for 8B model and 1, 4 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 1eab836b5503c..74f02e3035728 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -45,6 +45,14 @@ main() {
     cd benchmarks
     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
 
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip uploading the results."
+        return 0
+    else
+        /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md
+    fi
+    
+
     # run lmdeploy
     if which lmdeploy >/dev/null; then
         echo "lmdeploy is available, redirect to run-lmdeploy-nightly.sh"
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 1ce24c389de70..7a3c06451ea7b 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -191,7 +191,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 099aa94549cbd..48aa56a2799c4 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -180,7 +180,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 77340a1e15098..b9db6ac4dff5b 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -266,7 +266,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 163f7e10b44e7..ab3df9def635f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -186,7 +186,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 

From 24231061a00cda9eb76cc866741c545415345b83 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 00:17:05 -0700
Subject: [PATCH 055/150] add double enter for md table

---
 .../nightly-benchmarks/scripts/summary-nightly-results.py       | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index ced57295f735e..c12ae985518fa 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -65,7 +65,7 @@
     # document benchmarking results in markdown
     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
         f.write(serving_md_table)
-        f.write('\n')
+        f.write('\n\n')
 
     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:

From d3f970184dbd80e1f1f683c752d756fe68c138d9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 00:31:16 -0700
Subject: [PATCH 056/150] format adjust for markdown presentation

---
 .buildkite/nightly-benchmarks/nightly-results-header.md  | 2 ++
 .buildkite/nightly-benchmarks/run-nightly-suite.sh       | 1 +
 .../scripts/summary-nightly-results.py                   | 9 ++++++---
 3 files changed, 9 insertions(+), 3 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/nightly-results-header.md

diff --git a/.buildkite/nightly-benchmarks/nightly-results-header.md b/.buildkite/nightly-benchmarks/nightly-results-header.md
new file mode 100644
index 0000000000000..fedbd9e29fce8
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/nightly-results-header.md
@@ -0,0 +1,2 @@
+| Test name             | GPU            |   Successful req. |   Tput (req/s) |   Mean TTFT (ms) |   Median TTFT (ms) |   P99 TTFT (ms) |   Mean ITL (ms) |   Median ITL (ms) |   P99 ITL (ms) | Engine   |
+|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------|
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 74f02e3035728..e07367ef9653e 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -50,6 +50,7 @@ main() {
         return 0
     else
         /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md
+        /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < /vllm/.buildkite/nightly-benchmarks/nightly-results-header.md
     fi
     
 
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index c12ae985518fa..d25a97e47d409 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -55,17 +55,20 @@
             serving_column_mapping.keys())].rename(
                 columns=serving_column_mapping)
 
-    serving_md_table = tabulate(serving_results,
+    serving_md_table_with_headers = tabulate(serving_results,
                                 headers='keys',
                                 tablefmt='pipe',
                                 showindex=False)
+    # remove the first line of header
+    serving_md_table_lines = serving_md_table_with_headers.split('\n')
+    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
 
     prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 
     # document benchmarking results in markdown
     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        f.write(serving_md_table)
-        f.write('\n\n')
+        f.write(serving_md_table_without_header)
+        f.write('\n')
 
     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:

From c4d651b654f27e52cd14aacf0f5660b2f700ea96 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 00:46:29 -0700
Subject: [PATCH 057/150] move header to the description file

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md   | 6 ++++++
 .buildkite/nightly-benchmarks/nightly-results-header.md | 2 --
 .buildkite/nightly-benchmarks/run-nightly-suite.sh      | 1 -
 3 files changed, 6 insertions(+), 3 deletions(-)
 delete mode 100644 .buildkite/nightly-benchmarks/nightly-results-header.md

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index e382433be8488..edcbeb8db10c4 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -16,3 +16,9 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Average QPS (query per second): 4, 8 for 8B model and 1, 4 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
 - Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+
+
+## Results
+
+| Test name             | GPU            |   Successful req. |   Tput (req/s) |   Mean TTFT (ms) |   Median TTFT (ms) |   P99 TTFT (ms) |   Mean ITL (ms) |   Median ITL (ms) |   P99 ITL (ms) | Engine   |
+|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------|
diff --git a/.buildkite/nightly-benchmarks/nightly-results-header.md b/.buildkite/nightly-benchmarks/nightly-results-header.md
deleted file mode 100644
index fedbd9e29fce8..0000000000000
--- a/.buildkite/nightly-benchmarks/nightly-results-header.md
+++ /dev/null
@@ -1,2 +0,0 @@
-| Test name             | GPU            |   Successful req. |   Tput (req/s) |   Mean TTFT (ms) |   Median TTFT (ms) |   P99 TTFT (ms) |   Mean ITL (ms) |   Median ITL (ms) |   P99 ITL (ms) | Engine   |
-|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------|
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index e07367ef9653e..74f02e3035728 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -50,7 +50,6 @@ main() {
         return 0
     else
         /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md
-        /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" --append < /vllm/.buildkite/nightly-benchmarks/nightly-results-header.md
     fi
     
 

From 25c5a2f694945c41aab0023e45b0fb2bed7f8743 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 01:08:08 -0700
Subject: [PATCH 058/150] separate annotation to a new step

---
 .../nightly-benchmarks/run-nightly-suite.sh   |  7 ------
 .../scripts/nightly-annotate.sh               | 24 +++++++++++++++++++
 2 files changed, 24 insertions(+), 7 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh

diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index 74f02e3035728..e608211391b93 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -44,13 +44,6 @@ main() {
     git checkout kuntai-benchmark-dev
     cd benchmarks
     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
-
-    if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip uploading the results."
-        return 0
-    else
-        /workspace/buildkite-agent annotate --style "info" --context "header" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md
-    fi
     
 
     # run lmdeploy
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
new file mode 100644
index 0000000000000..78dc66a273ed7
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -0,0 +1,24 @@
+#!/bin/bash
+
+set -ex
+set -o pipefail
+
+main() {
+
+    (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
+    (which jq) || (apt-get update && apt-get -y install jq)
+    cd /
+    git clone https://github.com/KuntaiDu/vllm.git
+    cd vllm
+    git checkout kuntai-benchmark-dev
+
+    if [ ! -f /workspace/buildkite-agent ]; then
+        echo "buildkite-agent binary not found. Skip uploading the results."
+        return 0
+    else
+        /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md
+    fi
+    
+}
+
+main "$@"
\ No newline at end of file

From 5183fea491cd9dbf9ffa24b355c594a8b2b30ab5 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 01:09:09 -0700
Subject: [PATCH 059/150] add extra step to annotate pipeline

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 31 +++++++++++++++++++
 1 file changed, 31 insertions(+)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 47505ff64e0fb..872b718f45361 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,4 +1,35 @@
 steps:
+  - label: "Annotate"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: vllm/vllm-openai:latest
+            command:
+            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh) && (bash nightly-annotate.sh)
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
   - label: "A100 trt benchmark"
     agents:
       queue: A100

From f0684af2ebba21545cc6da099cfea6359843d503 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 01:09:52 -0700
Subject: [PATCH 060/150] add wait

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 872b718f45361..0f72c979bbf32 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -30,6 +30,7 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
+  - wait
   - label: "A100 trt benchmark"
     agents:
       queue: A100

From 6c7ddf89db96cf78fbeabcf73c31151b3454dc27 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 01:30:49 -0700
Subject: [PATCH 061/150] bring back the full test case

---
 .../scripts/run-lmdeploy-nightly.sh           |  2 +-
 .../scripts/run-tgi-nightly.sh                |  4 +-
 .../scripts/run-trt-nightly.sh                |  4 +-
 .../scripts/run-vllm-nightly.sh               |  4 +-
 .../tests/nightly-tests.json                  | 82 ++++++++++++++++++-
 5 files changed, 86 insertions(+), 10 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 7a3c06451ea7b..de5a4e1231070 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -124,7 +124,7 @@ run_serving_tests() {
     else
       echo ""
       echo "lmdeploy failed to start within the timeout period."
-      continue
+      exit 0
     fi
 
     # get model name
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 48aa56a2799c4..38ca9e15b260c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -66,7 +66,7 @@ run_serving_tests() {
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
-      continue
+      exit 0
     fi
 
     # append tgi to the test name
@@ -116,7 +116,7 @@ run_serving_tests() {
     else
       echo ""
       echo "tgi failed to start within the timeout period."
-      continue
+      exit 0
     fi
 
     # iterate over different QPS
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index b9db6ac4dff5b..1542ac202bf59 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -148,7 +148,7 @@ run_serving_tests() {
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
-      continue
+      exit 0
     fi
 
     # append trt to the test name
@@ -199,7 +199,7 @@ run_serving_tests() {
     else
       echo ""
       echo "trt failed to start within the timeout period."
-      continue
+      exit 0
     fi
 
     # go back to vllm benchmarking directory
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index ab3df9def635f..86ab0105647e6 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -71,7 +71,7 @@ run_serving_tests() {
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
-      continue
+      exit 0
     fi
 
     # append vllm to the test name
@@ -123,7 +123,7 @@ run_serving_tests() {
     else
       echo ""
       echo "vllm failed to start within the timeout period."
-      continue
+      exit 0
     fi
 
     # iterate over different QPS
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 7422eac46b482..349ba6817452a 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,13 +1,13 @@
 [
     {
         "test_name": "llama8B_tp1",
-        "qps_list": [8],
+        "qps_list": [4,8],
         "common_parameters": {
             "model": "meta-llama/Meta-Llama-3-8B",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 200,
+            "num_prompts": 1000,
             "port": 8000
         },
         "lmdeploy_server_parameters": {
@@ -36,5 +36,81 @@
         },
         "vllm_client_parameters": {
         }
-    }
+    },
+    {
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2,4],
+        "common_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "mixtral",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 10000,
+            "max_output_len": 10000,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4",
+        "qps_list": [2,4],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 1000,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 10000,
+            "max_output_len": 10000,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
 ]
\ No newline at end of file

From 13f5d99cf9de90fcbd365cea588691551baad2bc Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 02:03:32 -0700
Subject: [PATCH 062/150] fix syntax error

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 349ba6817452a..573bd03114c64 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -112,5 +112,5 @@
         },
         "vllm_client_parameters": {
         }
-    },
+    }
 ]
\ No newline at end of file

From 11079c74ba21138a5bfca127d54c40743ba12dda Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 11:33:07 -0700
Subject: [PATCH 063/150] break when the server fails to start, so that the
 Buildkite upload still works

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 3 +--
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh      | 3 +--
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh      | 3 +--
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh     | 3 +--
 4 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index de5a4e1231070..6606af030ee2f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -ex
 set -o pipefail
 
 check_gpus() {
@@ -124,7 +123,7 @@ run_serving_tests() {
     else
       echo ""
       echo "lmdeploy failed to start within the timeout period."
-      exit 0
+      break
     fi
 
     # get model name
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 38ca9e15b260c..edc5ea8573319 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -ex
 set -o pipefail
 
 check_gpus() {
@@ -116,7 +115,7 @@ run_serving_tests() {
     else
       echo ""
       echo "tgi failed to start within the timeout period."
-      exit 0
+      break
     fi
 
     # iterate over different QPS
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 1542ac202bf59..f6c8375e0ca76 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -ex
 set -o pipefail
 
 check_gpus() {
@@ -199,7 +198,7 @@ run_serving_tests() {
     else
       echo ""
       echo "trt failed to start within the timeout period."
-      exit 0
+      break
     fi
 
     # go back to vllm benchmarking directory
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 86ab0105647e6..3e79f4b24aa19 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -ex
 set -o pipefail
 
 check_gpus() {
@@ -123,7 +122,7 @@ run_serving_tests() {
     else
       echo ""
       echo "vllm failed to start within the timeout period."
-      exit 0
+      break
     fi
 
     # iterate over different QPS

From 0ed8131d003a607a4c22daa4e6d2114fb141621b Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 11:34:28 -0700
Subject: [PATCH 064/150] make yapf happy

---
 .../nightly-benchmarks/scripts/summary-nightly-results.py   | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index d25a97e47d409..640e0bfdaa1f7 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -56,9 +56,9 @@
                 columns=serving_column_mapping)
 
     serving_md_table_with_headers = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
+                                             headers='keys',
+                                             tablefmt='pipe',
+                                             showindex=False)
     # remove the first line of header
     serving_md_table_lines = serving_md_table_with_headers.split('\n')
     serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])

From fae306ec7d64d146304987a7a53bf27c924a1be2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 18:44:31 -0700
Subject: [PATCH 065/150] test vllm code

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 250 +++++++++---------
 .../scripts/nightly-annotate.sh               |   5 +
 2 files changed, 130 insertions(+), 125 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 0f72c979bbf32..a424f2b99d1c9 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -30,129 +30,129 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - wait
-  - label: "A100 trt benchmark"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 vllm benchmark"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: vllm/vllm-openai:latest
-            command:
-            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 tgi benchmark"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: ghcr.io/huggingface/text-generation-inference:2.0
-            command:
-            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 lmdeploy benchmark"
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: openmmlab/lmdeploy:latest
-            command:
-            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
+  # - wait
+  # - label: "A100 trt benchmark"
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+  #           command:
+  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 vllm benchmark"
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: vllm/vllm-openai:latest
+  #           command:
+  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 tgi benchmark"
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: ghcr.io/huggingface/text-generation-inference:2.0
+  #           command:
+  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 lmdeploy benchmark"
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: openmmlab/lmdeploy:latest
+  #           command:
+  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
   
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 78dc66a273ed7..b9ff5eb146bb9 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -7,6 +7,11 @@ main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
+    cd /workspace
+    ls
+    cd ./vllm
+    ls
+    exit 0
     cd /
     git clone https://github.com/KuntaiDu/vllm.git
     cd vllm

From 5098e1079ceb1929e63b2cf70071b9e959468792 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 20:44:12 -0700
Subject: [PATCH 066/150] check if mounting is successful

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml       | 3 +++
 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 3 +--
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index a424f2b99d1c9..51f95ab6a19d5 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -30,6 +30,9 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
+          - name: nvme-raid
+            hostpath:
+              path: /mnt/fast-disks/nvme-raid
   # - wait
   # - label: "A100 trt benchmark"
   #   agents:
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index b9ff5eb146bb9..50ed0931ef218 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -7,10 +7,9 @@ main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
+    df -h
     cd /workspace
     ls
-    cd ./vllm
-    ls
     exit 0
     cd /
     git clone https://github.com/KuntaiDu/vllm.git

From b0e766712df5ec80cb2c78c11549de3a2e344327 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 21:41:40 -0700
Subject: [PATCH 067/150] add pwd

---
 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 50ed0931ef218..44f9f996ab9c9 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -7,6 +7,7 @@ main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
+    pwd
     df -h
     cd /workspace
     ls

From 4034f5f9a41b7bf0e9e6683fbea5285b5628a766 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 21:51:08 -0700
Subject: [PATCH 068/150] add ls

---
 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 44f9f996ab9c9..3a2704c8c8187 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -8,6 +8,7 @@ main() {
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
     pwd
+    ls
     df -h
     cd /workspace
     ls

From 4ab2ecacaddb5c832d9a77d97fd9d61c793d53a5 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 25 Jun 2024 23:40:56 -0700
Subject: [PATCH 069/150] update to read code from the Docker container instead
 of running wget

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 265 +++++++++---------
 .../nightly-benchmarks/run-nightly-suite.sh   |   7 +-
 .../scripts/nightly-annotate.sh               |  11 +-
 .../scripts/run-lmdeploy-nightly.sh           |   2 +-
 .../scripts/run-tgi-nightly.sh                |   3 +-
 .../scripts/run-trt-nightly.sh                |   6 +-
 .../scripts/run-vllm-nightly.sh               |   3 +-
 7 files changed, 149 insertions(+), 148 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 51f95ab6a19d5..0ba924aa2e272 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -9,16 +9,20 @@ steps:
           containers:
           - image: vllm/vllm-openai:latest
             command:
-            - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh) && (bash nightly-annotate.sh)
+            - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
             resources:
               limits:
                 nvidia.com/gpu: 8
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
+            - name: nvme-raid
+              mountPath: /mnt/fast-disks/nvme-raid
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
             - name: HF_TOKEN
               valueFrom:
                 secretKeyRef:
@@ -33,129 +37,138 @@ steps:
           - name: nvme-raid
             hostpath:
               path: /mnt/fast-disks/nvme-raid
-  # - wait
-  # - label: "A100 trt benchmark"
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-  #           command:
-  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 vllm benchmark"
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: vllm/vllm-openai:latest
-  #           command:
-  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 tgi benchmark"
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: ghcr.io/huggingface/text-generation-inference:2.0
-  #           command:
-  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 lmdeploy benchmark"
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: openmmlab/lmdeploy:latest
-  #           command:
-  #           - (apt update) && (apt install wget) && (wget https://github.com/KuntaiDu/vllm/raw/kuntai-benchmark-dev/.buildkite/nightly-benchmarks/run-nightly-suite.sh) && (bash run-nightly-suite.sh)
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
+              type: directory
+  - wait
+  - label: "A100 trt benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - label: "A100 vllm benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: vllm/vllm-openai:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - label: "A100 tgi benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: ghcr.io/huggingface/text-generation-inference:2.0
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - label: "A100 lmdeploy benchmark"
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: openmmlab/lmdeploy:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
   
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index e608211391b93..a157074287083 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -38,11 +38,8 @@ main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
-    cd /
-    git clone https://github.com/KuntaiDu/vllm.git
-    cd vllm
-    git checkout kuntai-benchmark-dev
-    cd benchmarks
+
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
     wget https://huggingface.co/datasets/anon8231489123/ShareGPT_Vicuna_unfiltered/resolve/main/ShareGPT_V3_unfiltered_cleaned_split.json
     
 
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 3a2704c8c8187..f8168c92d1cbc 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -3,20 +3,13 @@
 set -ex
 set -o pipefail
 
+
 main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
-    pwd
-    ls
+    
     df -h
-    cd /workspace
-    ls
-    exit 0
-    cd /
-    git clone https://github.com/KuntaiDu/vllm.git
-    cd vllm
-    git checkout kuntai-benchmark-dev
 
     if [ ! -f /workspace/buildkite-agent ]; then
         echo "buildkite-agent binary not found. Skip uploading the results."
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 6606af030ee2f..e9c29bbe7de47 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -199,7 +199,7 @@ main() {
 
   check_gpus
   # enter vllm directory
-  cd /vllm/benchmarks
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
 
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index edc5ea8573319..67f88eee653d9 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -187,8 +187,7 @@ main() {
 
   check_gpus
   # enter vllm directory
-  cd /vllm/benchmarks
-
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index f6c8375e0ca76..d3abd53bf01ea 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -178,7 +178,7 @@ run_serving_tests() {
 
 
     # prepare tokenizer
-    cd /vllm/benchmarks
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
     rm -rf /tokenizer_cache
     mkdir /tokenizer_cache
     python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
@@ -202,7 +202,7 @@ run_serving_tests() {
     fi
 
     # go back to vllm benchmarking directory
-    cd /vllm/benchmarks
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
 
     # iterate over different QPS
     for qps in $qps_list; do
@@ -276,7 +276,7 @@ main() {
 
 
   # enter vllm directory
-  cd /vllm/benchmarks
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
 
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 3e79f4b24aa19..774e6f3d5cb2c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -193,8 +193,7 @@ main() {
 
   check_gpus
   # enter vllm directory
-  cd /vllm/benchmarks
-
+  cd $VLLM_SOURCE_CODE_LOC/benchmarks
   declare -g RESULTS_FOLDER=results/
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/

From d6a34d3c500c8957158d40078739ba43b5ba392c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 26 Jun 2024 00:10:52 -0700
Subject: [PATCH 070/150] remove raid

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 0ba924aa2e272..3d4187b5d8ef8 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -16,8 +16,8 @@ steps:
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
-            - name: nvme-raid
-              mountPath: /mnt/fast-disks/nvme-raid
+            # - name: nvme-raid
+            #   mountPath: /mnt/fast-disks/nvme-raid
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
@@ -34,10 +34,10 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-          - name: nvme-raid
-            hostpath:
-              path: /mnt/fast-disks/nvme-raid
-              type: directory
+          # - name: nvme-raid
+          #   hostpath:
+          #     path: /mnt/fast-disks/nvme-raid
+          #     type: directory
   - wait
   - label: "A100 trt benchmark"
     agents:

From c57ac0aab9964c975b6660d11af15b10c2fbe585 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 26 Jun 2024 09:54:35 -0700
Subject: [PATCH 071/150] try Roger's fix

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml  | 12 ++++++------
 .../nightly-benchmarks/scripts/nightly-annotate.sh   |  2 +-
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 3d4187b5d8ef8..241b0dda145b8 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -16,8 +16,8 @@ steps:
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
-            # - name: nvme-raid
-            #   mountPath: /mnt/fast-disks/nvme-raid
+            - name: nvme-raid
+              mountPath: /mnt/fast-disks/nvme-raid
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
@@ -34,10 +34,10 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-          # - name: nvme-raid
-          #   hostpath:
-          #     path: /mnt/fast-disks/nvme-raid
-          #     type: directory
+          - name: nvme-raid
+            hostpath:
+              path: /mnt/fast-disks/nvme-raid
+              type: Directory
   - wait
   - label: "A100 trt benchmark"
     agents:
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index f8168c92d1cbc..cee44c3d6eb92 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -15,7 +15,7 @@ main() {
         echo "buildkite-agent binary not found. Skip uploading the results."
         return 0
     else
-        /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < /vllm/.buildkite/nightly-benchmarks/nightly-descriptions.md
+        /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md
     fi
     
 }

From d75d45b943fe2c576383b026baa271fec6aafc68 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 26 Jun 2024 20:04:06 -0700
Subject: [PATCH 072/150] remove nvme raid

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 241b0dda145b8..37845b2802cf1 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -16,8 +16,8 @@ steps:
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
-            - name: nvme-raid
-              mountPath: /mnt/fast-disks/nvme-raid
+            # - name: nvme-raid
+            #   mountPath: /mnt/fast-disks/nvme-raid
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
@@ -34,10 +34,10 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-          - name: nvme-raid
-            hostpath:
-              path: /mnt/fast-disks/nvme-raid
-              type: Directory
+          # - name: nvme-raid
+          #   hostpath:
+          #     path: /mnt/fast-disks/nvme-raid
+          #     type: Directory
   - wait
   - label: "A100 trt benchmark"
     agents:

From 5dc8c8cccec38f194be19ca43e13ac6c919d4108 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 27 Jun 2024 21:40:58 -0700
Subject: [PATCH 073/150] raise the priority of benchmarking development jobs

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 37845b2802cf1..71edb6d6257ee 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,5 +1,6 @@
 steps:
   - label: "Annotate"
+    priority: 100
     agents:
       queue: A100
     plugins:
@@ -16,8 +17,6 @@ steps:
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
-            # - name: nvme-raid
-            #   mountPath: /mnt/fast-disks/nvme-raid
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
@@ -34,12 +33,9 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-          # - name: nvme-raid
-          #   hostpath:
-          #     path: /mnt/fast-disks/nvme-raid
-          #     type: Directory
   - wait
   - label: "A100 trt benchmark"
+    priority: 100
     agents:
       queue: A100
     plugins:
@@ -73,6 +69,7 @@ steps:
             emptyDir:
               medium: Memory
   - label: "A100 vllm benchmark"
+    priority: 100
     agents:
       queue: A100
     plugins:
@@ -106,6 +103,7 @@ steps:
             emptyDir:
               medium: Memory
   - label: "A100 tgi benchmark"
+    priority: 100
     agents:
       queue: A100
     plugins:
@@ -139,6 +137,7 @@ steps:
             emptyDir:
               medium: Memory
   - label: "A100 lmdeploy benchmark"
+    priority: 100
     agents:
       queue: A100
     plugins:

From 8b9192761005c649ae0ef152e439fbfad69bea61 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 27 Jun 2024 22:57:37 -0700
Subject: [PATCH 074/150] reduce num_prompts per test from 1000 to 500, for
 faster testing

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 573bd03114c64..04f387a6eb4aa 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -7,7 +7,7 @@
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 1000,
+            "num_prompts": 500,
             "port": 8000
         },
         "lmdeploy_server_parameters": {
@@ -45,7 +45,7 @@
             "tp": 2,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 1000,
+            "num_prompts": 500,
             "port": 8000
         },
         "lmdeploy_server_parameters": {
@@ -83,7 +83,7 @@
             "tp": 4,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 1000,
+            "num_prompts": 500,
             "port": 8000
         },
         "lmdeploy_server_parameters": {

From 8539874030e657386a64b2bbf636c39cd9893d88 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 27 Jun 2024 23:15:52 -0700
Subject: [PATCH 075/150] trt won't run all the tests. Just run llama-3 70B. Fix
 this bug tomorrow

---
 .../tests/nightly-tests.json                  | 76 -------------------
 1 file changed, 76 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 04f387a6eb4aa..31dea7a43c632 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,80 +1,4 @@
 [
-    {
-        "test_name": "llama8B_tp1",
-        "qps_list": [4,8],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tp": 1,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000
-        },
-        "lmdeploy_server_parameters": {
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
-            "max_input_len": 10000,
-            "max_output_len": 10000,
-            "trt_llm_version": "r24.04"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": ""
-        },
-        "vllm_client_parameters": {
-        }
-    },
-    {
-        "test_name": "mixtral8x7B_tp2",
-        "qps_list": [2,4],
-        "common_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tp": 2,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000
-        },
-        "lmdeploy_server_parameters": {
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "mixtral",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
-            "max_input_len": 10000,
-            "max_output_len": 10000,
-            "trt_llm_version": "r24.04"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": ""
-        },
-        "vllm_client_parameters": {
-        }
-    },
     {
         "test_name": "llama70B_tp4",
         "qps_list": [2,4],

From 144328b8c0613e71ddd52bb3347a92eaefe52e86 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 29 Jun 2024 21:23:25 -0700
Subject: [PATCH 076/150] debug tensorrt

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 204 +++++++++---------
 .../scripts/run-trt-nightly.sh                |   1 +
 .../tests/nightly-tests.json                  |  46 +++-
 3 files changed, 145 insertions(+), 106 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 71edb6d6257ee..19f1c6a355dbd 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -68,106 +68,106 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - label: "A100 vllm benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: vllm/vllm-openai:latest
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 tgi benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: ghcr.io/huggingface/text-generation-inference:2.0
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: openmmlab/lmdeploy:latest
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
+  # - label: "A100 vllm benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: vllm/vllm-openai:latest
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: ghcr.io/huggingface/text-generation-inference:2.0
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 lmdeploy benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: openmmlab/lmdeploy:latest
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
   
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index d3abd53bf01ea..4324d418de9e0 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 set -o pipefail
+set -ex
 
 check_gpus() {
   # check the number of GPUs and GPU type.
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 31dea7a43c632..c55da0ae70b2f 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,10 +1,48 @@
 [
     {
-        "test_name": "llama70B_tp4",
-        "qps_list": [2,4],
+        "test_name": "llama8B_tp1",
+        "qps_list": [4,8],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tp": 4,
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tp": 1,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 10000,
+            "max_output_len": 10000,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama8B_tp1",
+        "qps_list": [4,8],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,

From 64e951891acb7e4277efdb995f3a073a7cc372b1 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 30 Jun 2024 23:29:21 -0700
Subject: [PATCH 077/150] bug fix: avoid reassigning params during the for loop

---
 .../nightly-benchmarks/scripts/run-trt-nightly.sh  | 14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 4324d418de9e0..7c1149f86aa32 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -53,20 +53,20 @@ wait_for_server() {
 
 run_trt_server() {
 
-  params=$1
+  server_params=$1
   common_params=$2
 
 
 
   model_path=$(echo "$common_params" | jq -r '.model')
   model_name="${model_path#*/}"
-  model_type=$(echo "$params" | jq -r '.model_type')
-  model_dtype=$(echo "$params" | jq -r '.model_dtype')
+  model_type=$(echo "$server_params" | jq -r '.model_type')
+  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
   model_tp_size=$(echo "$common_params" | jq -r '.tp')
-  max_batch_size=$(echo "$params" | jq -r '.max_batch_size')
-  max_input_len=$(echo "$params" | jq -r '.max_input_len')
-  max_output_len=$(echo "$params" | jq -r '.max_output_len')
-  trt_llm_version=$(echo "$params" | jq -r '.trt_llm_version')
+  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+  max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
+  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
 
   cd ~
   rm -rf models

From a94c1403d3998d036b5418c0b89e80dcced259e7 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sun, 30 Jun 2024 23:30:28 -0700
Subject: [PATCH 078/150] bring lmdeploy back for testing

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 68 +++++++++----------
 1 file changed, 34 insertions(+), 34 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 19f1c6a355dbd..7dae9c9c59a95 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -136,38 +136,38 @@ steps:
   #         - name: devshm
   #           emptyDir:
   #             medium: Memory
-  # - label: "A100 lmdeploy benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: openmmlab/lmdeploy:latest
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: openmmlab/lmdeploy:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
   
\ No newline at end of file

From 1f0ccb05ded608be29b170587c741d0339dcdab8 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 00:13:46 -0700
Subject: [PATCH 079/150] change test name

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index c55da0ae70b2f..c8e950570cf42 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -38,7 +38,7 @@
         }
     },
     {
-        "test_name": "llama8B_tp1",
+        "test_name": "llama8B_tp2",
         "qps_list": [4,8],
         "common_parameters": {
             "model": "meta-llama/Meta-Llama-3-8B",

From 2f53b96bd44f405ebec098bc3260fb1ddbfead48 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 01:12:48 -0700
Subject: [PATCH 080/150] separating run server command from the bash file

---
 .../scripts/launch-trt-server.sh              | 82 +++++++++++++++++++
 .../scripts/run-trt-nightly.sh                |  3 +-
 .../tests/nightly-tests.json                  |  2 +-
 3 files changed, 85 insertions(+), 2 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/launch-trt-server.sh

diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
new file mode 100644
index 0000000000000..251ab139c5729
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -0,0 +1,82 @@
+
+
+set -x
+
+server_params=$1
+common_params=$2
+
+
+
+model_path=$(echo "$common_params" | jq -r '.model')
+model_name="${model_path#*/}"
+model_type=$(echo "$server_params" | jq -r '.model_type')
+model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
+model_tp_size=$(echo "$common_params" | jq -r '.tp')
+max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
+max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
+max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
+trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
+
+cd ~
+rm -rf models
+mkdir -p models
+cd models
+models_dir=$(pwd)
+trt_model_path=${models_dir}/${model_name}-trt-ckpt
+trt_engine_path=${models_dir}/${model_name}-trt-engine
+
+cd ~
+rm -rf tensorrt-demo
+git clone https://github.com/neuralmagic/tensorrt-demo.git
+cd tensorrt-demo
+tensorrt_demo_dir=$(pwd)
+
+# make sure the parameter inside tensorrt_demo is consistent to envvar
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
+sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
+
+
+cd /
+rm -rf tensorrtllm_backend
+git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
+git lfs install
+cd tensorrtllm_backend
+git checkout $trt_llm_version
+tensorrtllm_backend_dir=$(pwd)
+git submodule update --init --recursive
+cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
+
+cd /tensorrtllm_backend
+cd ./tensorrt_llm/examples/${model_type}
+
+python3 convert_checkpoint.py \
+--model_dir ${model_path} \
+--dtype ${model_dtype} \
+--tp_size ${model_tp_size} \
+--output_dir ${trt_model_path}
+
+trtllm-build \
+--checkpoint_dir=${trt_model_path} \
+--gpt_attention_plugin=${model_dtype} \
+--gemm_plugin=${model_dtype} \
+--remove_input_padding=enable \
+--paged_kv_cache=enable \
+--tp_size=${model_tp_size} \
+--max_batch_size=${max_batch_size} \
+--max_input_len=${max_input_len} \
+--max_output_len=${max_output_len} \
+--max_num_tokens=${max_output_len} \
+--opt_num_tokens=${max_output_len} \
+--output_dir=${trt_engine_path} 
+
+cd /tensorrtllm_backend/triton_model_repo
+rm -rf ./tensorrt_llm/1/*
+cp -r ${trt_engine_path}/* ./tensorrt_llm/1
+cd /tensorrtllm_backend
+python3 scripts/launch_triton_server.py \
+--world_size=${model_tp_size} \
+--model_repo=/tensorrtllm_backend/triton_model_repo &
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 7c1149f86aa32..58002651bd188 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -189,7 +189,8 @@ run_serving_tests() {
 
     # run the server
     echo "Running test case $test_name"
-    run_trt_server "$server_params" "$common_params"
+    # run_trt_server "$server_params" "$common_params"
+    bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
 
     # wait until the server is alive
     wait_for_server
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index c8e950570cf42..14e1730fd4a90 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -3,7 +3,7 @@
         "test_name": "llama8B_tp1",
         "qps_list": [4,8],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/llama-2-7b-hf",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From 51d679e02876a08aaa713478eaa4b7d6fe0d4990 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 01:21:21 -0700
Subject: [PATCH 081/150] clean up

---
 .../scripts/launch-trt-server.sh              |  2 +-
 .../scripts/run-trt-nightly.sh                | 84 -------------------
 2 files changed, 1 insertion(+), 85 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
index 251ab139c5729..d9108ef3a3168 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -1,4 +1,4 @@
-
+#!/bin/bash
 
 set -x
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 58002651bd188..22363dbc25dcb 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -50,89 +50,6 @@ wait_for_server() {
     done' && return 0 || return 1
 }
 
-
-run_trt_server() {
-
-  server_params=$1
-  common_params=$2
-
-
-
-  model_path=$(echo "$common_params" | jq -r '.model')
-  model_name="${model_path#*/}"
-  model_type=$(echo "$server_params" | jq -r '.model_type')
-  model_dtype=$(echo "$server_params" | jq -r '.model_dtype')
-  model_tp_size=$(echo "$common_params" | jq -r '.tp')
-  max_batch_size=$(echo "$server_params" | jq -r '.max_batch_size')
-  max_input_len=$(echo "$server_params" | jq -r '.max_input_len')
-  max_output_len=$(echo "$server_params" | jq -r '.max_output_len')
-  trt_llm_version=$(echo "$server_params" | jq -r '.trt_llm_version')
-
-  cd ~
-  rm -rf models
-  mkdir -p models
-  cd models
-  models_dir=$(pwd)
-  trt_model_path=${models_dir}/${model_name}-trt-ckpt
-  trt_engine_path=${models_dir}/${model_name}-trt-engine
-
-  cd ~
-  rm -rf tensorrt-demo
-  git clone https://github.com/neuralmagic/tensorrt-demo.git
-  cd tensorrt-demo
-  tensorrt_demo_dir=$(pwd)
-
-  # make sure the parameter inside tensorrt_demo is consistent to envvar
-  sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/postprocessing/config.pbtxt
-  sed -i.bak "/key: \"tokenizer_dir\"/,/string_value:/s|string_value: \".*\"|string_value: \"$model_path\"|" ./triton_model_repo/preprocessing/config.pbtxt
-  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/ensemble/config.pbtxt
-  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/preprocessing/config.pbtxt
-  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/postprocessing/config.pbtxt
-  sed -i.bak "s|\(max_batch_size:\s*\)[0-9]*|\1$max_batch_size|g" ./triton_model_repo/tensorrt_llm_bls/config.pbtxt
-
-
-  cd /
-  rm -rf tensorrtllm_backend
-  git clone https://github.com/triton-inference-server/tensorrtllm_backend.git
-  git lfs install
-  cd tensorrtllm_backend
-  git checkout $trt_llm_version
-  tensorrtllm_backend_dir=$(pwd)
-  git submodule update --init --recursive
-  cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
-
-  cd /tensorrtllm_backend
-  cd ./tensorrt_llm/examples/${model_type}
-
-  python3 convert_checkpoint.py \
-    --model_dir ${model_path} \
-    --dtype ${model_dtype} \
-    --tp_size ${model_tp_size} \
-    --output_dir ${trt_model_path}
-
-  trtllm-build \
-    --checkpoint_dir=${trt_model_path} \
-    --gpt_attention_plugin=${model_dtype} \
-    --gemm_plugin=${model_dtype} \
-    --remove_input_padding=enable \
-    --paged_kv_cache=enable \
-    --tp_size=${model_tp_size} \
-    --max_batch_size=${max_batch_size} \
-    --max_input_len=${max_input_len} \
-    --max_output_len=${max_output_len} \
-    --max_num_tokens=${max_output_len} \
-    --opt_num_tokens=${max_output_len} \
-    --output_dir=${trt_engine_path} 
-
-  cd /tensorrtllm_backend/triton_model_repo
-  rm -rf ./tensorrt_llm/1/*
-  cp -r ${trt_engine_path}/* ./tensorrt_llm/1
-  cd /tensorrtllm_backend
-  python3 scripts/launch_triton_server.py \
-    --world_size=${model_tp_size} \
-    --model_repo=/tensorrtllm_backend/triton_model_repo &
-}
-
 run_serving_tests() {
   # run serving tests using `benchmark_serving.py`
   # $1: a json file specifying serving test cases
@@ -189,7 +106,6 @@ run_serving_tests() {
 
     # run the server
     echo "Running test case $test_name"
-    # run_trt_server "$server_params" "$common_params"
     bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
 
     # wait until the server is alive

From 21c986d5eadff3b540fb8f2a44296e20ba6d799c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 01:24:21 -0700
Subject: [PATCH 082/150] run lmdeploy server in a separate process

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index e9c29bbe7de47..275725e5741c5 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -113,7 +113,7 @@ run_serving_tests() {
     # run the server
     echo "Running test case $test_name"
     echo "Server command: $server_command"
-    eval "$server_command" &
+    bash -c "$server_command" &
 
     # wait until the server is alive
     wait_for_server

From 96bc2490c3b2d1461d7f5c6a1fd252e94c17fb24 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 01:25:19 -0700
Subject: [PATCH 083/150] bring back the full test suite

---
 .../tests/nightly-tests.json                  | 48 +++++++++++++++++--
 1 file changed, 43 insertions(+), 5 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 14e1730fd4a90..04f387a6eb4aa 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -3,7 +3,7 @@
         "test_name": "llama8B_tp1",
         "qps_list": [4,8],
         "common_parameters": {
-            "model": "meta-llama/llama-2-7b-hf",
+            "model": "meta-llama/Meta-Llama-3-8B",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -38,11 +38,49 @@
         }
     },
     {
-        "test_name": "llama8B_tp2",
-        "qps_list": [4,8],
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2,4],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tp": 1,
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "mixtral",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 10000,
+            "max_output_len": 10000,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4",
+        "qps_list": [2,4],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,

From 6c566cbe7a17cb39ad2a9e230f5d6570a6714c5c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 11:15:26 -0700
Subject: [PATCH 084/150] bug fix: need to use llama checkpoint converter for
 mixtral model

---
 .../tests/nightly-tests.json                  | 38 -------------------
 1 file changed, 38 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 04f387a6eb4aa..9627eae547a7d 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -57,44 +57,6 @@
         "tgi_client_parameters": {
             "endpoint": "/generate_stream"
         },
-        "trt_server_parameters": {
-            "model_type": "mixtral",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
-            "max_input_len": 10000,
-            "max_output_len": 10000,
-            "trt_llm_version": "r24.04"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": ""
-        },
-        "vllm_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama70B_tp4",
-        "qps_list": [2,4],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000
-        },
-        "lmdeploy_server_parameters": {
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
         "trt_server_parameters": {
             "model_type": "llama",
             "model_dtype": "float16",

From 162700f100b7340cbff6e6f471cccb2763be0acc Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 22:05:41 -0700
Subject: [PATCH 085/150] reduce test case to only mixtral, debug lmdeploy +
 mixtral

---
 .../tests/nightly-tests.json                  | 38 -------------------
 1 file changed, 38 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 9627eae547a7d..bd881fdb831f9 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,42 +1,4 @@
 [
-    {
-        "test_name": "llama8B_tp1",
-        "qps_list": [4,8],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
-            "tp": 1,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000
-        },
-        "lmdeploy_server_parameters": {
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
-            "max_input_len": 10000,
-            "max_output_len": 10000,
-            "trt_llm_version": "r24.04"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": ""
-        },
-        "vllm_client_parameters": {
-        }
-    },
     {
         "test_name": "mixtral8x7B_tp2",
         "qps_list": [2,4],

From b0d74cdfef1293612cd34a247ddc9c29b37d227e Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 23:13:50 -0700
Subject: [PATCH 086/150] developing fp8 + tensorrt-llm

---
 .../scripts/launch-trt-server.sh              | 34 ++++++++++++++++---
 .../tests/nightly-tests.json                  | 13 ++++---
 2 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
index d9108ef3a3168..2b877601da123 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -53,11 +53,35 @@ cp -r ${tensorrt_demo_dir}/triton_model_repo ${tensorrtllm_backend_dir}/
 cd /tensorrtllm_backend
 cd ./tensorrt_llm/examples/${model_type}
 
-python3 convert_checkpoint.py \
---model_dir ${model_path} \
---dtype ${model_dtype} \
---tp_size ${model_tp_size} \
---output_dir ${trt_model_path}
+
+if echo "$server_params" | jq -e 'has("qformat")' > /dev/null; then
+
+    echo "Key 'qformat' exists in tensorrt server params. Use quantize.py"
+    echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
+    qformat=$(echo "$server_params" | jq -r '.qformat')
+    kv_cache_dtype=$(echo "$server_params" | jq -r '.kv_cache_dtype')
+    calib_size=$(echo "$server_params" | jq -r '.calib_size')
+    python ../quantization/quantize.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path} \
+        --qformat ${qformat} \
+        --kv_cache_dtype ${kv_cache_dtype} \
+        --calib_size ${calib_size} \
+
+else
+
+    echo "Key 'qformat' does not exist in tensorrt server params. Use convert_checkpoint.py"
+    python3 convert_checkpoint.py \
+        --model_dir ${model_path} \
+        --dtype ${model_dtype} \
+        --tp_size ${model_tp_size} \
+        --output_dir ${trt_model_path}
+        
+fi
+
+
 
 trtllm-build \
 --checkpoint_dir=${trt_model_path} \
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index bd881fdb831f9..85abacac29802 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,10 +1,10 @@
 [
     {
-        "test_name": "mixtral8x7B_tp2",
-        "qps_list": [2,4],
+        "test_name": "llama8B_fp8_tp1",
+        "qps_list": [4,8],
         "common_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tp": 2,
+            "model": "meta-llama/Meta-Llama-3-8B",
+            "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,
@@ -25,7 +25,10 @@
             "max_batch_size": 256,
             "max_input_len": 10000,
             "max_output_len": 10000,
-            "trt_llm_version": "r24.04"
+            "trt_llm_version": "r24.04",
+            "qformat": "fp8",
+            "kv_cache_dtype": "fp8",
+            "calib_size": 512
         },
         "trt_client_parameters": {
             "endpoint": "/v2/models/ensemble/generate_stream"

From f1a795557383da7e76680e4268cbe5e47f542ff1 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 23:16:43 -0700
Subject: [PATCH 087/150] move fp8 quantization to common parameters

---
 .../scripts/launch-trt-server.sh                | 17 +++++++----------
 .../nightly-benchmarks/tests/nightly-tests.json |  3 ++-
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
index 2b877601da123..ed6ed1aff722b 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -54,31 +54,28 @@ cd /tensorrtllm_backend
 cd ./tensorrt_llm/examples/${model_type}
 
 
-if echo "$server_params" | jq -e 'has("qformat")' > /dev/null; then
+if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
 
-    echo "Key 'qformat' exists in tensorrt server params. Use quantize.py"
+    echo "Key 'fp8' exists in common params. Use quantize.py instead of convert_checkpoint.py"
     echo "Reference: https://github.com/NVIDIA/TensorRT-LLM/blob/main/examples/llama/README.md"
-    qformat=$(echo "$server_params" | jq -r '.qformat')
-    kv_cache_dtype=$(echo "$server_params" | jq -r '.kv_cache_dtype')
-    calib_size=$(echo "$server_params" | jq -r '.calib_size')
     python ../quantization/quantize.py \
         --model_dir ${model_path} \
         --dtype ${model_dtype} \
         --tp_size ${model_tp_size} \
         --output_dir ${trt_model_path} \
-        --qformat ${qformat} \
-        --kv_cache_dtype ${kv_cache_dtype} \
-        --calib_size ${calib_size} \
+        --qformat fp8 \
+        --kv_cache_dtype fp8 \
+        --calib_size 512
 
 else
 
-    echo "Key 'qformat' does not exist in tensorrt server params. Use convert_checkpoint.py"
+    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
     python3 convert_checkpoint.py \
         --model_dir ${model_path} \
         --dtype ${model_dtype} \
         --tp_size ${model_tp_size} \
         --output_dir ${trt_model_path}
-        
+
 fi
 
 
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 85abacac29802..433772f3b6d2b 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -8,7 +8,8 @@
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,
-            "port": 8000
+            "port": 8000,
+            "fp8": true
         },
         "lmdeploy_server_parameters": {
         },

From 459fb2f01dd0f410deabe3e916d70b23c6242222 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 23:37:42 -0700
Subject: [PATCH 088/150] add fp8 for vllm

---
 .../scripts/run-vllm-nightly.sh               | 25 +++++++++++++------
 1 file changed, 18 insertions(+), 7 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 774e6f3d5cb2c..3560f93005eeb 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -101,13 +101,24 @@ run_serving_tests() {
       continue
     fi
 
-
-    server_command="python3 \
-      -m vllm.entrypoints.openai.api_server \
-      -tp $tp \
-      --model $model \
-      --port $port \
-      $server_args"
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params."
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        --quantization fp8 \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="python3 \
+        -m vllm.entrypoints.openai.api_server \
+        -tp $tp \
+        --model $model \
+        --port $port \
+        $server_args"
+    fi
 
     # run the server
     echo "Running test case $test_name"

From 79b295cd197086efbd955e13c56eaa523552faf8 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 23:46:14 -0700
Subject: [PATCH 089/150] remove unused parameters

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 433772f3b6d2b..cf33c4fe898ec 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -26,10 +26,7 @@
             "max_batch_size": 256,
             "max_input_len": 10000,
             "max_output_len": 10000,
-            "trt_llm_version": "r24.04",
-            "qformat": "fp8",
-            "kv_cache_dtype": "fp8",
-            "calib_size": 512
+            "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
             "endpoint": "/v2/models/ensemble/generate_stream"

From 019802a93e54fc633ab99e75f009186c062dc944 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Mon, 1 Jul 2024 23:56:22 -0700
Subject: [PATCH 090/150] use llama2 for local debugging

---
 .../scripts/run-tgi-nightly.sh                | 23 +++++++++++++++----
 .../tests/nightly-tests.json                  |  2 +-
 2 files changed, 19 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 67f88eee653d9..36a9f434d4740 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -95,12 +95,25 @@ run_serving_tests() {
       continue
     fi
 
+    if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
+      echo "Key 'fp8' exists in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        --quantize fp8 \
+        $server_args"
+    else
+      echo "Key 'fp8' does not exist in common params."
+      server_command="/tgi-entrypoint.sh \
+        --model-id $model \
+        --num-shard $tp \
+        --port $port \
+        $server_args"
+    fi
+
 
-    server_command="/tgi-entrypoint.sh \
-      --model-id $model \
-      --num-shard $tp \
-      --port $port \
-      $server_args"
+    
 
     # run the server
     echo "Running test case $test_name"
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index cf33c4fe898ec..33b88b811ec0b 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -3,7 +3,7 @@
         "test_name": "llama8B_fp8_tp1",
         "qps_list": [4,8],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B",
+            "model": "meta-llama/llama-2-7b-chat-hf",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",

From 3d20f9235e4d90a5b9f3794cfc0a8da2afafb641 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 2 Jul 2024 14:02:28 -0700
Subject: [PATCH 091/150] move kv cache dtype inside vllm

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 33b88b811ec0b..d08dccf5455f2 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -3,7 +3,8 @@
         "test_name": "llama8B_fp8_tp1",
         "qps_list": [4,8],
         "common_parameters": {
-            "model": "meta-llama/llama-2-7b-chat-hf",
+            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
+            "neuralmagic_quantized_model": "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
@@ -33,7 +34,8 @@
         },
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": ""
+            "disable_log_requests": "",
+            "kv_cache_dtype": "fp8"
         },
         "vllm_client_parameters": {
         }

From 44e2d971587bf455c30e1520575efd36366945cd Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 2 Jul 2024 14:11:02 -0700
Subject: [PATCH 092/150] change model

---
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 3560f93005eeb..f309e391d2cbe 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -102,13 +102,13 @@ run_serving_tests() {
     fi
 
     if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
-      echo "Key 'fp8' exists in common params."
+      echo "Key 'fp8' exists in common params. Use neuralmagic fp8 model for convenience."
+      model=$(echo "$common_params" | jq -r '.neuralmagic_quantized_model')
       server_command="python3 \
         -m vllm.entrypoints.openai.api_server \
         -tp $tp \
         --model $model \
         --port $port \
-        --quantization fp8 \
         $server_args"
     else
       echo "Key 'fp8' does not exist in common params."

From b8dbd8ac9e268a01022e38d291ada164fa9740f2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 2 Jul 2024 23:35:36 -0700
Subject: [PATCH 093/150] test fp8 performance

---
 .buildkite/nightly-benchmarks/tests/nightly-tests.json | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index d08dccf5455f2..ce98a7604fae8 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -13,6 +13,7 @@
             "fp8": true
         },
         "lmdeploy_server_parameters": {
+            "quant_policy": 8
         },
         "lmdeploy_client_parameters": {
         },

From 0313c19e8fc6a1ce09ab0f3f45a416716653dbd0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 00:01:03 -0700
Subject: [PATCH 094/150] reduce calib size

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 140 +++++++++---------
 .../scripts/launch-trt-server.sh              |   2 +-
 2 files changed, 71 insertions(+), 71 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 7dae9c9c59a95..7a20d526e06c2 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -34,6 +34,40 @@ steps:
             emptyDir:
               medium: Memory
   - wait
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: openmmlab/lmdeploy:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
   - label: "A100 trt benchmark"
     priority: 100
     agents:
@@ -68,75 +102,7 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  # - label: "A100 vllm benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: vllm/vllm-openai:latest
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: ghcr.io/huggingface/text-generation-inference:2.0
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  - label: "A100 lmdeploy benchmark"
+  - label: "A100 vllm benchmark"
     priority: 100
     agents:
       queue: A100
@@ -145,7 +111,41 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: openmmlab/lmdeploy:latest
+          - image: vllm/vllm-openai:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - label: "A100 tgi benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: ghcr.io/huggingface/text-generation-inference:2.1
             command:
             - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
             resources:
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
index ed6ed1aff722b..3a4c8e704abfb 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -65,7 +65,7 @@ if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
         --output_dir ${trt_model_path} \
         --qformat fp8 \
         --kv_cache_dtype fp8 \
-        --calib_size 512
+        --calib_size 2
 
 else
 

From 7b483a128ceef770f472f3ff136576b8c7fe895a Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 11:45:03 -0700
Subject: [PATCH 095/150] freeze fp16 benchmark

---
 .../tests/nightly-tests.json                  | 92 +++++++++++++++++--
 1 file changed, 82 insertions(+), 10 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index ce98a7604fae8..47d94a39f503a 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,19 +1,16 @@
 [
     {
-        "test_name": "llama8B_fp8_tp1",
+        "test_name": "llama8B_tp1",
         "qps_list": [4,8],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-8B-Instruct",
-            "neuralmagic_quantized_model": "neuralmagic/Meta-Llama-3-8B-Instruct-FP8-KV",
+            "model": "meta-llama/Meta-Llama-3-8B",
             "tp": 1,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,
-            "port": 8000,
-            "fp8": true
+            "port": 8000
         },
         "lmdeploy_server_parameters": {
-            "quant_policy": 8
         },
         "lmdeploy_client_parameters": {
         },
@@ -26,8 +23,8 @@
             "model_type": "llama",
             "model_dtype": "float16",
             "max_batch_size": 256,
-            "max_input_len": 10000,
-            "max_output_len": 10000,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
             "trt_llm_version": "r24.04"
         },
         "trt_client_parameters": {
@@ -35,8 +32,83 @@
         },
         "vllm_server_parameters": {
             "disable_log_stats": "",
-            "disable_log_requests": "",
-            "kv_cache_dtype": "fp8"
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2,4],
+        "common_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4",
+        "qps_list": [2,4],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
         },
         "vllm_client_parameters": {
         }

From c5e6662094ece8adcef11c2a74763551bf9654a0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 15:30:35 -0700
Subject: [PATCH 096/150] add standard deviation for each metric -- to plot
 confidence intervals

---
 .../nightly-benchmarks/nightly-descriptions.md  | 17 +++++++++++++++++
 .../nightly-benchmarks/nightly-pipeline.yaml    |  4 ++--
 .../nightly-benchmarks/run-nightly-suite.sh     |  1 -
 .../scripts/launch-trt-server.sh                |  1 -
 benchmarks/benchmark_serving.py                 |  9 +++++++++
 5 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index edcbeb8db10c4..7d42d11fea1f3 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -6,6 +6,17 @@ The main goal of this benchmarking is two-fold:
 - Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
 
 
+## Versions
+
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
+- vllm/vllm-openai:v0.5.0.post1
+- nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+- openmmlab/lmdeploy:v0.5.0
+- ghcr.io/huggingface/text-generation-inference:2.1
+
+Check `nightly-pipeline.yaml` artifact for more details.
+
+
 ## Workload description
 
 We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
@@ -18,6 +29,12 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
 
+## Known crashes
+
+- TGI v2.1 crashes when running mixtral model [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
+- 
+
+
 ## Results
 
 | Test name             | GPU            |   Successful req. |   Tput (req/s) |   Mean TTFT (ms) |   Median TTFT (ms) |   P99 TTFT (ms) |   Mean ITL (ms) |   Median ITL (ms) |   P99 ITL (ms) | Engine   |
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 7a20d526e06c2..c3dfc800f1fe7 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -8,7 +8,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: vllm/vllm-openai:latest
+          - image: vllm/vllm-openai:v0.5.0.post1
             command:
             - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
             resources:
@@ -43,7 +43,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: openmmlab/lmdeploy:latest
+          - image: openmmlab/lmdeploy:v0.5.0
             command:
             - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
             resources:
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index a157074287083..e50d2ba4b2e7a 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -ex
 set -o pipefail
 
 check_gpus() {
diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
index 3a4c8e704abfb..26d3ca610af81 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -1,6 +1,5 @@
 #!/bin/bash
 
-set -x
 
 server_params=$1
 common_params=$2
diff --git a/benchmarks/benchmark_serving.py b/benchmarks/benchmark_serving.py
index 42867fc40edd2..99b2ac30c3516 100644
--- a/benchmarks/benchmark_serving.py
+++ b/benchmarks/benchmark_serving.py
@@ -60,12 +60,15 @@ class BenchmarkMetrics:
     output_throughput: float
     mean_ttft_ms: float
     median_ttft_ms: float
+    std_ttft_ms: float
     p99_ttft_ms: float
     mean_tpot_ms: float
     median_tpot_ms: float
+    std_tpot_ms: float
     p99_tpot_ms: float
     mean_itl_ms: float
     median_itl_ms: float
+    std_itl_ms: float
     p99_itl_ms: float
 
 
@@ -249,12 +252,15 @@ def calculate_metrics(
         mean_ttft_ms=np.mean(ttfts or 0) *
         1000,  # ttfts is empty if streaming is not supported by backend
         median_ttft_ms=np.median(ttfts or 0) * 1000,
+        std_ttft_ms=np.std(ttfts or 0) * 1000,
         p99_ttft_ms=np.percentile(ttfts or 0, 99) * 1000,
         mean_tpot_ms=np.mean(tpots or 0) * 1000,
         median_tpot_ms=np.median(tpots or 0) * 1000,
+        std_tpot_ms=np.std(tpots or 0) * 1000,
         p99_tpot_ms=np.percentile(tpots or 0, 99) * 1000,
         mean_itl_ms=np.mean(itls or 0) * 1000,
         median_itl_ms=np.median(itls or 0) * 1000,
+        std_itl_ms=np.std(itls or 0) * 1000,
         p99_itl_ms=np.percentile(itls or 0, 99) * 1000,
     )
 
@@ -371,12 +377,15 @@ async def benchmark(
         "output_throughput": metrics.output_throughput,
         "mean_ttft_ms": metrics.mean_ttft_ms,
         "median_ttft_ms": metrics.median_ttft_ms,
+        "std_ttft_ms": metrics.std_ttft_ms,
         "p99_ttft_ms": metrics.p99_ttft_ms,
         "mean_tpot_ms": metrics.mean_tpot_ms,
         "median_tpot_ms": metrics.median_tpot_ms,
+        "std_tpot_ms": metrics.std_tpot_ms,
         "p99_tpot_ms": metrics.p99_tpot_ms,
         "mean_itl_ms": metrics.mean_itl_ms,
         "median_itl_ms": metrics.median_itl_ms,
+        "std_itl_ms": metrics.std_itl_ms,
         "p99_itl_ms": metrics.p99_itl_ms,
         "input_lens": [output.prompt_len for output in outputs],
         "output_lens": actual_output_lens,

From 22e78b5a7c8d21171b9ea1ea6e574ce0bb43afce Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 16:06:18 -0700
Subject: [PATCH 097/150] remove annotation inside the job --- run the
 annotation as the last step.

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 275725e5741c5..c23438679578b 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -190,7 +190,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 36a9f434d4740..b805c52d3fa8c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -192,7 +192,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 22363dbc25dcb..ae6f4316eb4c2 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -183,7 +183,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index f309e391d2cbe..1e6d2893983bf 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -196,7 +196,7 @@ upload_to_buildkite() {
     echo "buildkite-agent binary not found. Skip uploading the results."
     return 0
   fi
-  /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
+  # /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < $RESULTS_FOLDER/${CURRENT_LLM_SERVING_ENGINE}_nightly_results.md
   /workspace/buildkite-agent artifact upload "$RESULTS_FOLDER/*"
 }
 

From 59072ed19d3a0b7456b54f9f6f4c5a335c0d98fa Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 16:07:14 -0700
Subject: [PATCH 098/150] reduce nightly pipeline length

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 254 ++++++++++--------
 1 file changed, 144 insertions(+), 110 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index c3dfc800f1fe7..ad1bd25f3b230 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,108 +1,142 @@
 steps:
-  - label: "Annotate"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: vllm/vllm-openai:v0.5.0.post1
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - wait
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: openmmlab/lmdeploy:v0.5.0
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 trt benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 vllm benchmark"
+  # - label: "Annotate"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: vllm/vllm-openai:v0.5.0.post1
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - wait
+  # - label: "A100 lmdeploy benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: openmmlab/lmdeploy:v0.5.0
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 trt benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 vllm benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: vllm/vllm-openai:latest
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  - label: "A100 tgi benchmark"
     priority: 100
     agents:
       queue: A100
@@ -111,7 +145,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: vllm/vllm-openai:latest
+          - image: ghcr.io/huggingface/text-generation-inference:2.1
             command:
             - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
             resources:
@@ -136,7 +170,8 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - label: "A100 tgi benchmark"
+  - wait
+  - label: "Plot"
     priority: 100
     agents:
       queue: A100
@@ -145,9 +180,9 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: ghcr.io/huggingface/text-generation-inference:2.1
+          - image: vllm/vllm-openai:v0.5.0.post1
             command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
             resources:
               limits:
                 nvidia.com/gpu: 8
@@ -169,5 +204,4 @@ steps:
           volumes:
           - name: devshm
             emptyDir:
-              medium: Memory
-  
\ No newline at end of file
+              medium: Memory 
\ No newline at end of file

From a3e4355c24d5c75ff4a18056d0dc91451e8efcf9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 16:15:46 -0700
Subject: [PATCH 099/150] remove headers in result

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md | 2 --
 1 file changed, 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 7d42d11fea1f3..b0ae36953d7d6 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -37,5 +37,3 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 
 ## Results
 
-| Test name             | GPU            |   Successful req. |   Tput (req/s) |   Mean TTFT (ms) |   Median TTFT (ms) |   P99 TTFT (ms) |   Mean ITL (ms) |   Median ITL (ms) |   P99 ITL (ms) | Engine   |
-|:----------------------|:---------------|------------------:|---------------:|-----------------:|-------------------:|----------------:|----------------:|------------------:|---------------:|:---------|

From e27677ae4d88d4ebd421695820ebb58ffadf7eb9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 22:50:07 -0700
Subject: [PATCH 100/150] add visualization step

---
 .../nightly-descriptions.md                   |  9 +-
 .../scripts/nightly-annotate.sh               | 20 ++++-
 .../scripts/plot-nightly-results.py           | 83 +++++++++++++++++++
 .../scripts/summary-nightly-results.py        | 15 ++--
 .../tests/nightly-tests.json                  | 76 -----------------
 5 files changed, 113 insertions(+), 90 deletions(-)
 create mode 100644 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index b0ae36953d7d6..3f792d788c273 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -32,8 +32,15 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 ## Known crashes
 
 - TGI v2.1 crashes when running mixtral model [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
-- 
 
 
 ## Results
 
+
+
+
+ITL:
+
+
+Comparison table:
+{nightly_results_benchmarking_table}
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index cee44c3d6eb92..83ad79674e7df 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -13,10 +13,24 @@ main() {
 
     if [ ! -f /workspace/buildkite-agent ]; then
         echo "buildkite-agent binary not found. Skip uploading the results."
-        return 0
-    else
-        /workspace/buildkite-agent annotate --style "info" --context "benchmark-results" < $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md
+        exit 0
     fi
+
+    # initial annotation
+    description="$VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-descriptions.md"
+
+    # download results
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    mkdir -p results/
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results
+
+    # generate figures
+    python3 -m pip install tabulate pandas
+    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py \
+        --results-folder results \
+        --description $description
+
+    
     
 }
 
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
new file mode 100644
index 0000000000000..6fd2bcf631a7a
--- /dev/null
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -0,0 +1,83 @@
+
+import json
+import os
+from pathlib import Path
+import argparse
+import matplotlib.pyplot as plt
+
+import pandas as pd
+from tabulate import tabulate
+
+def parse_arguments():
+    parser = argparse.ArgumentParser(description='Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder', type=str, required=True, help='The folder where the results are stored.')
+    parser.add_argument('--description', type=str, required=True, help='Description of the results.')
+    
+    args = parser.parse_args()
+    return args
+
+    
+def main(args):
+    results_folder = Path(args.results_folder)
+    
+    results = []
+
+    # collect results
+    for test_file in results_folder.glob("*.json"):
+        with open(test_file, "r") as f:
+            results = results + json.loads(f.read())
+            
+            
+    # generate markdown table            
+    df = pd.DataFrame.from_dict(results)
+
+    md_table = tabulate(df,
+                        headers='keys',
+                        tablefmt='pipe',
+                        showindex=False)
+                        
+    with open(args.description, "r") as f:
+        description = f.read()
+        
+    description = description.format(
+        nightly_results_benchmarking_table=md_table
+    )
+    
+    with open("nightly_results.md", "w") as f:
+        f.write(description)
+        
+        
+    # plot results
+    fig, axes = plt.subplots((3, 2), figsize=(16, 18))
+    for i, model in enumerate(["llama8b", "llama70b", "mixtral8x7b"]):
+        for j, metric in enumerate(["TTFT", "ITL"]):
+            means, stds = [], []
+            for method in ["vllm", "trt", "lmdeploy", "tgi"]:
+                target = df['Test name'].str.contains(model)
+                target = target & df['Test name'].str.contains(method)
+                filtered_df = df[target]
+                
+                if filtered_df.empty:
+                    means.append(0.)
+                    stds.append(0.)
+                else:
+                    means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
+                    stds.append(filtered_df[f"Std {metric} (ms)"].values[0])
+                    
+            ax = axes[i, j]
+            
+            ax.errorbar(
+                ["vllm", "trt", "lmdeploy", "tgi"], 
+                means, 
+                yerr=stds,
+                fmt='o', capsize=5)
+            
+            ax.set_xlabel("Method")
+            ax.set_ylabel(f"{metric} (ms)")
+            ax.set_title(f"{model} {metric} comparison")
+    
+    fig.savefig("nightly_results.jpg", bbox_inches='tight')
+
+if __name__ == '__main__':
+    args = parse_arguments()
+    main(args)
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 640e0bfdaa1f7..d9fc46cb45c92 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -14,17 +14,10 @@
     "gpu_type": "GPU",
     "completed": "Successful req.",
     "request_throughput": "Tput (req/s)",
-    # "input_throughput": "Input Tput (tok/s)",
-    # "output_throughput": "Output Tput (tok/s)",
     "mean_ttft_ms": "Mean TTFT (ms)",
-    "median_ttft_ms": "Median TTFT (ms)",
-    "p99_ttft_ms": "P99 TTFT (ms)",
-    # "mean_tpot_ms": "Mean TPOT (ms)",
-    # "median_tpot_ms": "Median",
-    # "p99_tpot_ms": "P99",
+    "std_ttft_ms": "Std TTFT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
-    "median_itl_ms": "Median ITL (ms)",
-    "p99_itl_ms": "P99 ITL (ms)",
+    "std_itl_ms": "Std ITL (ms)",
     "engine": "Engine",
 }
 
@@ -67,7 +60,9 @@
 
     # document benchmarking results in markdown
     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:
-        f.write(serving_md_table_without_header)
+        # document results with header.
+        # for those who wants to reproduce our benchmark.
+        f.write(serving_md_table_with_headers)
         f.write('\n')
 
     # document benchmarking results in json
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 47d94a39f503a..57d462393eeaf 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -36,81 +36,5 @@
         },
         "vllm_client_parameters": {
         }
-    },
-    {
-        "test_name": "mixtral8x7B_tp2",
-        "qps_list": [2,4],
-        "common_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tp": 2,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000
-        },
-        "lmdeploy_server_parameters": {
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
-            "max_input_len": 4096,
-            "max_output_len": 4096,
-            "trt_llm_version": "r24.04"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": ""
-        },
-        "vllm_client_parameters": {
-        }
-    },
-    {
-        "test_name": "llama70B_tp4",
-        "qps_list": [2,4],
-        "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tp": 4,
-            "dataset_name": "sharegpt",
-            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
-            "num_prompts": 500,
-            "port": 8000
-        },
-        "lmdeploy_server_parameters": {
-        },
-        "lmdeploy_client_parameters": {
-        },
-        "tgi_server_parameters": {
-        },
-        "tgi_client_parameters": {
-            "endpoint": "/generate_stream"
-        },
-        "trt_server_parameters": {
-            "model_type": "llama",
-            "model_dtype": "float16",
-            "max_batch_size": 256,
-            "max_input_len": 4096,
-            "max_output_len": 4096,
-            "trt_llm_version": "r24.04"
-        },
-        "trt_client_parameters": {
-            "endpoint": "/v2/models/ensemble/generate_stream"
-        },
-        "vllm_server_parameters": {
-            "disable_log_stats": "",
-            "disable_log_requests": ""
-        },
-        "vllm_client_parameters": {
-        }
     }
 ]
\ No newline at end of file

From 7c845ae8a5b07c08e0bbdbe14aac58561b6cf1a8 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 23:31:45 -0700
Subject: [PATCH 101/150] support figure visualization

---
 .../nightly-descriptions.md                   |  7 ++----
 .../scripts/nightly-annotate.sh               | 21 +++++++++---------
 .../scripts/plot-nightly-results.py           | 22 ++++++++++++++-----
 .../results/trt_llama8B_tp1_qps_8.commands    |  6 +++++
 benchmarks/results/trt_nightly_results.md     |  1 +
 5 files changed, 37 insertions(+), 20 deletions(-)
 create mode 100644 benchmarks/results/trt_llama8B_tp1_qps_8.commands
 create mode 100644 benchmarks/results/trt_nightly_results.md

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 3f792d788c273..4445ecee72697 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -24,7 +24,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Input length: randomly sample 1000 prompts from ShareGPT dataset (with fixed random seed).
 - Output length: the corresponding output length of these 1000 prompts.
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
-- Average QPS (query per second): 4, 8 for 8B model and 1, 4 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Average QPS (query per second): 4 for 8B model and 2 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
 - Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
 
@@ -36,11 +36,8 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 
 ## Results
 
+![[Overall benchmarking results]([artifacts](artifact://nightly_results.png))](artifact://indy.png)
 
 
 
-ITL:
-
-
-Comparison table:
 {nightly_results_benchmarking_table}
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 83ad79674e7df..19f789702703f 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -8,11 +8,9 @@ main() {
 
     (which wget && which curl) || (apt-get update && apt-get install -y wget curl)
     (which jq) || (apt-get update && apt-get -y install jq)
-    
-    df -h
 
     if [ ! -f /workspace/buildkite-agent ]; then
-        echo "buildkite-agent binary not found. Skip uploading the results."
+        echo "buildkite-agent binary not found. Skip plotting the results."
         exit 0
     fi
 
@@ -22,16 +20,19 @@ main() {
     # download results
     cd $VLLM_SOURCE_CODE_LOC/benchmarks
     mkdir -p results/
-    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results
+    /workspace/buildkite-agent artifact download 'results/*nightly_results.json' results/
+    ls
+    ls results/
 
     # generate figures
-    python3 -m pip install tabulate pandas
-    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py \
-        --results-folder results \
-        --description $description
-
-    
+    python3 -m pip install tabulate pandas matplotlib
+    python3 $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py \
+        --description $description \
+        --results-folder results/
     
+    # upload results and figures
+    /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md
+    /workspace/buildkite-agent artifact upload "nightly_results.png"
 }
 
 main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 6fd2bcf631a7a..f6ad25d8ca981 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -23,7 +23,7 @@ def main(args):
     results = []
 
     # collect results
-    for test_file in results_folder.glob("*.json"):
+    for test_file in results_folder.glob("*_nightly_results.json"):
         with open(test_file, "r") as f:
             results = results + json.loads(f.read())
             
@@ -46,13 +46,16 @@ def main(args):
     with open("nightly_results.md", "w") as f:
         f.write(description)
         
+
+    plt.rcParams.update({'font.size': 20})
         
     # plot results
-    fig, axes = plt.subplots((3, 2), figsize=(16, 18))
-    for i, model in enumerate(["llama8b", "llama70b", "mixtral8x7b"]):
+    fig, axes = plt.subplots(3, 2, figsize=(16, 18))
+    methods = ["vllm", "trt", "lmdeploy", "tgi"]
+    for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
             means, stds = [], []
-            for method in ["vllm", "trt", "lmdeploy", "tgi"]:
+            for method in methods:
                 target = df['Test name'].str.contains(model)
                 target = target & df['Test name'].str.contains(method)
                 filtered_df = df[target]
@@ -71,12 +74,21 @@ def main(args):
                 means, 
                 yerr=stds,
                 fmt='o', capsize=5)
+            ax.set_ylim(bottom=0)
+
+            for i, (method, mean, std) in enumerate(zip(method, means, stds)):
+                ax.text(
+                    i - 0.2, mean,  # Adjust position above the error bar
+                    f'{mean:.0f}', 
+                    ha='center', 
+                    va='bottom'
+                )
             
             ax.set_xlabel("Method")
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
     
-    fig.savefig("nightly_results.jpg", bbox_inches='tight')
+    fig.savefig("nightly_results.png", bbox_inches='tight')
 
 if __name__ == '__main__':
     args = parse_arguments()
diff --git a/benchmarks/results/trt_llama8B_tp1_qps_8.commands b/benchmarks/results/trt_llama8B_tp1_qps_8.commands
new file mode 100644
index 0000000000000..e0312b4e22dc2
--- /dev/null
+++ b/benchmarks/results/trt_llama8B_tp1_qps_8.commands
@@ -0,0 +1,6 @@
+{
+  "server_command": "",
+  "client_command": "python3 benchmark_serving.py         --backend tensorrt-llm         --tokenizer /tokenizer_cache         --model meta-llama/Meta-Llama-3-8B         --dataset-name sharegpt         --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json         --num-prompts 200         --port 8000         --save-result         --result-dir results/         --result-filename trt_llama8B_tp1_qps_8.json         --request-rate 8         --endpoint /v2/models/ensemble/generate_stream",
+  "gpu_type": "A100-SXM4-80GB",
+  "engine": "trt"
+}
diff --git a/benchmarks/results/trt_nightly_results.md b/benchmarks/results/trt_nightly_results.md
new file mode 100644
index 0000000000000..3befc74903d68
--- /dev/null
+++ b/benchmarks/results/trt_nightly_results.md
@@ -0,0 +1 @@
+| trt_llama8B_tp1_qps_8 | A100-SXM4-80GB |               200 |        6.49609 |          60.3214 |            55.7321 |         119.186 |         15.6021 |            14.187 |        56.2925 | trt      |

From 3a70a60aa9245652d0086d99d70118689260f8fb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 3 Jul 2024 23:52:56 -0700
Subject: [PATCH 102/150] adjust visualization

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md      | 6 ++++--
 .../nightly-benchmarks/scripts/plot-nightly-results.py     | 7 +++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 4445ecee72697..9bb965b2450a8 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -36,8 +36,10 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 
 ## Results
 
-![[Overall benchmarking results]([artifacts](artifact://nightly_results.png))](artifact://indy.png)
+{nightly_results_benchmarking_table}
 
+## Plots
 
+In the following plots, the error bar shows the standard error of the mean.
 
-{nightly_results_benchmarking_table}
\ No newline at end of file
+<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index f6ad25d8ca981..41089aae90405 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -4,6 +4,7 @@
 from pathlib import Path
 import argparse
 import matplotlib.pyplot as plt
+import math
 
 import pandas as pd
 from tabulate import tabulate
@@ -65,7 +66,9 @@ def main(args):
                     stds.append(0.)
                 else:
                     means.append(filtered_df[f"Mean {metric} (ms)"].values[0])
-                    stds.append(filtered_df[f"Std {metric} (ms)"].values[0])
+                    std = filtered_df[f"Std {metric} (ms)"].values[0]
+                    success = filtered_df["Successful req."].values[0]
+                    stds.append(std / math.sqrt(success))
                     
             ax = axes[i, j]
             
@@ -88,7 +91,7 @@ def main(args):
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
     
-    fig.savefig("nightly_results.png", bbox_inches='tight')
+    fig.savefig("nightly_results.png")
 
 if __name__ == '__main__':
     args = parse_arguments()

From 8260d3889d0b58cf70f5e22444794df9ce325dc9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:04:59 -0700
Subject: [PATCH 103/150] visual adjustment

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md    | 8 +++++---
 .../nightly-benchmarks/scripts/nightly-annotate.sh       | 4 +++-
 .../nightly-benchmarks/scripts/plot-nightly-results.py   | 9 +++++----
 .buildkite/nightly-benchmarks/tests/nightly-tests.json   | 2 +-
 4 files changed, 14 insertions(+), 9 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 9bb965b2450a8..98a593d455639 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -14,7 +14,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima
 - openmmlab/lmdeploy:v0.5.0
 - ghcr.io/huggingface/text-generation-inference:2.1
 
-Check `nightly-pipeline.yaml` artifact for more details.
+Check [nightly-pipeline.yaml](artifact://nightly-pipeline.yaml) artifact for more details.
 
 
 ## Workload description
@@ -26,12 +26,14 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Batch size: dynamically determined by vllm and the arrival pattern of the requests.
 - Average QPS (query per second): 4 for 8B model and 2 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
 - Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Evaluation metrics: throughput, TTFT (time to the first token, with mean, median and p99), ITL (inter-token latency, with mean, median and p99).
+- Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std).
+
+Check [nightly-tests.json](artifact://nightly-tests.json) artifact for more details.
 
 
 ## Known crashes
 
-- TGI v2.1 crashes when running mixtral model [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
+- TGI v2.1 crashes when running mixtral model, see [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
 
 
 ## Results
diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 19f789702703f..99f1548039ab6 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -31,8 +31,10 @@ main() {
         --results-folder results/
     
     # upload results and figures
-    /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md
     /workspace/buildkite-agent artifact upload "nightly_results.png"
+    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+    /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+    /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md
 }
 
 main "$@"
\ No newline at end of file
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 41089aae90405..962099a6b4c5f 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -51,14 +51,14 @@ def main(args):
     plt.rcParams.update({'font.size': 20})
         
     # plot results
-    fig, axes = plt.subplots(3, 2, figsize=(16, 18))
+    fig, axes = plt.subplots(3, 2, figsize=(14, 16))
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
             means, stds = [], []
             for method in methods:
                 target = df['Test name'].str.contains(model)
-                target = target & df['Test name'].str.contains(method)
+                target = target & df['Engine'].str.contains(method)
                 filtered_df = df[target]
                 
                 if filtered_df.empty:
@@ -70,6 +70,8 @@ def main(args):
                     success = filtered_df["Successful req."].values[0]
                     stds.append(std / math.sqrt(success))
                     
+            print(means, stds)
+                    
             ax = axes[i, j]
             
             ax.errorbar(
@@ -87,11 +89,10 @@ def main(args):
                     va='bottom'
                 )
             
-            ax.set_xlabel("Method")
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
     
-    fig.savefig("nightly_results.png")
+    fig.savefig("nightly_results.png", bbox_inches='tight')
 
 if __name__ == '__main__':
     args = parse_arguments()
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 57d462393eeaf..0f2ac3be3df81 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -1,7 +1,7 @@
 [
     {
         "test_name": "llama8B_tp1",
-        "qps_list": [4,8],
+        "qps_list": [4],
         "common_parameters": {
             "model": "meta-llama/Meta-Llama-3-8B",
             "tp": 1,

From 464374916df8a477189fb936ba04e5510f0ce50e Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:06:27 -0700
Subject: [PATCH 104/150] remove text annotation

---
 .../scripts/plot-nightly-results.py            | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 962099a6b4c5f..6ef5c3a2c0ae4 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -48,10 +48,10 @@ def main(args):
         f.write(description)
         
 
-    plt.rcParams.update({'font.size': 20})
+    plt.rcParams.update({'font.size': 15})
         
     # plot results
-    fig, axes = plt.subplots(3, 2, figsize=(14, 16))
+    fig, axes = plt.subplots(3, 2, figsize=(10, 12))
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
@@ -81,13 +81,13 @@ def main(args):
                 fmt='o', capsize=5)
             ax.set_ylim(bottom=0)
 
-            for i, (method, mean, std) in enumerate(zip(method, means, stds)):
-                ax.text(
-                    i - 0.2, mean,  # Adjust position above the error bar
-                    f'{mean:.0f}', 
-                    ha='center', 
-                    va='bottom'
-                )
+            # for i, (method, mean, std) in enumerate(zip(method, means, stds)):
+            #     ax.text(
+            #         i - 0.2, mean,  # Adjust position above the error bar
+            #         f'{mean:.0f}', 
+            #         ha='center', 
+            #         va='bottom'
+            #     )
             
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")

From 3146a96df465e878af724fb92d17d461b9f2953f Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:16:21 -0700
Subject: [PATCH 105/150] add padding

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 6ef5c3a2c0ae4..16f37be40f3c6 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -91,7 +91,8 @@ def main(args):
             
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
-    
+
+    fig.tight_layout(pad=0.1)
     fig.savefig("nightly_results.png", bbox_inches='tight')
 
 if __name__ == '__main__':

From 6da59d16294dc9a232bab5b03178c66d37e21eee Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:18:54 -0700
Subject: [PATCH 106/150] add hyperlink

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 98a593d455639..8ad73491202a0 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -14,7 +14,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima
 - openmmlab/lmdeploy:v0.5.0
 - ghcr.io/huggingface/text-generation-inference:2.1
 
-Check [nightly-pipeline.yaml](artifact://nightly-pipeline.yaml) artifact for more details.
+Check <a href="artifact://nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details.
 
 
 ## Workload description
@@ -28,7 +28,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std).
 
-Check [nightly-tests.json](artifact://nightly-tests.json) artifact for more details.
+Check <a href="artifact://nightly-tests.json">nightly-tests.json</a> artifact for more details.
 
 
 ## Known crashes

From 0802f9fdb6fc375e0305895d47c55cf3759fdce2 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:37:15 -0700
Subject: [PATCH 107/150] bring back the full suite of tests

---
 .../nightly-descriptions.md                   |  4 +-
 .../tests/nightly-tests.json                  | 76 +++++++++++++++++++
 2 files changed, 78 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 8ad73491202a0..fc9431d33a5a5 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -14,7 +14,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima
 - openmmlab/lmdeploy:v0.5.0
 - ghcr.io/huggingface/text-generation-inference:2.1
 
-Check <a href="artifact://nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details.
+Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details.
 
 
 ## Workload description
@@ -28,7 +28,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std).
 
-Check <a href="artifact://nightly-tests.json">nightly-tests.json</a> artifact for more details.
+Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details.
 
 
 ## Known crashes
diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 0f2ac3be3df81..89ef0b14e11b2 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -36,5 +36,81 @@
         },
         "vllm_client_parameters": {
         }
+    },
+    {
+        "test_name": "mixtral8x7B_tp2",
+        "qps_list": [2],
+        "common_parameters": {
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
+    },
+    {
+        "test_name": "llama70B_tp4",
+        "qps_list": [2],
+        "common_parameters": {
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
+            "dataset_name": "sharegpt",
+            "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
+            "num_prompts": 500,
+            "port": 8000
+        },
+        "lmdeploy_server_parameters": {
+        },
+        "lmdeploy_client_parameters": {
+        },
+        "tgi_server_parameters": {
+        },
+        "tgi_client_parameters": {
+            "endpoint": "/generate_stream"
+        },
+        "trt_server_parameters": {
+            "model_type": "llama",
+            "model_dtype": "float16",
+            "max_batch_size": 256,
+            "max_input_len": 4096,
+            "max_output_len": 4096,
+            "trt_llm_version": "r24.04"
+        },
+        "trt_client_parameters": {
+            "endpoint": "/v2/models/ensemble/generate_stream"
+        },
+        "vllm_server_parameters": {
+            "disable_log_stats": "",
+            "disable_log_requests": ""
+        },
+        "vllm_client_parameters": {
+        }
     }
 ]
\ No newline at end of file

From 8e6fca22b2c2c40bbe9d06ce41b218b580777348 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:37:55 -0700
Subject: [PATCH 108/150] adjust test order

---
 .../nightly-benchmarks/tests/nightly-tests.json      | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/tests/nightly-tests.json b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
index 89ef0b14e11b2..f250833c62710 100644
--- a/.buildkite/nightly-benchmarks/tests/nightly-tests.json
+++ b/.buildkite/nightly-benchmarks/tests/nightly-tests.json
@@ -38,11 +38,11 @@
         }
     },
     {
-        "test_name": "mixtral8x7B_tp2",
+        "test_name": "llama70B_tp4",
         "qps_list": [2],
         "common_parameters": {
-            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
-            "tp": 2,
+            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
+            "tp": 4,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,
@@ -76,11 +76,11 @@
         }
     },
     {
-        "test_name": "llama70B_tp4",
+        "test_name": "mixtral8x7B_tp2",
         "qps_list": [2],
         "common_parameters": {
-            "model": "meta-llama/Meta-Llama-3-70B-Instruct",
-            "tp": 4,
+            "model": "mistralai/Mixtral-8x7B-Instruct-v0.1",
+            "tp": 2,
             "dataset_name": "sharegpt",
             "dataset_path": "./ShareGPT_V3_unfiltered_cleaned_split.json",
             "num_prompts": 500,

From a77fcbdf87f7fb3bfdb4b6c636ecc75bf21ae65d Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:38:56 -0700
Subject: [PATCH 109/150] bring back full benchmark suite

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 239 ++++++++----------
 1 file changed, 102 insertions(+), 137 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index ad1bd25f3b230..c9f7740fef968 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,141 +1,106 @@
 steps:
-  # - label: "Annotate"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: vllm/vllm-openai:v0.5.0.post1
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - wait
-  # - label: "A100 lmdeploy benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: openmmlab/lmdeploy:v0.5.0
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 trt benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 vllm benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: vllm/vllm-openai:latest
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: openmmlab/lmdeploy:v0.5.0
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - label: "A100 trt benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+  - label: "A100 vllm benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: vllm/vllm-openai:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
   - label: "A100 tgi benchmark"
     priority: 100
     agents:

From 8b51f458e2cf25f57cf8d415378b155a61e8a3bd Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 00:39:38 -0700
Subject: [PATCH 110/150] add more pad

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 16f37be40f3c6..b40de8fd07481 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -92,7 +92,7 @@ def main(args):
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
 
-    fig.tight_layout(pad=0.1)
+    fig.tight_layout(pad=0.3)
     fig.savefig("nightly_results.png", bbox_inches='tight')
 
 if __name__ == '__main__':

From 4427b06d1f6cb87010cbb2436834fee2b1ec6bb8 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 10:58:31 -0700
Subject: [PATCH 111/150] mount huggingface cache

---
 .../nightly-descriptions.md                   |   2 +-
 .../nightly-benchmarks/nightly-pipeline.yaml  | 212 +++++++++---------
 .../nightly-benchmarks/run-nightly-suite.sh   |   1 +
 3 files changed, 112 insertions(+), 103 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index fc9431d33a5a5..5699c938eae42 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -33,7 +33,7 @@ Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.
 
 ## Known crashes
 
-- TGI v2.1 crashes when running mixtral model, see [TGI PR #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
+- TGI v2.1 crashes when running mixtral model, see [TGI Issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
 
 
 ## Results
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index c9f7740fef968..875b40ffb1aec 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,38 +1,38 @@
 steps:
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: openmmlab/lmdeploy:v0.5.0
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
+  # - label: "A100 lmdeploy benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: openmmlab/lmdeploy:v0.5.0
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
   - label: "A100 trt benchmark"
     priority: 100
     agents:
@@ -51,9 +51,13 @@ steps:
             volumeMounts:
             - name: devshm
               mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
             - name: VLLM_SOURCE_CODE_LOC
               value: /workspace/build/buildkite/vllm/performance-benchmark
             - name: HF_TOKEN
@@ -67,74 +71,78 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-  - label: "A100 vllm benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: vllm/vllm-openai:latest
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - label: "A100 tgi benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: ghcr.io/huggingface/text-generation-inference:2.1
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
+  # - label: "A100 vllm benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: vllm/vllm-openai:latest
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: ghcr.io/huggingface/text-generation-inference:2.1
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
   - wait
   - label: "Plot"
     priority: 100
diff --git a/.buildkite/nightly-benchmarks/run-nightly-suite.sh b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
index e50d2ba4b2e7a..627a3e6971578 100644
--- a/.buildkite/nightly-benchmarks/run-nightly-suite.sh
+++ b/.buildkite/nightly-benchmarks/run-nightly-suite.sh
@@ -1,6 +1,7 @@
 #!/bin/bash
 
 set -o pipefail
+set -x
 
 check_gpus() {
     # check the number of GPUs and GPU type.

From 329efe620ff1d93562746890fecb8fff4870147f Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 11:33:03 -0700
Subject: [PATCH 112/150] mount huggingface cache

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 230 ++++++++++--------
 1 file changed, 127 insertions(+), 103 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 875b40ffb1aec..5616ce6af028d 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,38 +1,4 @@
-steps:
-  # - label: "A100 lmdeploy benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: openmmlab/lmdeploy:v0.5.0
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
+steps: 
   - label: "A100 trt benchmark"
     priority: 100
     agents:
@@ -75,74 +41,132 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  # - label: "A100 vllm benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: vllm/vllm-openai:latest
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: ghcr.io/huggingface/text-generation-inference:2.1
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: openmmlab/lmdeploy:v0.5.0
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
+  - label: "A100 vllm benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: vllm/vllm-openai:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
+  - label: "A100 tgi benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: ghcr.io/huggingface/text-generation-inference:2.1
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory 
   - wait
   - label: "Plot"
     priority: 100

From a174d268ca405d15a090aa20a73662ebd518ca2c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 11:34:16 -0700
Subject: [PATCH 113/150] add even more padding

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index b40de8fd07481..71ad313372467 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -92,7 +92,7 @@ def main(args):
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
 
-    fig.tight_layout(pad=0.3)
+    fig.tight_layout(pad=0.6)
     fig.savefig("nightly_results.png", bbox_inches='tight')
 
 if __name__ == '__main__':

From 3f49b0cf7fcf22dd7ce89ab1c88d22b007bdc7c8 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 11:35:09 -0700
Subject: [PATCH 114/150] add illustration

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 5699c938eae42..84d304bebc0e1 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -36,12 +36,13 @@ Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.
 - TGI v2.1 crashes when running mixtral model, see [TGI Issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
 
 
-## Results
-
-{nightly_results_benchmarking_table}
 
 ## Plots
 
-In the following plots, the error bar shows the standard error of the mean.
+In the following plots, the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
+
+<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
 
-<img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
\ No newline at end of file
+## Results
+
+{nightly_results_benchmarking_table}

From 5c3a7d083d8a229ebe34c8a63e5293766aecb476 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 11:36:00 -0700
Subject: [PATCH 115/150] make yapf and ruff happy

---
 .../scripts/plot-nightly-results.py           | 78 ++++++++++---------
 1 file changed, 40 insertions(+), 38 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 71ad313372467..1641be259e06c 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -1,55 +1,56 @@
-
-import json
-import os
-from pathlib import Path
 import argparse
-import matplotlib.pyplot as plt
+import json
 import math
+from pathlib import Path
 
+import matplotlib.pyplot as plt
 import pandas as pd
 from tabulate import tabulate
 
+
 def parse_arguments():
-    parser = argparse.ArgumentParser(description='Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder', type=str, required=True, help='The folder where the results are stored.')
-    parser.add_argument('--description', type=str, required=True, help='Description of the results.')
-    
+    parser = argparse.ArgumentParser(
+        description=
+        'Parse command line arguments for summary-nightly-results script.')
+    parser.add_argument('--results-folder',
+                        type=str,
+                        required=True,
+                        help='The folder where the results are stored.')
+    parser.add_argument('--description',
+                        type=str,
+                        required=True,
+                        help='Description of the results.')
+
     args = parser.parse_args()
     return args
 
-    
+
 def main(args):
     results_folder = Path(args.results_folder)
-    
+
     results = []
 
     # collect results
     for test_file in results_folder.glob("*_nightly_results.json"):
         with open(test_file, "r") as f:
             results = results + json.loads(f.read())
-            
-            
-    # generate markdown table            
+
+    # generate markdown table
     df = pd.DataFrame.from_dict(results)
 
-    md_table = tabulate(df,
-                        headers='keys',
-                        tablefmt='pipe',
-                        showindex=False)
-                        
+    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+
     with open(args.description, "r") as f:
         description = f.read()
-        
+
     description = description.format(
-        nightly_results_benchmarking_table=md_table
-    )
-    
+        nightly_results_benchmarking_table=md_table)
+
     with open("nightly_results.md", "w") as f:
         f.write(description)
-        
 
     plt.rcParams.update({'font.size': 15})
-        
+
     # plot results
     fig, axes = plt.subplots(3, 2, figsize=(10, 12))
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
@@ -60,7 +61,7 @@ def main(args):
                 target = df['Test name'].str.contains(model)
                 target = target & df['Engine'].str.contains(method)
                 filtered_df = df[target]
-                
+
                 if filtered_df.empty:
                     means.append(0.)
                     stds.append(0.)
@@ -69,32 +70,33 @@ def main(args):
                     std = filtered_df[f"Std {metric} (ms)"].values[0]
                     success = filtered_df["Successful req."].values[0]
                     stds.append(std / math.sqrt(success))
-                    
+
             print(means, stds)
-                    
+
             ax = axes[i, j]
-            
-            ax.errorbar(
-                ["vllm", "trt", "lmdeploy", "tgi"], 
-                means, 
-                yerr=stds,
-                fmt='o', capsize=5)
+
+            ax.errorbar(["vllm", "trt", "lmdeploy", "tgi"],
+                        means,
+                        yerr=stds,
+                        fmt='o',
+                        capsize=5)
             ax.set_ylim(bottom=0)
 
             # for i, (method, mean, std) in enumerate(zip(method, means, stds)):
             #     ax.text(
             #         i - 0.2, mean,  # Adjust position above the error bar
-            #         f'{mean:.0f}', 
-            #         ha='center', 
+            #         f'{mean:.0f}',
+            #         ha='center',
             #         va='bottom'
             #     )
-            
+
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
 
     fig.tight_layout(pad=0.6)
     fig.savefig("nightly_results.png", bbox_inches='tight')
 
+
 if __name__ == '__main__':
     args = parse_arguments()
-    main(args)
\ No newline at end of file
+    main(args)

From ec6f42de9ea32c1c3baa491e31cfc88a8d6bdab9 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 13:16:11 -0700
Subject: [PATCH 116/150] add datetime to filename and make yapf happy

---
 .../nightly-benchmarks/scripts/summary-nightly-results.py     | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index d9fc46cb45c92..f7d765a9ac06e 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -1,3 +1,4 @@
+import datetime
 import json
 import os
 from pathlib import Path
@@ -56,7 +57,8 @@
     serving_md_table_lines = serving_md_table_with_headers.split('\n')
     serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
 
-    prefix = os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+    prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
+    prefix = prefix + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 
     # document benchmarking results in markdown
     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:

From 00905776e217ba2b4e22723ff8ed95db177667d6 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 13:31:07 -0700
Subject: [PATCH 117/150] debug mixtral and llama70B

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 265 ++++++++++++------
 .../scripts/run-trt-nightly.sh                |  18 +-
 2 files changed, 189 insertions(+), 94 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 5616ce6af028d..9bea50673e81f 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,5 +1,5 @@
 steps: 
-  - label: "A100 trt benchmark"
+  - label: "A100 trt benchmark mixtral8x7B"
     priority: 100
     agents:
       queue: A100
@@ -22,6 +22,8 @@ steps:
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
+            - name: TEST_SELECTOR
+              value: mixtral8x7B_tp2
             - name: HF_HOME
               value: /root/.cache/huggingface
             - name: VLLM_SOURCE_CODE_LOC
@@ -41,49 +43,8 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: openmmlab/lmdeploy:v0.5.0
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
-  - label: "A100 vllm benchmark"
+  - wait
+  - label: "A100 trt benchmark llama8B"
     priority: 100
     agents:
       queue: A100
@@ -92,7 +53,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: vllm/vllm-openai:latest
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
             command:
             - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
             resources:
@@ -106,6 +67,8 @@ steps:
             env:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
+            - name: TEST_SELECTOR
+              value: llama8B_tp1
             - name: HF_HOME
               value: /root/.cache/huggingface
             - name: VLLM_SOURCE_CODE_LOC
@@ -125,48 +88,176 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  - label: "A100 tgi benchmark"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: ghcr.io/huggingface/text-generation-inference:2.1
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory 
+  # - label: "A100 trt benchmark llama70B"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           - name: hf-cache
+  #             mountPath: /root/.cache/huggingface
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: TEST_SELECTOR
+  #             value: llama70B_tp4
+  #           - name: HF_HOME
+  #             value: /root/.cache/huggingface
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  #         - name: hf-cache
+  #           hostPath:
+  #             path: /root/.cache/huggingface
+  #             type: Directory
+  # - label: "A100 lmdeploy benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: openmmlab/lmdeploy:v0.5.0
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           - name: hf-cache
+  #             mountPath: /root/.cache/huggingface
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_HOME
+  #             value: /root/.cache/huggingface
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  #         - name: hf-cache
+  #           hostPath:
+  #             path: /root/.cache/huggingface
+  #             type: Directory
+  # - label: "A100 vllm benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: vllm/vllm-openai:latest
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           - name: hf-cache
+  #             mountPath: /root/.cache/huggingface
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_HOME
+  #             value: /root/.cache/huggingface
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  #         - name: hf-cache
+  #           hostPath:
+  #             path: /root/.cache/huggingface
+  #             type: Directory
+  # - label: "A100 tgi benchmark"
+  #   priority: 100
+  #   agents:
+  #     queue: A100
+  #   plugins:
+  #   - kubernetes:
+  #       podSpec:
+  #         priorityClassName: perf-benchmark
+  #         containers:
+  #         - image: ghcr.io/huggingface/text-generation-inference:2.1
+  #           command:
+  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  #           resources:
+  #             limits:
+  #               nvidia.com/gpu: 8
+  #           volumeMounts:
+  #           - name: devshm
+  #             mountPath: /dev/shm
+  #           - name: hf-cache
+  #             mountPath: /root/.cache/huggingface
+  #           env:
+  #           - name: VLLM_USAGE_SOURCE
+  #             value: ci-test
+  #           - name: HF_HOME
+  #             value: /root/.cache/huggingface
+  #           - name: VLLM_SOURCE_CODE_LOC
+  #             value: /workspace/build/buildkite/vllm/performance-benchmark
+  #           - name: HF_TOKEN
+  #             valueFrom:
+  #               secretKeyRef:
+  #                 name: hf-token-secret
+  #                 key: token
+  #         nodeSelector:
+  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  #         volumes:
+  #         - name: devshm
+  #           emptyDir:
+  #             medium: Memory
+  #         - name: hf-cache
+  #           hostPath:
+  #             path: /root/.cache/huggingface
+  #             type: Directory 
   - wait
   - label: "Plot"
     priority: 100
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index ae6f4316eb4c2..a063a78e51d4c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -95,13 +95,8 @@ run_serving_tests() {
     fi
 
 
-    # prepare tokenizer
+
     cd $VLLM_SOURCE_CODE_LOC/benchmarks
-    rm -rf /tokenizer_cache
-    mkdir /tokenizer_cache
-    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
-      --model "$model" \
-      --cachedir /tokenizer_cache
 
 
     # run the server
@@ -119,8 +114,17 @@ run_serving_tests() {
       break
     fi
 
-    # go back to vllm benchmarking directory
+    # prepare tokenizer
     cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    rm -rf /tokenizer_cache
+    mkdir /tokenizer_cache
+    # update transformers package, to make sure mixtral tokenizer is available
+    python -m pip install transformers -U
+    python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
+      --model "$model" \
+      --cachedir /tokenizer_cache
+    cd $VLLM_SOURCE_CODE_LOC/benchmarks
+    
 
     # iterate over different QPS
     for qps in $qps_list; do

From f76a04a84bc6b6d4c72cbd23d5093549305bd2dd Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 13:45:59 -0700
Subject: [PATCH 118/150] pin lmdeploy transformers to 4.41.2

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md        | 5 +++--
 .../nightly-benchmarks/scripts/run-lmdeploy-nightly.sh       | 2 ++
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 84d304bebc0e1..eaa59309c0957 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -31,9 +31,10 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details.
 
 
-## Known crashes
+## Known issues
 
-- TGI v2.1 crashes when running mixtral model, see [TGI Issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
+- TGI v2.1 crashes when running mixtral model, see [tgi issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
+- pin the transformers library to 4.41.2 to avoid lmdeploy missing cache_position error, see [lmdeploy issue 1885](https://github.com/InternLM/lmdeploy/issues/1885).
 
 
 
diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index c23438679578b..0f6393b2f7784 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -205,6 +205,8 @@ main() {
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  python -m pip install transformers==4.41.2
+
   export CURRENT_LLM_SERVING_ENGINE=lmdeploy
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
   python -m pip install tabulate pandas

From 859d6f3e55071f3ae37b0fc5c32947fb6f296123 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 13:48:34 -0700
Subject: [PATCH 119/150] skip the test case instead of exiting the whole test
 suite

---
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh  | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh  | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index b805c52d3fa8c..492fb5260ed70 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -65,7 +65,7 @@ run_serving_tests() {
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
-      exit 0
+      continue
     fi
 
     # append tgi to the test name
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index a063a78e51d4c..59e129f7b9f52 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -65,7 +65,7 @@ run_serving_tests() {
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
-      exit 0
+      continue
     fi
 
     # append trt to the test name
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index 1e6d2893983bf..abed7cbb67348 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -70,7 +70,7 @@ run_serving_tests() {
     # if TEST_SELECTOR is set, only run the test cases that match the selector
     if [[ -n "$TEST_SELECTOR" ]] && [[ ! "$test_name" =~ $TEST_SELECTOR ]]; then
       echo "Skip test case $test_name."
-      exit 0
+      continue
     fi
 
     # append vllm to the test name

From 3ce4f5fdb2f27d1859ecf4c93d3051227b44629c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 14:52:33 -0700
Subject: [PATCH 120/150] update transformers for mixtral model

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 22 ++++++++++---------
 .../scripts/run-trt-nightly.sh                |  1 +
 2 files changed, 13 insertions(+), 10 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 9bea50673e81f..bb6af945146c2 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,5 +1,5 @@
 steps: 
-  - label: "A100 trt benchmark mixtral8x7B"
+  - label: "A100 lmdeploy benchmark"
     priority: 100
     agents:
       queue: A100
@@ -8,7 +8,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+          - image: openmmlab/lmdeploy:v0.5.0
             command:
             - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
             resources:
@@ -20,10 +20,10 @@ steps:
             - name: hf-cache
               mountPath: /root/.cache/huggingface
             env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
             - name: TEST_SELECTOR
               value: mixtral8x7B_tp2
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
             - name: HF_HOME
               value: /root/.cache/huggingface
             - name: VLLM_SOURCE_CODE_LOC
@@ -44,7 +44,7 @@ steps:
               path: /root/.cache/huggingface
               type: Directory
   - wait
-  - label: "A100 trt benchmark llama8B"
+  - label: "A100 trt benchmark mixtral8x7B"
     priority: 100
     agents:
       queue: A100
@@ -68,7 +68,7 @@ steps:
             - name: VLLM_USAGE_SOURCE
               value: ci-test
             - name: TEST_SELECTOR
-              value: llama8B_tp1
+              value: mixtral8x7B_tp2
             - name: HF_HOME
               value: /root/.cache/huggingface
             - name: VLLM_SOURCE_CODE_LOC
@@ -88,7 +88,7 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  # - label: "A100 trt benchmark llama70B"
+  # - label: "A100 trt benchmark llama8B"
   #   priority: 100
   #   agents:
   #     queue: A100
@@ -112,7 +112,7 @@ steps:
   #           - name: VLLM_USAGE_SOURCE
   #             value: ci-test
   #           - name: TEST_SELECTOR
-  #             value: llama70B_tp4
+  #             value: llama8B_tp1
   #           - name: HF_HOME
   #             value: /root/.cache/huggingface
   #           - name: VLLM_SOURCE_CODE_LOC
@@ -132,7 +132,7 @@ steps:
   #           hostPath:
   #             path: /root/.cache/huggingface
   #             type: Directory
-  # - label: "A100 lmdeploy benchmark"
+  # - label: "A100 trt benchmark llama70B"
   #   priority: 100
   #   agents:
   #     queue: A100
@@ -141,7 +141,7 @@ steps:
   #       podSpec:
   #         priorityClassName: perf-benchmark
   #         containers:
-  #         - image: openmmlab/lmdeploy:v0.5.0
+  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
   #           command:
   #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
   #           resources:
@@ -155,6 +155,8 @@ steps:
   #           env:
   #           - name: VLLM_USAGE_SOURCE
   #             value: ci-test
+  #           - name: TEST_SELECTOR
+  #             value: llama70B_tp4
   #           - name: HF_HOME
   #             value: /root/.cache/huggingface
   #           - name: VLLM_SOURCE_CODE_LOC
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 59e129f7b9f52..25b1bae78e6b6 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -100,6 +100,7 @@ run_serving_tests() {
 
 
     # run the server
+    python -m pip install transformers -U
     echo "Running test case $test_name"
     bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
 

From 646114d299827043451fbb4a11710a1bd0405261 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 14:53:38 -0700
Subject: [PATCH 121/150] move transformers update

---
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 25b1bae78e6b6..312126bb56b3f 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -99,8 +99,6 @@ run_serving_tests() {
     cd $VLLM_SOURCE_CODE_LOC/benchmarks
 
 
-    # run the server
-    python -m pip install transformers -U
     echo "Running test case $test_name"
     bash ../.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh "$server_params" "$common_params"
 
@@ -119,8 +117,6 @@ run_serving_tests() {
     cd $VLLM_SOURCE_CODE_LOC/benchmarks
     rm -rf /tokenizer_cache
     mkdir /tokenizer_cache
-    # update transformers package, to make sure mixtral tokenizer is available
-    python -m pip install transformers -U
     python ../.buildkite/nightly-benchmarks/scripts/download-tokenizer.py \
       --model "$model" \
       --cachedir /tokenizer_cache
@@ -205,6 +201,9 @@ main() {
   mkdir -p $RESULTS_FOLDER
   BENCHMARK_ROOT=../.buildkite/nightly-benchmarks/
 
+  # update transformers package, to make sure mixtral tokenizer is available
+  python -m pip install transformers -U
+
   export CURRENT_LLM_SERVING_ENGINE=trt
   run_serving_tests $BENCHMARK_ROOT/tests/nightly-tests.json
   python -m pip install tabulate pandas

From b6058aa91596c87fdb567d84f3d4f19d96251385 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 14:56:58 -0700
Subject: [PATCH 122/150] typo fix

---
 .buildkite/nightly-benchmarks/scripts/launch-trt-server.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
index 26d3ca610af81..f8262653a6628 100644
--- a/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
+++ b/.buildkite/nightly-benchmarks/scripts/launch-trt-server.sh
@@ -68,7 +68,7 @@ if echo "$common_params" | jq -e 'has("fp8")' > /dev/null; then
 
 else
 
-    echo "Key 'fp8' exists in common params. Use convert_checkpoint.py"
+    echo "Key 'fp8' does not exist in common params. Use convert_checkpoint.py"
     python3 convert_checkpoint.py \
         --model_dir ${model_path} \
         --dtype ${model_dtype} \

From ac4d13774b7f8623801c2e1d77b345dfec6569c6 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 16:14:46 -0700
Subject: [PATCH 123/150] bring back the full test suite

---
 .../nightly-descriptions.md                   |   2 +-
 .../nightly-benchmarks/nightly-pipeline.yaml  | 351 +++++++++---------
 .../scripts/run-trt-nightly.sh                |   1 -
 .../scripts/summary-nightly-results.py        |   2 +-
 4 files changed, 176 insertions(+), 180 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index eaa59309c0957..58a3fb2c07833 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -34,7 +34,7 @@ Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.
 ## Known issues
 
 - TGI v2.1 crashes when running mixtral model, see [tgi issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
-- pin the transformers library to 4.41.2 to avoid lmdeploy missing cache_position error, see [lmdeploy issue 1885](https://github.com/InternLM/lmdeploy/issues/1885).
+- Pin the transformers library to 4.41.2 to avoid lmdeploy missing cache_position error, see [lmdeploy issue 1885](https://github.com/InternLM/lmdeploy/issues/1885).
 
 
 
diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index bb6af945146c2..c58f99cde40c6 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,5 +1,5 @@
 steps: 
-  - label: "A100 lmdeploy benchmark"
+  - label: "A100 trt benchmark llama8B"
     priority: 100
     agents:
       queue: A100
@@ -8,7 +8,7 @@ steps:
         podSpec:
           priorityClassName: perf-benchmark
           containers:
-          - image: openmmlab/lmdeploy:v0.5.0
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
             command:
             - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
             resources:
@@ -20,10 +20,10 @@ steps:
             - name: hf-cache
               mountPath: /root/.cache/huggingface
             env:
-            - name: TEST_SELECTOR
-              value: mixtral8x7B_tp2
             - name: VLLM_USAGE_SOURCE
               value: ci-test
+            - name: TEST_SELECTOR
+              value: llama8B_tp1
             - name: HF_HOME
               value: /root/.cache/huggingface
             - name: VLLM_SOURCE_CODE_LOC
@@ -43,7 +43,6 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  - wait
   - label: "A100 trt benchmark mixtral8x7B"
     priority: 100
     agents:
@@ -88,178 +87,176 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  # - label: "A100 trt benchmark llama8B"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           - name: hf-cache
-  #             mountPath: /root/.cache/huggingface
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: TEST_SELECTOR
-  #             value: llama8B_tp1
-  #           - name: HF_HOME
-  #             value: /root/.cache/huggingface
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  #         - name: hf-cache
-  #           hostPath:
-  #             path: /root/.cache/huggingface
-  #             type: Directory
-  # - label: "A100 trt benchmark llama70B"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           - name: hf-cache
-  #             mountPath: /root/.cache/huggingface
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: TEST_SELECTOR
-  #             value: llama70B_tp4
-  #           - name: HF_HOME
-  #             value: /root/.cache/huggingface
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  #         - name: hf-cache
-  #           hostPath:
-  #             path: /root/.cache/huggingface
-  #             type: Directory
-  # - label: "A100 vllm benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: vllm/vllm-openai:latest
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           - name: hf-cache
-  #             mountPath: /root/.cache/huggingface
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_HOME
-  #             value: /root/.cache/huggingface
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  #         - name: hf-cache
-  #           hostPath:
-  #             path: /root/.cache/huggingface
-  #             type: Directory
-  # - label: "A100 tgi benchmark"
-  #   priority: 100
-  #   agents:
-  #     queue: A100
-  #   plugins:
-  #   - kubernetes:
-  #       podSpec:
-  #         priorityClassName: perf-benchmark
-  #         containers:
-  #         - image: ghcr.io/huggingface/text-generation-inference:2.1
-  #           command:
-  #           - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-  #           resources:
-  #             limits:
-  #               nvidia.com/gpu: 8
-  #           volumeMounts:
-  #           - name: devshm
-  #             mountPath: /dev/shm
-  #           - name: hf-cache
-  #             mountPath: /root/.cache/huggingface
-  #           env:
-  #           - name: VLLM_USAGE_SOURCE
-  #             value: ci-test
-  #           - name: HF_HOME
-  #             value: /root/.cache/huggingface
-  #           - name: VLLM_SOURCE_CODE_LOC
-  #             value: /workspace/build/buildkite/vllm/performance-benchmark
-  #           - name: HF_TOKEN
-  #             valueFrom:
-  #               secretKeyRef:
-  #                 name: hf-token-secret
-  #                 key: token
-  #         nodeSelector:
-  #           nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  #         volumes:
-  #         - name: devshm
-  #           emptyDir:
-  #             medium: Memory
-  #         - name: hf-cache
-  #           hostPath:
-  #             path: /root/.cache/huggingface
-  #             type: Directory 
+  - label: "A100 trt benchmark llama70B"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: TEST_SELECTOR
+              value: llama70B_tp4
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: openmmlab/lmdeploy:v0.5.0
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
+  - label: "A100 vllm benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: vllm/vllm-openai:latest
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
+  - label: "A100 tgi benchmark"
+    priority: 100
+    agents:
+      queue: A100
+    plugins:
+    - kubernetes:
+        podSpec:
+          priorityClassName: perf-benchmark
+          containers:
+          - image: ghcr.io/huggingface/text-generation-inference:2.1
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory 
   - wait
   - label: "Plot"
     priority: 100
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index 312126bb56b3f..bec59dad6e55c 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -1,7 +1,6 @@
 #!/bin/bash
 
 set -o pipefail
-set -ex
 
 check_gpus() {
   # check the number of GPUs and GPU type.
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index f7d765a9ac06e..c78c831eaab7b 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -58,7 +58,7 @@
     serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
 
     prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
-    prefix = prefix + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
+    prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
 
     # document benchmarking results in markdown
     with open(results_folder / f"{prefix}_nightly_results.md", "w") as f:

From 11731f405a19cb17f1bdfd9d98026fd664dfa5b5 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 22:34:59 -0700
Subject: [PATCH 124/150] remove wrongly added results

---
 benchmarks/results/trt_llama8B_tp1_qps_8.commands | 6 ------
 benchmarks/results/trt_nightly_results.md         | 1 -
 2 files changed, 7 deletions(-)
 delete mode 100644 benchmarks/results/trt_llama8B_tp1_qps_8.commands
 delete mode 100644 benchmarks/results/trt_nightly_results.md

diff --git a/benchmarks/results/trt_llama8B_tp1_qps_8.commands b/benchmarks/results/trt_llama8B_tp1_qps_8.commands
deleted file mode 100644
index e0312b4e22dc2..0000000000000
--- a/benchmarks/results/trt_llama8B_tp1_qps_8.commands
+++ /dev/null
@@ -1,6 +0,0 @@
-{
-  "server_command": "",
-  "client_command": "python3 benchmark_serving.py         --backend tensorrt-llm         --tokenizer /tokenizer_cache         --model meta-llama/Meta-Llama-3-8B         --dataset-name sharegpt         --dataset-path ./ShareGPT_V3_unfiltered_cleaned_split.json         --num-prompts 200         --port 8000         --save-result         --result-dir results/         --result-filename trt_llama8B_tp1_qps_8.json         --request-rate 8         --endpoint /v2/models/ensemble/generate_stream",
-  "gpu_type": "A100-SXM4-80GB",
-  "engine": "trt"
-}
diff --git a/benchmarks/results/trt_nightly_results.md b/benchmarks/results/trt_nightly_results.md
deleted file mode 100644
index 3befc74903d68..0000000000000
--- a/benchmarks/results/trt_nightly_results.md
+++ /dev/null
@@ -1 +0,0 @@
-| trt_llama8B_tp1_qps_8 | A100-SXM4-80GB |               200 |        6.49609 |          60.3214 |            55.7321 |         119.186 |         15.6021 |            14.187 |        56.2925 | trt      |

From b012f719f26e7b6f4bad17c2270c4007ad0adf84 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Thu, 4 Jul 2024 23:01:24 -0700
Subject: [PATCH 125/150] adjust plotting & provide more details in nightly
 description

---
 .../nightly-descriptions.md                   | 30 ++++++++-----------
 .../scripts/plot-nightly-results.py           |  1 +
 README.md                                     |  2 ++
 3 files changed, 16 insertions(+), 17 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 58a3fb2c07833..08e8b0c02543e 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -6,7 +6,7 @@ The main goal of this benchmarking is two-fold:
 - Reproducible: one can run the exact same set of benchmarking commands inside the exact same docker by following reproducing instructions in [reproduce.md]().
 
 
-## Versions
+## Docker images
 
 We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker images:
 - vllm/vllm-openai:v0.5.0.post1
@@ -14,33 +14,29 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following docker ima
 - openmmlab/lmdeploy:v0.5.0
 - ghcr.io/huggingface/text-generation-inference:2.1
 
-Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details.
+<!-- Please check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/nightly-pipeline.yaml">nightly-pipeline.yaml</a> artifact for more details on how we deploy the docker images. -->
 
 
-## Workload description
-
-We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
-
-- Input length: randomly sample 1000 prompts from ShareGPT dataset (with fixed random seed).
-- Output length: the corresponding output length of these 1000 prompts.
-- Batch size: dynamically determined by vllm and the arrival pattern of the requests.
-- Average QPS (query per second): 4 for 8B model and 2 for larger models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
-- Evaluation metrics: Throughput, TTFT (time to the first token, with mean and std), ITL (inter-token latency, with mean and std).
+## Hardware
 
-Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details.
+One AWS node with 8x NVIDIA A100 GPUs.
 
 
-## Known issues
+## Workload description
 
-- TGI v2.1 crashes when running mixtral model, see [tgi issue #2122](https://github.com/huggingface/text-generation-inference/issues/2122)
-- Pin the transformers library to 4.41.2 to avoid lmdeploy missing cache_position error, see [lmdeploy issue 1885](https://github.com/InternLM/lmdeploy/issues/1885).
+We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 
+- Input length: randomly sample 500 prompts from ShareGPT dataset (with fixed random seed).
+- Output length: the corresponding output length of these 500 prompts.
+- Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
+- Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
+- Evaluation metrics: Throughput, TTFT (time to the first token), ITL (inter-token latency).
 
+<!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
 
 ## Plots
 
-In the following plots, the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
+In the following plots, the dot shows the mean and the error bar shows the standard error of the mean. Value 0 means that the corresponding benchmark crashed.
 
 <img src="artifact://nightly_results.png" alt="Benchmarking results" height=250 >
 
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 1641be259e06c..9c93d654d1926 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -92,6 +92,7 @@ def main(args):
 
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
+            ax.grid()
 
     fig.tight_layout(pad=0.6)
     fig.savefig("nightly_results.png", bbox_inches='tight')
diff --git a/README.md b/README.md
index 3e0da945d9be8..879d47fefe0f4 100644
--- a/README.md
+++ b/README.md
@@ -52,6 +52,8 @@ vLLM is fast with:
 - Quantization: [GPTQ](https://arxiv.org/abs/2210.17323), [AWQ](https://arxiv.org/abs/2306.00978), [SqueezeLLM](https://arxiv.org/abs/2306.07629), FP8 KV Cache
 - Optimized CUDA kernels
 
+**Performance benchmark**: We include a [performance benchmark](https://buildkite.com/vllm/performance-benchmark/builds/3924) that compares the performance of vllm against other LLM serving engines ([TensorRT-LLM](https://github.com/NVIDIA/TensorRT-LLM), [text-generation-inference](https://github.com/huggingface/text-generation-inference) and [lmdeploy](https://github.com/InternLM/lmdeploy)).
+
 vLLM is flexible and easy to use with:
 
 - Seamless integration with popular Hugging Face models

From 4def3026f196f351ac45a71a009666a48576ad7d Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:03:42 -0700
Subject: [PATCH 126/150] adjust figure -- add grid, bar plot, color,
 +throughput

---
 .../scripts/plot-nightly-results.py           | 50 +++++++++++++------
 .../scripts/summary-nightly-results.py        |  2 +
 2 files changed, 36 insertions(+), 16 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 9c93d654d1926..c67b1b8414b70 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -52,11 +52,11 @@ def main(args):
     plt.rcParams.update({'font.size': 15})
 
     # plot results
-    fig, axes = plt.subplots(3, 2, figsize=(10, 12))
+    fig, axes = plt.subplots(3, 3, figsize=(10, 12))
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
-            means, stds = [], []
+            means, stds = [], [], []
             for method in methods:
                 target = df['Test name'].str.contains(model)
                 target = target & df['Engine'].str.contains(method)
@@ -75,24 +75,42 @@ def main(args):
 
             ax = axes[i, j]
 
-            ax.errorbar(["vllm", "trt", "lmdeploy", "tgi"],
-                        means,
-                        yerr=stds,
-                        fmt='o',
-                        capsize=5)
+            ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
+                    means,
+                    yerr=stds,
+                    capsize=5,
+                    colors=['#E69F00', '#56B4E9','#D55E00', '#009E73'])
             ax.set_ylim(bottom=0)
 
-            # for i, (method, mean, std) in enumerate(zip(method, means, stds)):
-            #     ax.text(
-            #         i - 0.2, mean,  # Adjust position above the error bar
-            #         f'{mean:.0f}',
-            #         ha='center',
-            #         va='bottom'
-            #     )
-
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric} comparison")
-            ax.grid()
+            ax.grid(axis='y')
+            
+        metric = "Tput"
+        j = 2
+        if True:
+            tputs = []
+            for method in methods:
+                target = df['Test name'].str.contains(model)
+                target = target & df['Engine'].str.contains(method)
+                filtered_df = df[target]
+
+                if filtered_df.empty:
+                    tputs.append(0.)
+                else:
+                    tputs.append(filtered_df["Input Tput (tok/s)"].values[0] + filtered_df["Output Tput (tok/s)"].values[0])
+            
+            ax = axes[i, j]
+
+            ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
+                    tputs,
+                    colors=['#E69F00', '#56B4E9','#D55E00', '#009E73'])
+            ax.set_ylim(bottom=0)
+
+            ax.set_ylabel(f"Tput (token/s)")
+            ax.set_title(f"{model} {metric} comparison")
+            ax.grid(axis='y')
+                    
 
     fig.tight_layout(pad=0.6)
     fig.savefig("nightly_results.png", bbox_inches='tight')
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index c78c831eaab7b..782d1ef9aab98 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -19,6 +19,8 @@
     "std_ttft_ms": "Std TTFT (ms)",
     "mean_itl_ms": "Mean ITL (ms)",
     "std_itl_ms": "Std ITL (ms)",
+    "input_throughput": "Input Tput (tok/s)",
+    "output_throughput": "Output Tput (tok/s)",
     "engine": "Engine",
 }
 

From 6b77d2bf8a0eedcd0e5939eb1a4376bdc2111bef Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:12:37 -0700
Subject: [PATCH 127/150] typo fix

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index c67b1b8414b70..74f10c467c186 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -56,7 +56,7 @@ def main(args):
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
-            means, stds = [], [], []
+            means, stds = [], []
             for method in methods:
                 target = df['Test name'].str.contains(model)
                 target = target & df['Engine'].str.contains(method)

From 8c0259ccb5e2c3ada603ba3f53564b172657e61c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:19:22 -0700
Subject: [PATCH 128/150] bug fix: set color using attribute

---
 .../scripts/plot-nightly-results.py           | 21 ++++++++++++-------
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 74f10c467c186..e2b12a948e074 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -26,6 +26,7 @@ def parse_arguments():
 
 
 def main(args):
+    bar_colors = ['#E69F00', '#56B4E9','#D55E00', '#009E73']
     results_folder = Path(args.results_folder)
 
     results = []
@@ -75,11 +76,13 @@ def main(args):
 
             ax = axes[i, j]
 
-            ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
-                    means,
-                    yerr=stds,
-                    capsize=5,
-                    colors=['#E69F00', '#56B4E9','#D55E00', '#009E73'])
+            bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
+                        means,
+                        yerr=stds,
+                        capsize=5,
+            )
+            for idx, bar in enumerate(bars):
+                bar.set_color(bar_colors[idx])
             ax.set_ylim(bottom=0)
 
             ax.set_ylabel(f"{metric} (ms)")
@@ -102,9 +105,11 @@ def main(args):
             
             ax = axes[i, j]
 
-            ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
-                    tputs,
-                    colors=['#E69F00', '#56B4E9','#D55E00', '#009E73'])
+            bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
+                        tputs,)
+            for idx, bar in enumerate(bars):
+                bar.set_color(bar_colors[idx])
+                
             ax.set_ylim(bottom=0)
 
             ax.set_ylabel(f"Tput (token/s)")

From 2ee07df2f9d041a810630993ebaf6aa0e9f2114a Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:25:27 -0700
Subject: [PATCH 129/150] mute curl output --- it's getting too long

---
 .buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh      | 2 +-
 .buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh     | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
index 0f6393b2f7784..d6f112aaa42fd 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-lmdeploy-nightly.sh
@@ -46,7 +46,7 @@ wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
   timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
+    until curl -s localhost:8000/v1/completions > /dev/null; do
       sleep 1
     done' && return 0 || return 1
 }
diff --git a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
index 492fb5260ed70..fed03654f8b77 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-tgi-nightly.sh
@@ -44,7 +44,7 @@ json2args() {
 
 wait_for_server() {
   timeout 1200 bash -c '
-    until curl localhost:8000/generate_stream; do
+    until curl -s localhost:8000/generate_stream > /dev/null; do
       sleep 1
     done' && return 0 || return 1
 }
diff --git a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
index bec59dad6e55c..4a82b9ec64d71 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-trt-nightly.sh
@@ -44,7 +44,7 @@ json2args() {
 
 wait_for_server() {
   timeout 1200 bash -c '
-    until curl localhost:8000/generate_stream; do
+    until curl -s localhost:8000/generate_stream > /dev/null; do
       sleep 1
     done' && return 0 || return 1
 }
diff --git a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
index abed7cbb67348..663045b8a9122 100644
--- a/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
+++ b/.buildkite/nightly-benchmarks/scripts/run-vllm-nightly.sh
@@ -50,7 +50,7 @@ wait_for_server() {
   # wait for vllm server to start
   # return 1 if vllm server crashes
   timeout 1200 bash -c '
-    until curl localhost:8000/v1/completions; do
+    until curl -s localhost:8000/v1/completions > /dev/null; do
       sleep 1
     done' && return 0 || return 1
 }

From 9547066fd3de7c2dbf3155a71733bf15360534ee Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:38:20 -0700
Subject: [PATCH 130/150] adjust coloring

---
 .../nightly-benchmarks/scripts/plot-nightly-results.py    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index e2b12a948e074..c7545cbc79ae8 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -26,7 +26,7 @@ def parse_arguments():
 
 
 def main(args):
-    bar_colors = ['#E69F00', '#56B4E9','#D55E00', '#009E73']
+    bar_colors = ['#56B4E9', '#009E73', '#D55E00', '#E69F00']
     results_folder = Path(args.results_folder)
 
     results = []
@@ -53,7 +53,7 @@ def main(args):
     plt.rcParams.update({'font.size': 15})
 
     # plot results
-    fig, axes = plt.subplots(3, 3, figsize=(10, 12))
+    fig, axes = plt.subplots(3, 3, figsize=(14, 14))
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
@@ -74,7 +74,7 @@ def main(args):
 
             print(means, stds)
 
-            ax = axes[i, j]
+            ax = axes[i, j+1]
 
             bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
                         means,
@@ -90,7 +90,7 @@ def main(args):
             ax.grid(axis='y')
             
         metric = "Tput"
-        j = 2
+        j = 0
         if True:
             tputs = []
             for method in methods:

From a3085a10f67817fbb2bbf6d197b5ae1e22664092 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:42:58 -0700
Subject: [PATCH 131/150] increase font size, adjust coloring

---
 .buildkite/nightly-benchmarks/nightly-descriptions.md        | 2 +-
 .../nightly-benchmarks/scripts/plot-nightly-results.py       | 5 +++--
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-descriptions.md b/.buildkite/nightly-benchmarks/nightly-descriptions.md
index 08e8b0c02543e..c3d3cbf473968 100644
--- a/.buildkite/nightly-benchmarks/nightly-descriptions.md
+++ b/.buildkite/nightly-benchmarks/nightly-descriptions.md
@@ -30,7 +30,7 @@ We benchmark vllm, tensorrt-llm, lmdeploy and tgi using the following workload:
 - Output length: the corresponding output length of these 500 prompts.
 - Models: llama-3 8B, llama-3 70B, mixtral 8x7B.
 - Average QPS (query per second): 4 for the small model (llama-3 8B) and 2 for other two models. For each QPS, the arrival time of each query is determined using a random Poisson process (with fixed random seed).
-- Evaluation metrics: Throughput, TTFT (time to the first token), ITL (inter-token latency).
+- Evaluation metrics: Throughput (higher is better), TTFT (time to first token; lower is better), ITL (inter-token latency; lower is better).
 
 <!-- Check <a href="artifact://workspace/build/buildkite/vllm/performance-benchmark/.buildkite/nightly-benchmarks/tests/nightly-tests.json">nightly-tests.json</a> artifact for more details. -->
 
diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index c7545cbc79ae8..3e5bd72f56de2 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -50,10 +50,11 @@ def main(args):
     with open("nightly_results.md", "w") as f:
         f.write(description)
 
-    plt.rcParams.update({'font.size': 15})
+    plt.rcParams.update({'font.size': 20})
 
     # plot results
     fig, axes = plt.subplots(3, 3, figsize=(14, 14))
+    fig.subplots_adjust(hspace=0.5)
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
@@ -117,7 +118,7 @@ def main(args):
             ax.grid(axis='y')
                     
 
-    fig.tight_layout(pad=0.6)
+    fig.tight_layout()
     fig.savefig("nightly_results.png", bbox_inches='tight')
 
 

From 0a554aef2e26914301a6fc3f309f10f1d9f7aae8 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:43:58 -0700
Subject: [PATCH 132/150] adjust font size

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 3e5bd72f56de2..cf9042a8c14d2 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -50,7 +50,7 @@ def main(args):
     with open("nightly_results.md", "w") as f:
         f.write(description)
 
-    plt.rcParams.update({'font.size': 20})
+    plt.rcParams.update({'font.size': 18})
 
     # plot results
     fig, axes = plt.subplots(3, 3, figsize=(14, 14))

From c6c9292003fa01a1a7ab13d45ce1318772993deb Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:45:35 -0700
Subject: [PATCH 133/150] adjust spacing

---
 .../nightly-benchmarks/scripts/plot-nightly-results.py    | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index cf9042a8c14d2..7a860261593bf 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -53,8 +53,8 @@ def main(args):
     plt.rcParams.update({'font.size': 18})
 
     # plot results
-    fig, axes = plt.subplots(3, 3, figsize=(14, 14))
-    fig.subplots_adjust(hspace=0.5)
+    fig, axes = plt.subplots(3, 3, figsize=(15, 14))
+    fig.subplots_adjust(hspace=1)
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
         for j, metric in enumerate(["TTFT", "ITL"]):
@@ -73,6 +73,7 @@ def main(args):
                     success = filtered_df["Successful req."].values[0]
                     stds.append(std / math.sqrt(success))
 
+            print(model, metric)
             print(means, stds)
 
             ax = axes[i, j+1]
@@ -103,6 +104,9 @@ def main(args):
                     tputs.append(0.)
                 else:
                     tputs.append(filtered_df["Input Tput (tok/s)"].values[0] + filtered_df["Output Tput (tok/s)"].values[0])
+
+            print(model, metric)
+            print(tputs)
             
             ax = axes[i, j]
 

From 4788d27f4c90d37129614849a0ea67c56796cca3 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:46:55 -0700
Subject: [PATCH 134/150] increase font size

---
 .../nightly-benchmarks/scripts/plot-nightly-results.py    | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 7a860261593bf..13f87ae5a222a 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -50,10 +50,10 @@ def main(args):
     with open("nightly_results.md", "w") as f:
         f.write(description)
 
-    plt.rcParams.update({'font.size': 18})
+    plt.rcParams.update({'font.size': 20})
 
     # plot results
-    fig, axes = plt.subplots(3, 3, figsize=(15, 14))
+    fig, axes = plt.subplots(3, 3, figsize=(16, 14))
     fig.subplots_adjust(hspace=1)
     methods = ["vllm", "trt", "lmdeploy", "tgi"]
     for i, model in enumerate(["llama8B", "llama70B", "mixtral8x7B"]):
@@ -88,7 +88,7 @@ def main(args):
             ax.set_ylim(bottom=0)
 
             ax.set_ylabel(f"{metric} (ms)")
-            ax.set_title(f"{model} {metric} comparison")
+            ax.set_title(f"{model} {metric}")
             ax.grid(axis='y')
             
         metric = "Tput"
@@ -118,7 +118,7 @@ def main(args):
             ax.set_ylim(bottom=0)
 
             ax.set_ylabel(f"Tput (token/s)")
-            ax.set_title(f"{model} {metric} comparison")
+            ax.set_title(f"{model} {metric}")
             ax.grid(axis='y')
                     
 

From ccc160ceaf9248d488fa860f8ce6d8b0d5763c43 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:47:37 -0700
Subject: [PATCH 135/150] increase cap size

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index 13f87ae5a222a..a53d9570dac23 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -81,7 +81,7 @@ def main(args):
             bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
                         means,
                         yerr=stds,
-                        capsize=5,
+                        capsize=10,
             )
             for idx, bar in enumerate(bars):
                 bar.set_color(bar_colors[idx])

From b6c557211f7c4eb3919a54f6a55f087ad11307a4 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Sat, 6 Jul 2024 23:51:24 -0700
Subject: [PATCH 136/150] make yapf and ruff happy

---
 .../scripts/plot-nightly-results.py           | 30 +++++++++++--------
 1 file changed, 17 insertions(+), 13 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index a53d9570dac23..b57e2d384e744 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -76,12 +76,13 @@ def main(args):
             print(model, metric)
             print(means, stds)
 
-            ax = axes[i, j+1]
+            ax = axes[i, j + 1]
 
-            bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
-                        means,
-                        yerr=stds,
-                        capsize=10,
+            bars = ax.bar(
+                ["vllm", "trt", "lmdeploy", "tgi"],
+                means,
+                yerr=stds,
+                capsize=10,
             )
             for idx, bar in enumerate(bars):
                 bar.set_color(bar_colors[idx])
@@ -90,7 +91,7 @@ def main(args):
             ax.set_ylabel(f"{metric} (ms)")
             ax.set_title(f"{model} {metric}")
             ax.grid(axis='y')
-            
+
         metric = "Tput"
         j = 0
         if True:
@@ -103,24 +104,27 @@ def main(args):
                 if filtered_df.empty:
                     tputs.append(0.)
                 else:
-                    tputs.append(filtered_df["Input Tput (tok/s)"].values[0] + filtered_df["Output Tput (tok/s)"].values[0])
+                    input_tput = filtered_df["Input Tput (tok/s)"].values[0]
+                    output_tput = filtered_df["Output Tput (tok/s)"].values[0]
+                    tputs.append(input_tput + output_tput)
 
             print(model, metric)
             print(tputs)
-            
+
             ax = axes[i, j]
 
-            bars = ax.bar(["vllm", "trt", "lmdeploy", "tgi"],
-                        tputs,)
+            bars = ax.bar(
+                ["vllm", "trt", "lmdeploy", "tgi"],
+                tputs,
+            )
             for idx, bar in enumerate(bars):
                 bar.set_color(bar_colors[idx])
-                
+
             ax.set_ylim(bottom=0)
 
-            ax.set_ylabel(f"Tput (token/s)")
+            ax.set_ylabel("Tput (token/s)")
             ax.set_title(f"{model} {metric}")
             ax.grid(axis='y')
-                    
 
     fig.tight_layout()
     fig.savefig("nightly_results.png", bbox_inches='tight')

From 13d8c0479f218528dd149362d393deae869fb0bd Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 16:31:27 -0700
Subject: [PATCH 137/150] allow running performance benchmark & nightly
 benchmark simultaneously

---
 .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 10 ++--------
 1 file changed, 2 insertions(+), 8 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index 8a4f852477713..3c2c0770e52cf 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -10,7 +10,6 @@ apt install -y curl jq
 # Install minijinja for templating
 curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
 source $HOME/.cargo/env
-target_yaml_file=""
 
 # If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
 if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
@@ -18,16 +17,11 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
 
   if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
     echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
-    target_yaml_file=".buildkite/nightly-benchmarks/benchmark-pipeline.yaml"
+    buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
   fi
 
   if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
     echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
-    target_yaml_file=".buildkite/nightly-benchmarks/nightly-pipeline.yaml"
+    buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml
   fi
 fi
-
-if [ -n "$target_yaml_file" ]; then
-  # Upload sample.yaml
-  buildkite-agent pipeline upload $target_yaml_file
-fi

From 4d77e8f896f94a5c467a1596030807f06d419d7a Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 16:34:13 -0700
Subject: [PATCH 138/150] adjust the annotation context for nightly benchmark
 so that it does not overlap with performance benchmark

---
 .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
index 99f1548039ab6..1168912c6e229 100644
--- a/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+++ b/.buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
@@ -34,7 +34,7 @@ main() {
     /workspace/buildkite-agent artifact upload "nightly_results.png"
     /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
     /workspace/buildkite-agent artifact upload $VLLM_SOURCE_CODE_LOC/.buildkite/nightly-benchmarks/tests/nightly-tests.json
-    /workspace/buildkite-agent annotate --style "success" --context "benchmark-results" --append < nightly_results.md
+    /workspace/buildkite-agent annotate --style "success" --context "nightly-benchmarks-results" --append < nightly_results.md
 }
 
 main "$@"
\ No newline at end of file

From da41c537a5d20256e8c733f22b27acaca654f190 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 16:53:09 -0700
Subject: [PATCH 139/150] cut redundant lines in nightly-pipeline.yaml using
 yaml anchor

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 272 ++++--------------
 1 file changed, 48 insertions(+), 224 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index c58f99cde40c6..75ea50d10ad95 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,136 +1,52 @@
+common: &common
+  priorityClassName: perf-benchmark
+  containers:
+  - command:
+    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+    resources:
+      limits:
+        nvidia.com/gpu: 8
+    volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+    env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+  - name: devshm
+    emptyDir:
+      medium: Memory
+  - name: hf-cache
+    hostPath:
+      path: /root/.cache/huggingface
+      type: Directory
+
 steps: 
-  - label: "A100 trt benchmark llama8B"
+  - block: ":rocket: Ready for comparing vllm against alternatives?"
+  - label: "A100 trt benchmark"
     priority: 100
     agents:
       queue: A100
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
+          <<: *common
           containers:
           - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: TEST_SELECTOR
-              value: llama8B_tp1
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
-  - label: "A100 trt benchmark mixtral8x7B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: TEST_SELECTOR
-              value: mixtral8x7B_tp2
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
-  - label: "A100 trt benchmark llama70B"
-    priority: 100
-    agents:
-      queue: A100
-    plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: TEST_SELECTOR
-              value: llama70B_tp4
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
+
   - label: "A100 lmdeploy benchmark"
     priority: 100
     agents:
@@ -138,41 +54,10 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
+          <<: *common
           containers:
           - image: openmmlab/lmdeploy:v0.5.0
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
+
   - label: "A100 vllm benchmark"
     priority: 100
     agents:
@@ -180,41 +65,10 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
+          <<: *common
           containers:
           - image: vllm/vllm-openai:latest
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
+
   - label: "A100 tgi benchmark"
     priority: 100
     agents:
@@ -222,42 +76,12 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
+          <<: *common
           containers:
           - image: ghcr.io/huggingface/text-generation-inference:2.1
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory 
+
   - wait
+
   - label: "Plot"
     priority: 100
     agents:
@@ -291,4 +115,4 @@ steps:
           volumes:
           - name: devshm
             emptyDir:
-              medium: Memory 
\ No newline at end of file
+              medium: Memory
\ No newline at end of file

From c1080840ebfe93c8abd707b8e944e388f387b9c5 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 16:56:08 -0700
Subject: [PATCH 140/150] add dpi=400

---
 .buildkite/nightly-benchmarks/scripts/plot-nightly-results.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
index b57e2d384e744..e5cfcc64a9b2a 100644
--- a/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/plot-nightly-results.py
@@ -127,7 +127,7 @@ def main(args):
             ax.grid(axis='y')
 
     fig.tight_layout()
-    fig.savefig("nightly_results.png", bbox_inches='tight')
+    fig.savefig("nightly_results.png", bbox_inches='tight', dpi=400)
 
 
 if __name__ == '__main__':

From 57e678327bbbb2bf0ae9b7d22aeaf8d01ed0837c Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 17:03:27 -0700
Subject: [PATCH 141/150] adjust pipeline upload order

---
 .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index 3c2c0770e52cf..29af45013aa87 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -15,13 +15,16 @@ source $HOME/.cargo/env
 if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
   PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
 
+  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
+    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
+    buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml
+  fi
+
+  # Run performance benchmark first by upload it at last
+  # See https://buildkite.com/docs/agent/v3/cli-pipeline
   if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
     echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
     buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
   fi
 
-  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
-    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
-    buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml
-  fi
 fi

From b057b4bb13c40be2b2258848149c9c724b2f2a13 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 17:38:12 -0700
Subject: [PATCH 142/150] merge two pipelines using yq

---
 .../nightly-benchmarks/kickoff-pipeline.sh    | 20 ++++++++++++-------
 1 file changed, 13 insertions(+), 7 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index 29af45013aa87..692506532a011 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -6,6 +6,8 @@ set -euo pipefail
 # Install system packages
 apt update
 apt install -y curl jq
+# install yq
+add-apt-repository ppa:rmescandon/yq -y && apt update && apt install yq -y
 
 # Install minijinja for templating
 curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
@@ -15,16 +17,20 @@ source $HOME/.cargo/env
 if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
   PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
 
-  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
-    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
-    buildkite-agent pipeline upload .buildkite/nightly-benchmarks/nightly-pipeline.yaml
-  fi
+  touch final.yaml
 
-  # Run performance benchmark first by upload it at last
-  # See https://buildkite.com/docs/agent/v3/cli-pipeline
   if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
     echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
-    buildkite-agent pipeline upload .buildkite/nightly-benchmarks/benchmark-pipeline.yaml
+    # append benchmark-pipeline.yaml to the end of final.yaml
+    yq 'load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml") * load("final.yaml")' > final.yaml
+  fi
+
+  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
+    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
+    # append nightly-pipeline.yaml to the end of final.yaml
+    yq 'load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml") * load("final.yaml")' > final.yaml
   fi
 
+  buildkite-agent pipeline upload final.yaml
+
 fi

From 1053900ed75b05a87bd767fcc5dc0dc978817b44 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 18:22:39 -0700
Subject: [PATCH 143/150] adjust merging logic

---
 .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index 692506532a011..d8edf39a5882b 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -21,16 +21,18 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
 
   if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
     echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
-    # append benchmark-pipeline.yaml to the end of final.yaml
-    yq 'load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml") * load("final.yaml")' > final.yaml
+    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml
   fi
 
+  cat final.yaml
+
   if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
     echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
-    # append nightly-pipeline.yaml to the end of final.yaml
-    yq 'load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml") * load("final.yaml")' > final.yaml
+    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml
   fi
 
+  cat final.yaml
+
   buildkite-agent pipeline upload final.yaml
 
 fi

From 5ef7e8a974bdc93474436c1597c73c4531378950 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 18:23:54 -0700
Subject: [PATCH 144/150] put blocking step as the first step

---
 .buildkite/nightly-benchmarks/kickoff-pipeline.sh | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
index d8edf39a5882b..441ee58f082c2 100755
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
@@ -19,16 +19,17 @@ if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
 
   touch final.yaml
 
-  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
-    echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
-    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml
+  # put blocking step (the nightly benchmark) as the first step
+  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
+    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
+    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml
   fi
 
   cat final.yaml
 
-  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
-    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
-    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml
+  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
+    echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
+    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml
   fi
 
   cat final.yaml

From bbe115db7e796b83237ab6f2abbe49eca4b40483 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 18:51:58 -0700
Subject: [PATCH 145/150] this file has been moved to
 vllm-project/buildkite-ci. Remove it.

---
 .../nightly-benchmarks/kickoff-pipeline.sh    | 39 -------------------
 1 file changed, 39 deletions(-)
 delete mode 100755 .buildkite/nightly-benchmarks/kickoff-pipeline.sh

diff --git a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh b/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
deleted file mode 100755
index 441ee58f082c2..0000000000000
--- a/.buildkite/nightly-benchmarks/kickoff-pipeline.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env bash
-
-# NOTE(simon): this script runs inside a buildkite agent with CPU only access.
-set -euo pipefail
-
-# Install system packages
-apt update
-apt install -y curl jq
-# install yq
-add-apt-repository ppa:rmescandon/yq -y && apt update && apt install yq -y
-
-# Install minijinja for templating
-curl -sSfL https://github.com/mitsuhiko/minijinja/releases/latest/download/minijinja-cli-installer.sh | sh
-source $HOME/.cargo/env
-
-# If BUILDKITE_PULL_REQUEST != "false", then we check the PR labels using curl and jq
-if [ "$BUILDKITE_PULL_REQUEST" != "false" ]; then
-  PR_LABELS=$(curl -s "https://api.github.com/repos/vllm-project/vllm/pulls/$BUILDKITE_PULL_REQUEST" | jq -r '.labels[].name')
-
-  touch final.yaml
-
-  # put blocking step (the nightly benchmark) as the first step
-  if [[ $PR_LABELS == *"nightly-benchmarks"* ]]; then
-    echo "This PR has the 'nightly-benchmark' label. Proceeding with the nightly benchmarks."
-    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/nightly-pipeline.yaml")' > final.yaml
-  fi
-
-  cat final.yaml
-
-  if [[ $PR_LABELS == *"perf-benchmarks"* ]]; then
-    echo "This PR has the 'perf-benchmarks' label. Proceeding with the performance benchmarks."
-    yq -n 'load("final.yaml") *+ load(".buildkite/nightly-benchmarks/benchmark-pipeline.yaml")' > final.yaml
-  fi
-
-  cat final.yaml
-
-  buildkite-agent pipeline upload final.yaml
-
-fi

From fb1e3926ed02306796e91cf26d0697d01402011b Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 18:53:13 -0700
Subject: [PATCH 146/150] add warning message about the 4-hour runtime

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 75ea50d10ad95..2d33579c37364 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -35,7 +35,7 @@ common: &common
       type: Directory
 
 steps: 
-  - block: ":rocket: Ready for comparing vllm against alternatives?"
+  - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
   - label: "A100 trt benchmark"
     priority: 100
     agents:

From 50ed6b7a4a0a8451edcbd77cfc3a142cba4851d0 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Tue, 9 Jul 2024 18:57:11 -0700
Subject: [PATCH 147/150] add a wait at the end, essential when merging
 multiple yaml files

---
 .buildkite/nightly-benchmarks/nightly-pipeline.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index 2d33579c37364..de620b9f107a5 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -115,4 +115,6 @@ steps:
           volumes:
           - name: devshm
             emptyDir:
-              medium: Memory
\ No newline at end of file
+              medium: Memory
+              
+  - wait
\ No newline at end of file

From 9758f94b07c39e3b0a716758dbb41596ef260781 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 10 Jul 2024 17:12:44 -0700
Subject: [PATCH 148/150] adjust pipeline.yaml: inline shared podSpec per step

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 178 +++++++++++++-----
 1 file changed, 132 insertions(+), 46 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index de620b9f107a5..c12841b0c03cd 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,39 +1,3 @@
-common: &common
-  priorityClassName: perf-benchmark
-  containers:
-  - command:
-    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-    resources:
-      limits:
-        nvidia.com/gpu: 8
-    volumeMounts:
-    - name: devshm
-      mountPath: /dev/shm
-    - name: hf-cache
-      mountPath: /root/.cache/huggingface
-    env:
-    - name: VLLM_USAGE_SOURCE
-      value: ci-test
-    - name: HF_HOME
-      value: /root/.cache/huggingface
-    - name: VLLM_SOURCE_CODE_LOC
-      value: /workspace/build/buildkite/vllm/performance-benchmark
-    - name: HF_TOKEN
-      valueFrom:
-        secretKeyRef:
-          name: hf-token-secret
-          key: token
-  nodeSelector:
-    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-  volumes:
-  - name: devshm
-    emptyDir:
-      medium: Memory
-  - name: hf-cache
-    hostPath:
-      path: /root/.cache/huggingface
-      type: Directory
-
 steps: 
   - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
   - label: "A100 trt benchmark"
@@ -43,10 +7,41 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          <<: *common
+          priorityClassName: perf-benchmark
           containers:
           - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
   - label: "A100 lmdeploy benchmark"
     priority: 100
     agents:
@@ -54,10 +49,41 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          <<: *common
+          priorityClassName: perf-benchmark
           containers:
           - image: openmmlab/lmdeploy:v0.5.0
-
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
   - label: "A100 vllm benchmark"
     priority: 100
     agents:
@@ -65,10 +91,41 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          <<: *common
+          priorityClassName: perf-benchmark
           containers:
           - image: vllm/vllm-openai:latest
-
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory
   - label: "A100 tgi benchmark"
     priority: 100
     agents:
@@ -76,12 +133,42 @@ steps:
     plugins:
     - kubernetes:
         podSpec:
-          <<: *common
+          priorityClassName: perf-benchmark
           containers:
           - image: ghcr.io/huggingface/text-generation-inference:2.1
-
+            command:
+            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+            resources:
+              limits:
+                nvidia.com/gpu: 8
+            volumeMounts:
+            - name: devshm
+              mountPath: /dev/shm
+            - name: hf-cache
+              mountPath: /root/.cache/huggingface
+            env:
+            - name: VLLM_USAGE_SOURCE
+              value: ci-test
+            - name: HF_HOME
+              value: /root/.cache/huggingface
+            - name: VLLM_SOURCE_CODE_LOC
+              value: /workspace/build/buildkite/vllm/performance-benchmark
+            - name: HF_TOKEN
+              valueFrom:
+                secretKeyRef:
+                  name: hf-token-secret
+                  key: token
+          nodeSelector:
+            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+          volumes:
+          - name: devshm
+            emptyDir:
+              medium: Memory
+          - name: hf-cache
+            hostPath:
+              path: /root/.cache/huggingface
+              type: Directory 
   - wait
-
   - label: "Plot"
     priority: 100
     agents:
@@ -116,5 +203,4 @@ steps:
           - name: devshm
             emptyDir:
               medium: Memory
-              
   - wait
\ No newline at end of file

From 8608d17644bfb113912b696d86369415e75908ee Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 10 Jul 2024 17:19:17 -0700
Subject: [PATCH 149/150] adjust pipeline.yaml: reuse steps via a YAML anchor

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 160 +++---------------
 1 file changed, 19 insertions(+), 141 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index c12841b0c03cd..d7e3254407a2f 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,6 +1,8 @@
-steps: 
+steps:
   - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-  - label: "A100 trt benchmark"
+
+  - &benchmark_template  # Anchor for the repeated structure
+    label: "A100 trt benchmark"
     priority: 100
     agents:
       queue: A100
@@ -42,165 +44,41 @@ steps:
             hostPath:
               path: /root/.cache/huggingface
               type: Directory
-  - label: "A100 lmdeploy benchmark"
-    priority: 100
-    agents:
-      queue: A100
+
+  - <<: *benchmark_template  # Using alias to repeat the structure
+    label: "A100 lmdeploy benchmark"
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
           containers:
           - image: openmmlab/lmdeploy:v0.5.0
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
-  - label: "A100 vllm benchmark"
-    priority: 100
-    agents:
-      queue: A100
+
+  - <<: *benchmark_template  # Reuse the template
+    label: "A100 vllm benchmark"
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
           containers:
           - image: vllm/vllm-openai:latest
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
-  - label: "A100 tgi benchmark"
-    priority: 100
-    agents:
-      queue: A100
+
+  - <<: *benchmark_template  # Reuse the template
+    label: "A100 tgi benchmark"
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
           containers:
           - image: ghcr.io/huggingface/text-generation-inference:2.1
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory 
+
   - wait
-  - label: "Plot"
-    priority: 100
-    agents:
-      queue: A100
+
+  - <<: *benchmark_template  # Reuse the template for the plot
+    label: "Plot"
     plugins:
     - kubernetes:
         podSpec:
-          priorityClassName: perf-benchmark
           containers:
           - image: vllm/vllm-openai:v0.5.0.post1
             command:
             - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-  - wait
\ No newline at end of file
+
+  - wait

From 37c4c118ea79a911ad253f3cef386b6ae0859812 Mon Sep 17 00:00:00 2001
From: Kuntai Du <kuntai@uchicago.edu>
Date: Wed, 10 Jul 2024 19:24:07 -0700
Subject: [PATCH 150/150] fix pipeline yaml: split pod and container anchors

---
 .../nightly-benchmarks/nightly-pipeline.yaml  | 172 +++++++++++-------
 1 file changed, 104 insertions(+), 68 deletions(-)

diff --git a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
index d7e3254407a2f..6e399bb936fbc 100644
--- a/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
+++ b/.buildkite/nightly-benchmarks/nightly-pipeline.yaml
@@ -1,84 +1,120 @@
+common_pod_spec: &common_pod_spec
+  priorityClassName: perf-benchmark
+  nodeSelector:
+    nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
+  volumes:
+    - name: devshm
+      emptyDir:
+        medium: Memory
+    - name: hf-cache
+      hostPath:
+        path: /root/.cache/huggingface
+        type: Directory
+
+common_container_settings: &common_container_settings
+  command:
+    - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
+  resources:
+    limits:
+      nvidia.com/gpu: 8
+  volumeMounts:
+    - name: devshm
+      mountPath: /dev/shm
+    - name: hf-cache
+      mountPath: /root/.cache/huggingface
+  env:
+    - name: VLLM_USAGE_SOURCE
+      value: ci-test
+    - name: HF_HOME
+      value: /root/.cache/huggingface
+    - name: VLLM_SOURCE_CODE_LOC
+      value: /workspace/build/buildkite/vllm/performance-benchmark
+    - name: HF_TOKEN
+      valueFrom:
+        secretKeyRef:
+          name: hf-token-secret
+          key: token
+
 steps:
   - block: ":rocket: Ready for comparing vllm against alternatives? This will take 4 hours."
-
-  - &benchmark_template  # Anchor for the repeated structure
-    label: "A100 trt benchmark"
+  - label: "A100 trt benchmark"
     priority: 100
     agents:
       queue: A100
     plugins:
-    - kubernetes:
-        podSpec:
-          priorityClassName: perf-benchmark
-          containers:
-          - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
-            command:
-            - bash .buildkite/nightly-benchmarks/run-nightly-suite.sh
-            resources:
-              limits:
-                nvidia.com/gpu: 8
-            volumeMounts:
-            - name: devshm
-              mountPath: /dev/shm
-            - name: hf-cache
-              mountPath: /root/.cache/huggingface
-            env:
-            - name: VLLM_USAGE_SOURCE
-              value: ci-test
-            - name: HF_HOME
-              value: /root/.cache/huggingface
-            - name: VLLM_SOURCE_CODE_LOC
-              value: /workspace/build/buildkite/vllm/performance-benchmark
-            - name: HF_TOKEN
-              valueFrom:
-                secretKeyRef:
-                  name: hf-token-secret
-                  key: token
-          nodeSelector:
-            nvidia.com/gpu.product: NVIDIA-A100-SXM4-80GB
-          volumes:
-          - name: devshm
-            emptyDir:
-              medium: Memory
-          - name: hf-cache
-            hostPath:
-              path: /root/.cache/huggingface
-              type: Directory
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: nvcr.io/nvidia/tritonserver:24.04-trtllm-python-py3
+                <<: *common_container_settings
 
-  - <<: *benchmark_template  # Using alias to repeat the structure
-    label: "A100 lmdeploy benchmark"
+  - label: "A100 lmdeploy benchmark"
+    priority: 100
+    agents:
+      queue: A100
     plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: openmmlab/lmdeploy:v0.5.0
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: openmmlab/lmdeploy:v0.5.0
+                <<: *common_container_settings
+  
 
-  - <<: *benchmark_template  # Reuse the template
-    label: "A100 vllm benchmark"
+  - label: "A100 vllm benchmark"
+    priority: 100
+    agents:
+      queue: A100
     plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: vllm/vllm-openai:latest
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: vllm/vllm-openai:latest 
+                <<: *common_container_settings
 
-  - <<: *benchmark_template  # Reuse the template
-    label: "A100 tgi benchmark"
+  - label: "A100 tgi benchmark"
+    priority: 100
+    agents:
+      queue: A100
     plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: ghcr.io/huggingface/text-generation-inference:2.1
-
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+              - image: ghcr.io/huggingface/text-generation-inference:2.1 
+                <<: *common_container_settings
+        
   - wait
 
-  - <<: *benchmark_template  # Reuse the template for the plot
-    label: "Plot"
+  - label: "Plot"
+    priority: 100
+    agents:
+      queue: A100
     plugins:
-    - kubernetes:
-        podSpec:
-          containers:
-          - image: vllm/vllm-openai:v0.5.0.post1
-            command:
-            - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+      - kubernetes:
+          podSpec:
+            <<: *common_pod_spec
+            containers:
+            - image: vllm/vllm-openai:v0.5.0.post1
+              command:
+              - bash .buildkite/nightly-benchmarks/scripts/nightly-annotate.sh
+              resources:
+                limits:
+                  nvidia.com/gpu: 8
+              volumeMounts:
+              - name: devshm
+                mountPath: /dev/shm
+              env:
+              - name: VLLM_USAGE_SOURCE
+                value: ci-test
+              - name: VLLM_SOURCE_CODE_LOC
+                value: /workspace/build/buildkite/vllm/performance-benchmark
+              - name: HF_TOKEN
+                valueFrom:
+                  secretKeyRef:
+                    name: hf-token-secret
+                    key: token
 
-  - wait
+  - wait
\ No newline at end of file