Merge branch 'main' into remove_batch_expansion

LiuXiaoxuanPKU committed Sep 29, 2024
2 parents 5063c95 + f13a07b commit 7ae6c0b
Showing 130 changed files with 3,460 additions and 1,823 deletions.
11 changes: 11 additions & 0 deletions .buildkite/lm-eval-harness/configs/Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
@@ -0,0 +1,11 @@
# bash .buildkite/lm-eval-harness/run-lm-eval-gsm-vllm-baseline.sh -m nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test -b "auto" -l 250 -f 5 -t 1
model_name: "nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.764
- name: "exact_match,flexible-extract"
value: 0.764
limit: 250
num_fewshot: 5
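For context, an eval config like the one above is consumed by walking its tasks and metrics. The snippet below is only an illustrative sketch under that assumption; the field names mirror the YAML above, but the helper itself is hypothetical and is not the lm-eval-harness loader.

```python
# Illustrative sketch only: read an eval config shaped like the YAML above and
# list its (task, metric, expected value) targets. Field names come from the
# snippet; the function itself is hypothetical, not part of lm-eval-harness.
import yaml

def read_eval_targets(path):
    with open(path) as f:
        cfg = yaml.safe_load(f)
    targets = [(task["name"], metric["name"], metric["value"])
               for task in cfg["tasks"]
               for metric in task["metrics"]]
    return cfg["model_name"], targets

# e.g. read_eval_targets("configs/<new-config>.yaml") ->
# ("nm-testing/Meta-Llama-3-8B-Instruct-W8-Channel-A8-Dynamic-Asym-Per-Token-Test",
#  [("gsm8k", "exact_match,strict-match", 0.764),
#   ("gsm8k", "exact_match,flexible-extract", 0.764)])
```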
1 change: 1 addition & 0 deletions .buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,6 +1,7 @@
Meta-Llama-3-8B-Instruct.yaml
Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
Minitron-4B-Base-FP8.yaml
7 changes: 6 additions & 1 deletion .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -49,10 +49,15 @@ def test_lm_eval_correctness():
results = launch_lm_eval(eval_config)

# Confirm scores match ground truth.
success = True
for task in eval_config["tasks"]:
for metric in task["metrics"]:
ground_truth = metric["value"]
measured_value = results["results"][task["name"]][metric["name"]]
print(f'{task["name"]} | {metric["name"]}: '
f'ground_truth={ground_truth} | measured={measured_value}')
assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
success = success and numpy.isclose(
ground_truth, measured_value, rtol=RTOL)

# Assert at the end, print all scores even on failure for debugging.
assert success
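The change above switches the correctness test from failing on the first mismatched metric to collecting every comparison and asserting once at the end, so the CI log always shows all scores. Here is a minimal standalone sketch of that pattern; the task data and the RTOL value are placeholders, not results from this commit.

```python
# Minimal sketch of the accumulate-then-assert pattern used above.
# The task/metric data and RTOL value here are placeholders, not real results.
import numpy

RTOL = 0.05  # assumed relative tolerance, standing in for the test's module-level constant

ground_truth = {("gsm8k", "exact_match,strict-match"): 0.764}
measured = {("gsm8k", "exact_match,strict-match"): 0.7601}

success = True
for key, expected in ground_truth.items():
    value = measured[key]
    print(f"{key}: ground_truth={expected} | measured={value}")
    # Keep looping instead of asserting here, so every score gets printed.
    success = success and numpy.isclose(expected, value, rtol=RTOL)

# A single assert at the end still fails the test, but only after all
# comparisons have been logged, which makes CI failures easier to debug.
assert success
```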
5 changes: 3 additions & 2 deletions .buildkite/release-pipeline.yaml
@@ -8,8 +8,9 @@ steps:
- "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
# rename the files to change linux -> manylinux1
- "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
- "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
- "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
- "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
env:
DOCKER_BUILDKIT: "1"

2 changes: 1 addition & 1 deletion .buildkite/run-xpu-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
remove_docker_container

# Run the image and launch offline inference
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
51 changes: 32 additions & 19 deletions .buildkite/test-pipeline.yaml
@@ -9,6 +9,7 @@
# label(str): the name of the test. emoji allowed.
# fast_check(bool): whether to run this on each commit on fastcheck pipeline.
# fast_check_only(bool): run this test on fastcheck pipeline only
# optional(bool): never run this test by default (i.e. need to unblock manually)
# command(str): the single command to run for tests. incompatible with commands.
# commands(list): the list of commands to run for test. incompatbile with command.
# mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
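These comments document the schema that each pipeline step must follow. The function below is a hypothetical restatement of those constraints (in particular, that command and commands are mutually exclusive); it is not code from this repository.

```python
# Hypothetical sketch of the step-schema rules documented in the comments above.
# Not vLLM CI code; it just restates the constraints for clarity.
def validate_step(step: dict) -> None:
    if "label" not in step:
        raise ValueError("every step needs a label")
    if "command" in step and "commands" in step:
        raise ValueError("'command' and 'commands' are mutually exclusive")
    for key in ("fast_check", "fast_check_only", "optional"):
        if key in step and not isinstance(step[key], bool):
            raise ValueError(f"'{key}' must be a boolean")
    if "mirror_hardwares" in step and step["mirror_hardwares"] != ["amd"]:
        raise ValueError("only [amd] is currently supported for mirror_hardwares")

# e.g. validate_step({"label": "Example Test", "command": "pytest -v -s example"})
```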
@@ -39,7 +40,7 @@ steps:
# Check API reference (if it fails, you may have missing mock imports)
- grep \"sig sig-object py\" build/html/dev/sampling_params.html

- label: Async Engine, Inputs, Utils, Worker Test # 15min
- label: Async Engine, Inputs, Utils, Worker Test # 24min
fast_check: true
source_file_dependencies:
- vllm/
@@ -81,7 +82,7 @@ steps:
commands:
- pytest -v -s core

- label: Entrypoints Test # 20min
- label: Entrypoints Test # 40min
working_dir: "/vllm-workspace/tests"
fast_check: true
mirror_hardwares: [amd]
@@ -151,7 +152,7 @@ steps:
# OOM in the CI unless we run this separately
- pytest -v -s tokenization

- label: Examples Test # 12min
- label: Examples Test # 15min
working_dir: "/vllm-workspace/examples"
#mirror_hardwares: [amd]
source_file_dependencies:
@@ -169,15 +170,15 @@
- python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
- python3 offline_inference_encoder_decoder.py

- label: Prefix Caching Test # 7min
- label: Prefix Caching Test # 9min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/prefix_caching
commands:
- pytest -v -s prefix_caching

- label: Samplers Test # 18min
- label: Samplers Test # 36min
source_file_dependencies:
- vllm/model_executor/layers
- vllm/sampling_metadata.py
@@ -193,38 +194,38 @@
- tests/test_logits_processor
command: pytest -v -s test_logits_processor.py

- label: Speculative decoding tests # 22min
- label: Speculative decoding tests # 30min
source_file_dependencies:
- vllm/spec_decode
- tests/spec_decode
commands:
- pytest -v -s spec_decode/e2e/test_multistep_correctness.py
- pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py

- label: LoRA Test %N # 30min each
- label: LoRA Test %N # 15min each
mirror_hardwares: [amd]
source_file_dependencies:
- vllm/lora
- tests/lora
command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
parallelism: 4

- label: "PyTorch Fullgraph Smoke Test"
- label: "PyTorch Fullgraph Smoke Test" # 9min
fast_check: true
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph_smoke.py

- label: "PyTorch Fullgraph Test"
- label: "PyTorch Fullgraph Test" # 18min
source_file_dependencies:
- vllm/
- tests/compile
commands:
- pytest -v -s compile/test_full_graph.py

- label: Kernels Test %N # 30min each
- label: Kernels Test %N # 1h each
mirror_hardwares: [amd]
source_file_dependencies:
- csrc/
@@ -254,7 +255,7 @@ steps:
- pip install aiohttp
- bash run-benchmarks.sh

- label: Quantization Test # 15min
- label: Quantization Test # 33min
source_file_dependencies:
- csrc/
- vllm/model_executor/layers/quantization
@@ -298,15 +299,15 @@ steps:
- pytest -v -s models/test_oot_registration.py # it needs a clean process
- pytest -v -s models/*.py --ignore=models/test_oot_registration.py

- label: Decoder-only Language Models Test # 1h3min
- label: Decoder-only Language Models Test # 1h36min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/decoder_only/language
commands:
- pytest -v -s models/decoder_only/language

- label: Decoder-only Multi-Modal Models Test # 56min
- label: Decoder-only Multi-Modal Models Test # 1h31min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
@@ -316,15 +317,25 @@
- pytest -v -s models/decoder_only/audio_language
- pytest -v -s models/decoder_only/vision_language

- label: Other Models Test # 5min
- label: Other Models Test # 6min
#mirror_hardwares: [amd]
source_file_dependencies:
- vllm/
- tests/models/embedding/language
- tests/models/encoder_decoder/language
- tests/models/encoder_decoder/vision_language
commands:
- pytest -v -s models/embedding/language
- pytest -v -s models/encoder_decoder/language
- pytest -v -s models/encoder_decoder/vision_language

- label: Custom Models Test
#mirror_hardwares: [amd]
optional: true
commands:
# PR authors can temporarily add commands below to test individual models
# e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
# *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*

##### 1 GPU test #####
##### multi gpus test #####
@@ -357,7 +368,7 @@ steps:
- # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
- VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'

- label: Distributed Tests (2 GPUs) # 28min
- label: Distributed Tests (2 GPUs) # 40min
#mirror_hardwares: [amd]
working_dir: "/vllm-workspace/tests"
num_gpus: 2
@@ -374,14 +385,16 @@ steps:
- VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
- TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
# Avoid importing model tests that cause CUDA reinitialization error
- pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
- pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
- pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
- pip install -e ./plugins/vllm_add_dummy_model
- pytest -v -s distributed/test_distributed_oot.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
- CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py

- label: Multi-step Tests (4 GPUs) # 21min
- label: Multi-step Tests (4 GPUs) # 36min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -399,7 +412,7 @@ steps:
- pytest -v -s multi_step/test_correctness_async_llm.py
- pytest -v -s multi_step/test_correctness_llm.py

- label: Pipeline Parallelism Test # 23min
- label: Pipeline Parallelism Test # 45min
working_dir: "/vllm-workspace/tests"
num_gpus: 4
source_file_dependencies:
@@ -425,7 +438,7 @@ steps:
- export VLLM_WORKER_MULTIPROC_METHOD=spawn
- pytest -v -s -x lora/test_long_context.py

- label: Weight Loading Multiple GPU Test
- label: Weight Loading Multiple GPU Test # 33min
working_dir: "/vllm-workspace/tests"
num_gpus: 2
source_file_dependencies:
4 changes: 3 additions & 1 deletion .dockerignore
@@ -1,4 +1,6 @@
vllm/*.so
/.github/
/.venv
/build
dist
Dockerfile*
vllm/*.so
1 change: 1 addition & 0 deletions .gitignore
@@ -33,6 +33,7 @@ share/python-wheels/
.installed.cfg
*.egg
MANIFEST
/.deps/

# PyInstaller
# Usually these files are written by a python script from a template
9 changes: 9 additions & 0 deletions CMakeLists.txt
@@ -166,7 +166,16 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
endif()


#
# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
#
include(FetchContent)
get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")

#
# Define other extension targets
47 changes: 38 additions & 9 deletions Dockerfile.xpu
@@ -1,4 +1,4 @@
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base

RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@@ -7,20 +7,49 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y && \
apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1

COPY ./ /workspace/vllm
RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
ffmpeg \
git \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
python3 \
python3-dev \
python3-pip \
# vim \
wget

WORKDIR /workspace/vllm
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
COPY requirements-common.txt /workspace/vllm/requirements-common.txt

RUN --mount=type=cache,target=/root/.cache/pip \
pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-xpu.txt
pip install --no-cache-dir \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt

COPY ./ /workspace/vllm

ENV VLLM_TARGET_DEVICE=xpu

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
VLLM_TARGET_DEVICE=xpu python3 setup.py install
python3 setup.py install

CMD ["/bin/bash"]

FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'

ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
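With the new vllm-openai stage, the image starts the OpenAI-compatible API server by default (which is also why .buildkite/run-xpu-test.sh above now passes --entrypoint="" so it can run a plain offline-inference script instead). The client below is a rough sketch of exercising that default entrypoint; the host, port, and model name are assumptions, not values taken from this commit.

```python
# Hypothetical client sketch: query the OpenAI-compatible server that the
# vllm-openai stage's ENTRYPOINT launches. Host, port, and model are assumptions.
import json
import urllib.request

payload = {
    "model": "facebook/opt-125m",   # placeholder model name
    "prompt": "San Francisco is a",
    "max_tokens": 16,
}
req = urllib.request.Request(
    "http://localhost:8000/v1/completions",  # assumed default address of the API server
    data=json.dumps(payload).encode(),
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    print(json.load(resp)["choices"][0]["text"])
```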
13 changes: 6 additions & 7 deletions benchmarks/benchmark_serving.py
@@ -1,4 +1,4 @@
"""Benchmark online serving throughput.
r"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
@@ -89,8 +89,6 @@ def sample_sharegpt_requests(
tokenizer: PreTrainedTokenizerBase,
fixed_output_len: Optional[int] = None,
) -> List[Tuple[str, int, int, None]]:
if fixed_output_len is not None and fixed_output_len < 4:
raise ValueError("output_len too small")
# Load the dataset.
with open(dataset_path) as f:
dataset = json.load(f)
@@ -117,7 +115,7 @@ def sample_sharegpt_requests(
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
@@ -228,10 +226,11 @@ def sample_hf_requests(
prompt_len = len(prompt_token_ids)
output_len = len(completion_token_ids
) if fixed_output_len is None else fixed_output_len
if prompt_len < 4 or output_len < 4:
if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
# Prune too short sequences.
continue
if prompt_len > 1024 or prompt_len + output_len > 2048:
if fixed_output_len is None and \
(prompt_len > 1024 or prompt_len + output_len > 2048):
# Prune too long sequences.
continue

@@ -963,4 +962,4 @@ def main(args: argparse.Namespace):
)

args = parser.parse_args()
main(args)
main(args)
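The benchmark changes relax the pruning rules so that an explicitly fixed output length is honored instead of being filtered out; note that sample_sharegpt_requests and sample_hf_requests apply slightly different conditions. The helper below is an illustrative sketch of the sample_hf_requests variant only; the thresholds 4, 1024, and 2048 come from the diff, but the function itself is not the benchmark's code.

```python
# Simplified sketch of the pruning rule after this change: length filters only
# apply when no fixed output length was requested (sample_hf_requests variant).
# Thresholds match the diff; the helper itself is illustrative.
from typing import Optional

def keep_request(prompt_len: int,
                 output_len: int,
                 fixed_output_len: Optional[int] = None) -> bool:
    if fixed_output_len is not None:
        # The caller pinned the output length, so length-based pruning is skipped.
        return True
    if prompt_len < 4 or output_len < 4:
        return False  # too short
    if prompt_len > 1024 or prompt_len + output_len > 2048:
        return False  # too long
    return True

# e.g. keep_request(2000, 500, fixed_output_len=500) -> True after this change,
# whereas the old filter would have pruned the request as too long.
```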
(Remaining changed files not shown.)
