Revert "Merge branch 'main' into remove_batch_expansion"
This reverts commit 7ae6c0b, reversing changes made to 5063c95.
LiuXiaoxuanPKU committed Sep 29, 2024
1 parent 7ae6c0b commit 41f52ae
Showing 130 changed files with 1,823 additions and 3,460 deletions.

This file was deleted.

1 change: 0 additions & 1 deletion .buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,7 +1,6 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml
7 changes: 1 addition & 6 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -49,15 +49,10 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)
 
     # Confirm scores match ground truth.
-    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and numpy.isclose(
-                ground_truth, measured_value, rtol=RTOL)
-
-    # Assert at the end, print all scores even on failure for debugging.
-    assert success
+            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
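The revert trades diagnostics for simplicity: the old harness accumulated comparisons and asserted once at the end, so a failing run still printed every score, while the restored code asserts inside the loop and stops at the first mismatch. A self-contained sketch of the two strategies (the RTOL value and the results-dict layout here are assumptions inferred from the snippet, not taken from the harness):

import numpy

RTOL = 0.05  # assumed tolerance for illustration; the harness defines its own

def check_fail_fast(eval_config, results, rtol=RTOL):
    # Restored behavior: stop at the first metric outside tolerance.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            assert numpy.isclose(ground_truth, measured, rtol=rtol)

def check_report_all(eval_config, results, rtol=RTOL):
    # Pre-revert behavior: print every score, assert once at the end.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured}')
            success = success and numpy.isclose(ground_truth, measured, rtol=rtol)
    assert success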
5 changes: 2 additions & 3 deletions .buildkite/release-pipeline.yaml
@@ -8,9 +8,8 @@ steps:
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
     env:
       DOCKER_BUILDKIT: "1"
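For context, the ${f/linux/manylinux1} substitution in the retained rename loop replaces only the first occurrence of "linux" in each wheel filename. A rough Python equivalent of that loop, for illustration only (the directory layout is taken from the commands above):

from pathlib import Path

def rename_wheels(dist_dir: str = "artifacts/dist") -> None:
    # Re-tag each wheel linux -> manylinux1 (first occurrence only),
    # mirroring the bash parameter expansion ${f/linux/manylinux1}.
    for wheel in Path(dist_dir).glob("*.whl"):
        wheel.rename(wheel.with_name(wheel.name.replace("linux", "manylinux1", 1)))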
2 changes: 1 addition & 1 deletion .buildkite/run-xpu-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
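The script relies on a shell trap so the container is removed whether or not the test passes. A hedged Python sketch of the same cleanup-run-cleanup pattern, reusing the container name and command from the script (an illustration only; CI runs the bash version):

import subprocess

def run_xpu_smoke_test() -> None:
    cleanup = ["docker", "rm", "-f", "xpu-test"]
    subprocess.run(cleanup, check=False)  # best effort: remove any stale container
    try:
        subprocess.run(
            ["docker", "run", "--network", "host", "--name", "xpu-test",
             "--device", "/dev/dri",
             "-v", "/dev/dri/by-path:/dev/dri/by-path",
             "xpu-test", "python3", "examples/offline_inference.py"],
            check=True)
    finally:
        subprocess.run(cleanup, check=False)  # mirrors the EXIT trap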
51 changes: 19 additions & 32 deletions .buildkite/test-pipeline.yaml
@@ -9,7 +9,6 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatible with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -40,7 +39,7 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html
 
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 15min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -82,7 +81,7 @@ steps:
   commands:
   - pytest -v -s core
 
-- label: Entrypoints Test # 40min
+- label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
@@ -152,7 +151,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
-- label: Examples Test # 15min
+- label: Examples Test # 12min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -170,15 +169,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
 
-- label: Prefix Caching Test # 9min
+- label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching
 
-- label: Samplers Test # 36min
+- label: Samplers Test # 18min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -194,38 +193,38 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py
 
-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 22min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
 
-- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4
 
-- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: "PyTorch Fullgraph Smoke Test"
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph_smoke.py
 
-- label: "PyTorch Fullgraph Test" # 18min
+- label: "PyTorch Fullgraph Test"
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Test %N # 1h each
+- label: Kernels Test %N # 30min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -255,7 +254,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh
 
-- label: Quantization Test # 33min
+- label: Quantization Test # 15min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -299,15 +298,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
 
-- label: Decoder-only Language Models Test # 1h36min
+- label: Decoder-only Language Models Test # 1h3min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language
 
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Multi-Modal Models Test # 56min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -317,25 +316,15 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language
 
-- label: Other Models Test # 6min
+- label: Other Models Test # 5min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
-  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
-  - pytest -v -s models/encoder_decoder/vision_language
-
-- label: Custom Models Test
-  #mirror_hardwares: [amd]
-  optional: true
-  commands:
-  # PR authors can temporarily add commands below to test individual models
-  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
-  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
 
 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -368,7 +357,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -385,16 +374,14 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
-- label: Multi-step Tests (4 GPUs) # 36min
+- label: Multi-step Tests (4 GPUs) # 21min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -412,7 +399,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py
 
-- label: Pipeline Parallelism Test # 45min
+- label: Pipeline Parallelism Test # 23min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -438,7 +425,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
-- label: Weight Loading Multiple GPU Test # 33min
+- label: Weight Loading Multiple GPU Test
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
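The keys documented at the top of this file (label, command vs. commands, source_file_dependencies, and so on) drive how pipeline steps are generated. As a rough illustration of the stated rule that command and commands are mutually exclusive, a validator could look like the following sketch (a hypothetical helper, not part of the repository):

from typing import Any, Dict, List

def validate_steps(steps: List[Dict[str, Any]]) -> None:
    for step in steps:
        label = step.get("label")
        if not label:
            raise ValueError("every step needs a label")
        # `command` (one string) and `commands` (a list) are mutually exclusive.
        if "command" in step and "commands" in step:
            raise ValueError(f"step {label!r}: use command or commands, not both")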
4 changes: 1 addition & 3 deletions .dockerignore
@@ -1,6 +1,4 @@
-/.github/
-/.venv
-/build
+vllm/*.so
 dist
 Dockerfile*
 vllm/*.so
1 change: 0 additions & 1 deletion .gitignore
@@ -33,7 +33,6 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
-/.deps/
 
 # PyInstaller
 # Usually these files are written by a python script from a template
9 changes: 0 additions & 9 deletions CMakeLists.txt
@@ -166,16 +166,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
 
-#
-# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
-# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
-#
-include(FetchContent)
-get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
-set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
-message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
 #
 # Define other extension targets
47 changes: 9 additions & 38 deletions Dockerfile.xpu
@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@@ -7,49 +7,20 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
ffmpeg \
git \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
python3 \
python3-dev \
python3-pip \
# vim \
wget
RUN apt-get update -y && \
apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
COPY requirements-common.txt /workspace/vllm/requirements-common.txt

RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt

COPY ./ /workspace/vllm

ENV VLLM_TARGET_DEVICE=xpu
pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-xpu.txt

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 setup.py install
VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]

FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'

ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
13 changes: 7 additions & 6 deletions benchmarks/benchmark_serving.py
@@ -1,4 +1,4 @@
r"""Benchmark online serving throughput.
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
@@ -89,6 +89,8 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -115,7 +117,7 @@ def sample_sharegpt_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
+        if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
@@ -226,11 +228,10 @@ def sample_hf_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
+        if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
-        if fixed_output_len is None and \
-                (prompt_len > 1024 or prompt_len + output_len > 2048):
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
 
@@ -962,4 +963,4 @@ def main(args: argparse.Namespace):
     )
 
     args = parser.parse_args()
-    main(args)
+    main(args)
\ No newline at end of file
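After the revert, the benchmark prunes on prompt and output length even when a fixed output length is supplied, and rejects fixed output lengths below 4 outright. A self-contained sketch of the restored filter, with the thresholds copied from the diff above:

from typing import Optional

def keep_request(prompt_len: int, output_len: int,
                 fixed_output_len: Optional[int] = None) -> bool:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
    if fixed_output_len is not None:
        output_len = fixed_output_len
    if prompt_len < 4 or output_len < 4:
        return False  # prune too-short sequences
    if prompt_len > 1024 or prompt_len + output_len > 2048:
        return False  # prune too-long sequences
    return True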