Revert "Merge branch 'main' into remove_batch_expansion"
This reverts commit 7ae6c0b, reversing changes made to 5063c95.
LiuXiaoxuanPKU committed Sep 29, 2024
1 parent 7ae6c0b commit 41f52ae
Showing 130 changed files with 1,823 additions and 3,460 deletions.

This file was deleted.

1 change: 0 additions & 1 deletion .buildkite/lm-eval-harness/configs/models-small.txt
@@ -1,7 +1,6 @@
 Meta-Llama-3-8B-Instruct.yaml
 Meta-Llama-3-8B-Instruct-FP8-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-INT8-compressed-tensors.yaml
-Meta-Llama-3-8B-Instruct-INT8-compressed-tensors-asym.yaml
 Meta-Llama-3-8B-Instruct-nonuniform-compressed-tensors.yaml
 Meta-Llama-3-8B-Instruct-Channelwise-compressed-tensors.yaml
 Minitron-4B-Base-FP8.yaml
7 changes: 1 addition & 6 deletions .buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -49,15 +49,10 @@ def test_lm_eval_correctness():
     results = launch_lm_eval(eval_config)
 
     # Confirm scores match ground truth.
-    success = True
     for task in eval_config["tasks"]:
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
             print(f'{task["name"]} | {metric["name"]}: '
                   f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and numpy.isclose(
-                ground_truth, measured_value, rtol=RTOL)
-
-    # Assert at the end, print all scores even on failure for debugging.
-    assert success
+            assert numpy.isclose(ground_truth, measured_value, rtol=RTOL)
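The revert trades diagnostics for simplicity: the old harness accumulated comparisons and asserted once at the end, so a failing run still printed every score, while the restored code asserts inside the loop and stops at the first mismatch. A self-contained sketch of the two strategies (the RTOL value and the results-dict layout here are assumptions inferred from the snippet, not taken from the harness):

import numpy

RTOL = 0.05  # assumed tolerance for illustration; the harness defines its own

def check_fail_fast(eval_config, results, rtol=RTOL):
    # Restored behavior: stop at the first metric outside tolerance.
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            assert numpy.isclose(ground_truth, measured, rtol=rtol)

def check_report_all(eval_config, results, rtol=RTOL):
    # Pre-revert behavior: print every score, assert once at the end.
    success = True
    for task in eval_config["tasks"]:
        for metric in task["metrics"]:
            ground_truth = metric["value"]
            measured = results["results"][task["name"]][metric["name"]]
            print(f'{task["name"]} | {metric["name"]}: '
                  f'ground_truth={ground_truth} | measured={measured}')
            success = success and numpy.isclose(ground_truth, measured, rtol=rtol)
    assert success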
5 changes: 2 additions & 3 deletions .buildkite/release-pipeline.yaml
@@ -8,9 +8,8 @@ steps:
       - "docker run --rm -v $(pwd)/artifacts:/artifacts_host vllm-ci:build-image bash -c 'cp -r dist /artifacts_host && chmod -R a+rw /artifacts_host'"
       # rename the files to change linux -> manylinux1
       - "for f in artifacts/dist/*.whl; do mv -- \"$$f\" \"$${f/linux/manylinux1}\"; done"
-      - "mv artifacts/dist/$(ls artifacts/dist) artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/$BUILDKITE_COMMIT/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
-      - "aws s3 cp artifacts/dist/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl s3://vllm-wheels/nightly/vllm-1.0.0.dev-cp38-abi3-manylinux1_x86_64.whl"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/$BUILDKITE_COMMIT/"
+      - "aws s3 cp --recursive artifacts/dist s3://vllm-wheels/nightly/"
     env:
       DOCKER_BUILDKIT: "1"
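For context, the ${f/linux/manylinux1} substitution in the retained rename loop replaces only the first occurrence of "linux" in each wheel filename. A rough Python equivalent of that loop, for illustration only (the directory layout is taken from the commands above):

from pathlib import Path

def rename_wheels(dist_dir: str = "artifacts/dist") -> None:
    # Re-tag each wheel linux -> manylinux1 (first occurrence only),
    # mirroring the bash parameter expansion ${f/linux/manylinux1}.
    for wheel in Path(dist_dir).glob("*.whl"):
        wheel.rename(wheel.with_name(wheel.name.replace("linux", "manylinux1", 1)))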
2 changes: 1 addition & 1 deletion .buildkite/run-xpu-test.sh
@@ -11,4 +11,4 @@ trap remove_docker_container EXIT
 remove_docker_container
 
 # Run the image and launch offline inference
-docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path --entrypoint="" xpu-test python3 examples/offline_inference.py
+docker run --network host --name xpu-test --device /dev/dri -v /dev/dri/by-path:/dev/dri/by-path xpu-test python3 examples/offline_inference.py
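The script relies on a shell trap so the container is removed whether or not the test passes. A hedged Python sketch of the same cleanup-run-cleanup pattern, reusing the container name and command from the script (an illustration only; CI runs the bash version):

import subprocess

def run_xpu_smoke_test() -> None:
    cleanup = ["docker", "rm", "-f", "xpu-test"]
    subprocess.run(cleanup, check=False)  # best effort: remove any stale container
    try:
        subprocess.run(
            ["docker", "run", "--network", "host", "--name", "xpu-test",
             "--device", "/dev/dri",
             "-v", "/dev/dri/by-path:/dev/dri/by-path",
             "xpu-test", "python3", "examples/offline_inference.py"],
            check=True)
    finally:
        subprocess.run(cleanup, check=False)  # mirrors the EXIT trap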
51 changes: 19 additions & 32 deletions .buildkite/test-pipeline.yaml
@@ -9,7 +9,6 @@
 # label(str): the name of the test. emoji allowed.
 # fast_check(bool): whether to run this on each commit on fastcheck pipeline.
 # fast_check_only(bool): run this test on fastcheck pipeline only
-# optional(bool): never run this test by default (i.e. need to unblock manually)
 # command(str): the single command to run for tests. incompatible with commands.
 # commands(list): the list of commands to run for test. incompatible with command.
 # mirror_hardwares(list): the list of hardwares to run the test on as well. currently only supports [amd]
@@ -40,7 +39,7 @@ steps:
   # Check API reference (if it fails, you may have missing mock imports)
   - grep \"sig sig-object py\" build/html/dev/sampling_params.html
 
-- label: Async Engine, Inputs, Utils, Worker Test # 24min
+- label: Async Engine, Inputs, Utils, Worker Test # 15min
   fast_check: true
   source_file_dependencies:
   - vllm/
@@ -82,7 +81,7 @@ steps:
   commands:
   - pytest -v -s core
 
-- label: Entrypoints Test # 40min
+- label: Entrypoints Test # 20min
   working_dir: "/vllm-workspace/tests"
   fast_check: true
   mirror_hardwares: [amd]
@@ -152,7 +151,7 @@ steps:
   # OOM in the CI unless we run this separately
   - pytest -v -s tokenization
 
-- label: Examples Test # 15min
+- label: Examples Test # 12min
   working_dir: "/vllm-workspace/examples"
   #mirror_hardwares: [amd]
   source_file_dependencies:
@@ -170,15 +169,15 @@ steps:
   - python3 tensorize_vllm_model.py --model facebook/opt-125m serialize --serialized-directory /tmp/ --suffix v1 && python3 tensorize_vllm_model.py --model facebook/opt-125m deserialize --path-to-tensors /tmp/vllm/facebook/opt-125m/v1/model.tensors
   - python3 offline_inference_encoder_decoder.py
 
-- label: Prefix Caching Test # 9min
+- label: Prefix Caching Test # 7min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/prefix_caching
   commands:
   - pytest -v -s prefix_caching
 
-- label: Samplers Test # 36min
+- label: Samplers Test # 18min
   source_file_dependencies:
   - vllm/model_executor/layers
   - vllm/sampling_metadata.py
@@ -194,38 +193,38 @@ steps:
   - tests/test_logits_processor
   command: pytest -v -s test_logits_processor.py
 
-- label: Speculative decoding tests # 30min
+- label: Speculative decoding tests # 22min
   source_file_dependencies:
   - vllm/spec_decode
   - tests/spec_decode
   commands:
   - pytest -v -s spec_decode/e2e/test_multistep_correctness.py
   - pytest -v -s spec_decode --ignore=spec_decode/e2e/test_multistep_correctness.py
 
-- label: LoRA Test %N # 15min each
+- label: LoRA Test %N # 30min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/lora
   - tests/lora
   command: pytest -v -s lora --shard-id=$$BUILDKITE_PARALLEL_JOB --num-shards=$$BUILDKITE_PARALLEL_JOB_COUNT --ignore=lora/test_long_context.py
   parallelism: 4
 
-- label: "PyTorch Fullgraph Smoke Test" # 9min
+- label: "PyTorch Fullgraph Smoke Test"
   fast_check: true
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph_smoke.py
 
-- label: "PyTorch Fullgraph Test" # 18min
+- label: "PyTorch Fullgraph Test"
   source_file_dependencies:
   - vllm/
   - tests/compile
   commands:
   - pytest -v -s compile/test_full_graph.py
 
-- label: Kernels Test %N # 1h each
+- label: Kernels Test %N # 30min each
   mirror_hardwares: [amd]
   source_file_dependencies:
   - csrc/
@@ -255,7 +254,7 @@ steps:
   - pip install aiohttp
   - bash run-benchmarks.sh
 
-- label: Quantization Test # 33min
+- label: Quantization Test # 15min
   source_file_dependencies:
   - csrc/
   - vllm/model_executor/layers/quantization
@@ -299,15 +298,15 @@ steps:
   - pytest -v -s models/test_oot_registration.py # it needs a clean process
   - pytest -v -s models/*.py --ignore=models/test_oot_registration.py
 
-- label: Decoder-only Language Models Test # 1h36min
+- label: Decoder-only Language Models Test # 1h3min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/decoder_only/language
   commands:
   - pytest -v -s models/decoder_only/language
 
-- label: Decoder-only Multi-Modal Models Test # 1h31min
+- label: Decoder-only Multi-Modal Models Test # 56min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
@@ -317,25 +316,15 @@ steps:
   - pytest -v -s models/decoder_only/audio_language
   - pytest -v -s models/decoder_only/vision_language
 
-- label: Other Models Test # 6min
+- label: Other Models Test # 5min
   #mirror_hardwares: [amd]
   source_file_dependencies:
   - vllm/
   - tests/models/embedding/language
   - tests/models/encoder_decoder/language
-  - tests/models/encoder_decoder/vision_language
   commands:
   - pytest -v -s models/embedding/language
   - pytest -v -s models/encoder_decoder/language
-  - pytest -v -s models/encoder_decoder/vision_language
-
-- label: Custom Models Test
-  #mirror_hardwares: [amd]
-  optional: true
-  commands:
-  # PR authors can temporarily add commands below to test individual models
-  # e.g. pytest -v -s models/encoder_decoder/vision_language/test_mllama.py
-  # *To avoid merge conflicts, remember to REMOVE (not just comment out) them before merging the PR*
 
 ##### 1 GPU test #####
 ##### multi gpus test #####
@@ -368,7 +357,7 @@ steps:
   - # the following commands are for the second node, with ip 192.168.10.11 (ray environment already set up)
   - VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep -q 'Same node test passed'
 
-- label: Distributed Tests (2 GPUs) # 40min
+- label: Distributed Tests (2 GPUs) # 28min
   #mirror_hardwares: [amd]
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
@@ -385,16 +374,14 @@ steps:
   - VLLM_TEST_SAME_HOST=1 torchrun --nproc-per-node=4 distributed/test_same_node.py | grep -q 'Same node test passed'
   - TARGET_TEST_SUITE=L4 pytest basic_correctness/ -v -s -m distributed_2_gpus
   # Avoid importing model tests that cause CUDA reinitialization error
-  - pytest models/encoder_decoder/language/test_bart.py -v -s -m distributed_2_gpus
-  - pytest models/encoder_decoder/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
-  - pytest models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
+  - pytest models/encoder_decoder/language/test_bart.py models/decoder_only/vision_language/test_broadcast.py -v -s -m distributed_2_gpus
   - pytest -v -s spec_decode/e2e/test_integration_dist_tp2.py
   - pip install -e ./plugins/vllm_add_dummy_model
   - pytest -v -s distributed/test_distributed_oot.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s test_sharded_state_loader.py
   - CUDA_VISIBLE_DEVICES=0,1 pytest -v -s distributed/test_utils.py
 
-- label: Multi-step Tests (4 GPUs) # 36min
+- label: Multi-step Tests (4 GPUs) # 21min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -412,7 +399,7 @@ steps:
   - pytest -v -s multi_step/test_correctness_async_llm.py
   - pytest -v -s multi_step/test_correctness_llm.py
 
-- label: Pipeline Parallelism Test # 45min
+- label: Pipeline Parallelism Test # 23min
   working_dir: "/vllm-workspace/tests"
   num_gpus: 4
   source_file_dependencies:
@@ -438,7 +425,7 @@ steps:
   - export VLLM_WORKER_MULTIPROC_METHOD=spawn
   - pytest -v -s -x lora/test_long_context.py
 
-- label: Weight Loading Multiple GPU Test # 33min
+- label: Weight Loading Multiple GPU Test
   working_dir: "/vllm-workspace/tests"
   num_gpus: 2
   source_file_dependencies:
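The keys documented at the top of this file (label, command vs. commands, source_file_dependencies, and so on) drive how pipeline steps are generated. As a rough illustration of the stated rule that command and commands are mutually exclusive, a validator could look like the following sketch (a hypothetical helper, not part of the repository):

from typing import Any, Dict, List

def validate_steps(steps: List[Dict[str, Any]]) -> None:
    for step in steps:
        label = step.get("label")
        if not label:
            raise ValueError("every step needs a label")
        # `command` (one string) and `commands` (a list) are mutually exclusive.
        if "command" in step and "commands" in step:
            raise ValueError(f"step {label!r}: use command or commands, not both")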
4 changes: 1 addition & 3 deletions .dockerignore
@@ -1,6 +1,4 @@
-/.github/
-/.venv
-/build
+vllm/*.so
 dist
 Dockerfile*
 vllm/*.so
1 change: 0 additions & 1 deletion .gitignore
@@ -33,7 +33,6 @@ share/python-wheels/
 .installed.cfg
 *.egg
 MANIFEST
-/.deps/
 
 # PyInstaller
 # Usually these files are written by a python script from a template
9 changes: 0 additions & 9 deletions CMakeLists.txt
@@ -166,16 +166,7 @@ if(NVCC_THREADS AND VLLM_GPU_LANG STREQUAL "CUDA")
   list(APPEND VLLM_GPU_FLAGS "--threads=${NVCC_THREADS}")
 endif()
 
 
-#
-# Use FetchContent for C++ dependencies that are compiled as part of vLLM's build process.
-# Configure it to place files in vllm/.deps, in order to play nicely with sccache.
-#
-include(FetchContent)
-get_filename_component(PROJECT_ROOT_DIR "${CMAKE_CURRENT_SOURCE_DIR}" ABSOLUTE)
-file(MAKE_DIRECTORY "${FETCHCONTENT_BASE_DIR}")
-set(FETCHCONTENT_BASE_DIR "${PROJECT_ROOT_DIR}/.deps")
-message(STATUS "FetchContent base directory: ${FETCHCONTENT_BASE_DIR}")
 
 #
 # Define other extension targets
47 changes: 9 additions & 38 deletions Dockerfile.xpu
@@ -1,4 +1,4 @@
-FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04 AS vllm-base
+FROM intel/oneapi-basekit:2024.2.1-0-devel-ubuntu22.04
 
 RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRODUCTS.PUB | gpg --dearmor | tee /usr/share/keyrings/intel-oneapi-archive-keyring.gpg > /dev/null && \
     echo "deb [signed-by=/usr/share/keyrings/intel-oneapi-archive-keyring.gpg] https://apt.repos.intel.com/oneapi all main " | tee /etc/apt/sources.list.d/oneAPI.list && \
@@ -7,49 +7,20 @@ RUN wget -O- https://apt.repos.intel.com/intel-gpg-keys/GPG-PUB-KEY-INTEL-SW-PRO
echo "deb [arch=amd64,i386 signed-by=/usr/share/keyrings/intel-graphics.gpg] https://repositories.intel.com/graphics/ubuntu jammy arc" | tee /etc/apt/sources.list.d/intel.gpu.jammy.list && \
chmod 644 /usr/share/keyrings/intel-graphics.gpg

RUN apt-get update -y && \
apt-get install -y --no-install-recommends --fix-missing \
curl \
ffmpeg \
git \
libsndfile1 \
libsm6 \
libxext6 \
libgl1 \
lsb-release \
numactl \
python3 \
python3-dev \
python3-pip \
# vim \
wget
RUN apt-get update -y && \
apt-get install -y curl libicu70 lsb-release git wget vim numactl python3 python3-pip ffmpeg libsm6 libxext6 libgl1

COPY ./ /workspace/vllm

WORKDIR /workspace/vllm
COPY requirements-xpu.txt /workspace/vllm/requirements-xpu.txt
COPY requirements-common.txt /workspace/vllm/requirements-common.txt

RUN --mount=type=cache,target=/root/.cache/pip \
pip install --no-cache-dir \
--extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
-r requirements-xpu.txt

COPY ./ /workspace/vllm

ENV VLLM_TARGET_DEVICE=xpu
pip install -v --extra-index-url https://pytorch-extension.intel.com/release-whl/stable/xpu/us/ \
cmake>=3.26 ninja packaging setuptools-scm>=8 wheel jinja2 \
-r requirements-xpu.txt

RUN --mount=type=cache,target=/root/.cache/pip \
--mount=type=bind,source=.git,target=.git \
python3 setup.py install
VLLM_TARGET_DEVICE=xpu python3 setup.py install

CMD ["/bin/bash"]

FROM vllm-base AS vllm-openai

# install additional dependencies for openai api server
RUN --mount=type=cache,target=/root/.cache/pip \
pip install accelerate hf_transfer 'modelscope!=1.15.0'

ENV VLLM_USAGE_SOURCE production-docker-image \
TRITON_XPU_PROFILE 1

ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
13 changes: 7 additions & 6 deletions benchmarks/benchmark_serving.py
@@ -1,4 +1,4 @@
r"""Benchmark online serving throughput.
"""Benchmark online serving throughput.
On the server side, run one of the following commands:
vLLM OpenAI API server
@@ -89,6 +89,8 @@ def sample_sharegpt_requests(
     tokenizer: PreTrainedTokenizerBase,
     fixed_output_len: Optional[int] = None,
 ) -> List[Tuple[str, int, int, None]]:
+    if fixed_output_len is not None and fixed_output_len < 4:
+        raise ValueError("output_len too small")
     # Load the dataset.
     with open(dataset_path) as f:
         dataset = json.load(f)
@@ -115,7 +117,7 @@ def sample_sharegpt_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if prompt_len < 4 or (fixed_output_len is None and output_len < 4):
+        if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
         if prompt_len > 1024 or prompt_len + output_len > 2048:
@@ -226,11 +228,10 @@ def sample_hf_requests(
         prompt_len = len(prompt_token_ids)
         output_len = len(completion_token_ids
                          ) if fixed_output_len is None else fixed_output_len
-        if fixed_output_len is None and (prompt_len < 4 or output_len < 4):
+        if prompt_len < 4 or output_len < 4:
             # Prune too short sequences.
             continue
-        if fixed_output_len is None and \
-                (prompt_len > 1024 or prompt_len + output_len > 2048):
+        if prompt_len > 1024 or prompt_len + output_len > 2048:
             # Prune too long sequences.
             continue
 
@@ -962,4 +963,4 @@ def main(args: argparse.Namespace):
     )
 
     args = parser.parse_args()
-    main(args)
+    main(args)
\ No newline at end of file
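After the revert, the benchmark prunes on prompt and output length even when a fixed output length is supplied, and rejects fixed output lengths below 4 outright. A self-contained sketch of the restored filter, with the thresholds copied from the diff above:

from typing import Optional

def keep_request(prompt_len: int, output_len: int,
                 fixed_output_len: Optional[int] = None) -> bool:
    if fixed_output_len is not None and fixed_output_len < 4:
        raise ValueError("output_len too small")
    if fixed_output_len is not None:
        output_len = fixed_output_len
    if prompt_len < 4 or output_len < 4:
        return False  # prune too-short sequences
    if prompt_len > 1024 or prompt_len + output_len > 2048:
        return False  # prune too-long sequences
    return True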