[Model]: Add `transformers` backend support (vllm-project#11330)

Adds support for `transformers` as a backend. Following huggingface/transformers#35235, a number of models are already supported, and support for more models is being ramped up. Thanks @Isotr0py for the TP support, and @hmellor for his help as well!

This includes:
- `trust_remote_code=True` support: any model on the Hub can be natively supported, provided it implements attention the correct way.
- tensor parallel support

Signed-off-by: Harry Mellor <[email protected]>
Signed-off-by: Isotr0py <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Harry Mellor <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Co-authored-by: Cyrus Leung <[email protected]>
Co-authored-by: Michael Goin <[email protected]>
Co-authored-by: Isotr0py <[email protected]>
Commit a1a2aaa (parent 1298a40). 11 changed files with 528 additions and 9 deletions.
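For context, a minimal sketch of selecting the new backend from the offline API. This assumes `vllm.LLM` accepts the same `model_impl` argument that the test file below passes to `vllm_runner`; the exact public surface may differ from this diff:

```python
# Sketch: explicitly select the Transformers backend via model_impl.
# Assumes vllm.LLM forwards model_impl, mirroring the test kwargs below.
from vllm import LLM, SamplingParams

llm = LLM(
    model="meta-llama/Llama-3.2-1B-Instruct",
    model_impl="transformers",  # "auto" tries the vLLM-native model first
)
outputs = llm.generate(["Hello, my name is"],
                       SamplingParams(temperature=0.0, max_tokens=32))
print(outputs[0].outputs[0].text)
```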
`tests/models/test_transformers.py` (new file, +75 lines):

```python
"""Test the functionality of the Transformers backend.
Run `pytest tests/models/test_transformers.py`.
"""
from contextlib import nullcontext
from typing import Type

import pytest

from ..conftest import HfRunner, VllmRunner
from ..utils import multi_gpu_test
from .utils import check_logprobs_close


def check_implementation(
    hf_runner: Type[HfRunner],
    vllm_runner: Type[VllmRunner],
    example_prompts: list[str],
    model: str,
    **kwargs,
):
    max_tokens = 32
    num_logprobs = 5

    with vllm_runner(model, **kwargs) as vllm_model:
        vllm_outputs = vllm_model.generate_greedy_logprobs(
            example_prompts, max_tokens, num_logprobs)

    with hf_runner(model) as hf_model:
        hf_outputs = hf_model.generate_greedy_logprobs_limit(
            example_prompts, max_tokens, num_logprobs)

    check_logprobs_close(
        outputs_0_lst=hf_outputs,
        outputs_1_lst=vllm_outputs,
        name_0="hf",
        name_1="vllm",
    )


@pytest.mark.parametrize(
    "model,model_impl",
    [
        ("meta-llama/Llama-3.2-1B-Instruct", "transformers"),
        ("openai-community/gpt2", "transformers"),
        ("ArthurZ/Ilama-3.2-1B", "auto"),  # CUSTOM CODE
        ("meta-llama/Llama-3.2-1B-Instruct", "auto"),
    ])  # trust_remote_code=True by default
def test_models(hf_runner, vllm_runner, example_prompts, model,
                model_impl) -> None:

    maybe_raises = nullcontext()
    if model == "openai-community/gpt2" and model_impl == "transformers":
        # Model is not backend compatible
        maybe_raises = pytest.raises(
            ValueError,
            match="The Transformers implementation.*not compatible with vLLM")

    with maybe_raises:
        check_implementation(hf_runner,
                             vllm_runner,
                             example_prompts,
                             model,
                             model_impl=model_impl)


@multi_gpu_test(num_gpus=2)
def test_distributed(
    hf_runner,
    vllm_runner,
    example_prompts,
):
    kwargs = {"model_impl": "transformers", "tensor_parallel_size": 2}
    check_implementation(hf_runner, vllm_runner, example_prompts,
                         "meta-llama/Llama-3.2-1B-Instruct", **kwargs)
```