diff --git a/tests/models/encoder_decoder/vision_language/test_mllama.py b/tests/models/encoder_decoder/vision_language/test_mllama.py
index de31a5798863c..557605a7ac747 100644
--- a/tests/models/encoder_decoder/vision_language/test_mllama.py
+++ b/tests/models/encoder_decoder/vision_language/test_mllama.py
@@ -26,7 +26,7 @@
 ]
 
 models = [
-    "meta-llama/Llama-3.2-11B-Vision-Instruct",
+    "/home/xiangxu_google_com/data/meta-llama/Llama-3.2-11B-Vision",
 ]
 
 
@@ -317,15 +317,13 @@ def test_models_interleaved_images(hf_runner, vllm_runner, image_assets, model,
 
     inputs = [(
         [
-            "<|begin_of_text|>The meaning of the image <|image|> is",  # noqa: E501
-            "<|begin_of_text|>Is this <|image|> a stop sign and is this <|image|> a cherry blossom?",  # noqa: E501
+            "<|begin_of_text|>The content of the image <|image|> is",  # noqa: E501
+            "<|begin_of_text|>Between the first image <|image|> and the second image<|image|>, "
+            "which is a stop sign and which is a cherry blossom?",  # noqa: E501
         ],
         [
-            [stop_sign.resize((1536, 512))],
-            [
-                stop_sign.resize((1024, 512)),
-                cherry_blossom.resize((512, 2028)),
-            ],
+            [stop_sign],
+            [stop_sign, cherry_blossom],
         ])]
 
     _run_test(