From b21e4985f2820b4b4801b6ebf69ff42033f19a50 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 07:03:24 +0000
Subject: [PATCH 1/6] Fix PaliGemma half-precision inference and None prompt handling

---
 tests/models/test_paligemma.py          | 2 +-
 vllm/model_executor/models/paligemma.py | 6 ++++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py
index 2b1d3c5b43b44..3ba50294d419a 100644
--- a/tests/models/test_paligemma.py
+++ b/tests/models/test_paligemma.py
@@ -129,7 +129,7 @@ def run_test(
         [0.25, 0.5, 1.0],
     ],
 )
-@pytest.mark.parametrize("dtype", ["float"])
+@pytest.mark.parametrize("dtype", ["half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 2af2bedd8e48e..89010cf6bcba3 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -111,7 +111,7 @@ def input_processor_for_paligemma(ctx: InputContext, llm_inputs: LLMInputs):
     orig_prompt = llm_inputs.get("prompt")
     orig_prompt_ids = llm_inputs.get("prompt_token_ids")
 
-    if image_token_str in orig_prompt:
+    if orig_prompt is not None and image_token_str in orig_prompt:
         logger.warning(
             "The image token '%s' was detected in the prompt and "
             "will be removed. Please follow the proper prompt format"
@@ -214,7 +214,9 @@ def _parse_and_validate_image_input(
     def _image_pixels_to_features(self, vision_tower: SiglipVisionModel,
                                   pixel_values: torch.Tensor) -> torch.Tensor:
 
-        image_outputs = vision_tower(pixel_values, output_hidden_states=True)
+        target_dtype = vision_tower.get_input_embeddings().weight.dtype
+        image_outputs = vision_tower(pixel_values.to(
+            dtype=target_dtype), output_hidden_states=True)
 
         selected_image_features = image_outputs.last_hidden_state
 

From 2b3e7cd34c18de264c06bb8e81236dc93ff4da60 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 07:06:09 +0000
Subject: [PATCH 2/6] yapf

---
 vllm/model_executor/models/paligemma.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 89010cf6bcba3..affeba81d9c3e 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -215,8 +215,8 @@ def _image_pixels_to_features(self, vision_tower: SiglipVisionModel,
                                   pixel_values: torch.Tensor) -> torch.Tensor:
 
         target_dtype = vision_tower.get_input_embeddings().weight.dtype
-        image_outputs = vision_tower(pixel_values.to(
-            dtype=target_dtype), output_hidden_states=True)
+        image_outputs = vision_tower(pixel_values.to(dtype=target_dtype),
+                                     output_hidden_states=True)
 
         selected_image_features = image_outputs.last_hidden_state
 

From 7296d9c9faafa2fbcf2113df0df3f0c8681966b5 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 07:15:06 +0000
Subject: [PATCH 3/6] Test both float and half dtypes

---
 tests/models/test_paligemma.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/models/test_paligemma.py b/tests/models/test_paligemma.py
index 3ba50294d419a..b0e7264e89118 100644
--- a/tests/models/test_paligemma.py
+++ b/tests/models/test_paligemma.py
@@ -129,7 +129,7 @@ def run_test(
         [0.25, 0.5, 1.0],
     ],
 )
-@pytest.mark.parametrize("dtype", ["half"])
+@pytest.mark.parametrize("dtype", ["float", "half"])
 @pytest.mark.parametrize("max_tokens", [128])
 @pytest.mark.parametrize("num_logprobs", [5])
 def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,

From f577b89be7fbafd8808f13974c9c6ff68f599551 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 07:59:44 +0000
Subject: [PATCH 4/6] Fix missing intermediate_tensors arg in PaliGemma forward

---
 vllm/model_executor/models/paligemma.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index affeba81d9c3e..561769954dec0 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -19,7 +19,7 @@
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
 from vllm.multimodal.image import cached_get_tokenizer
-from vllm.sequence import SamplerOutput, SequenceData
+from vllm.sequence import IntermediateTensors, SamplerOutput, SequenceData
 
 from .interfaces import SupportsVision
 from .utils import merge_vision_embeddings
@@ -241,6 +241,7 @@ def _process_image_input(
     def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                 kv_caches: List[torch.Tensor],
                 attn_metadata: AttentionMetadata,
+                intermediate_tensors: Optional[IntermediateTensors] = None,
                 **kwargs: object) -> SamplerOutput:
 
         parsed_image_input = self._parse_and_validate_image_input(**kwargs)
@@ -265,6 +266,7 @@ def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
                                             positions,
                                             kv_caches,
                                             attn_metadata,
+                                            None,
                                             inputs_embeds=inputs_embeds)
 
         return hidden_states

From 3626a39029dfb8db92cbfa4b0a572dfd5ac49457 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 08:08:18 +0000
Subject: [PATCH 5/6] Accept intermediate_tensors arg in Gemma forward

---
 vllm/model_executor/models/gemma.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/vllm/model_executor/models/gemma.py b/vllm/model_executor/models/gemma.py
index 16548c6c1e8c7..7e0888b5f5abd 100644
--- a/vllm/model_executor/models/gemma.py
+++ b/vllm/model_executor/models/gemma.py
@@ -277,6 +277,7 @@ def forward(
         positions: torch.Tensor,
         kv_caches: List[torch.Tensor],
         attn_metadata: AttentionMetadata,
+        intermediate_tensors: Optional[IntermediateTensors] = None,
         inputs_embeds: Optional[torch.Tensor] = None,
     ) -> torch.Tensor:
         if inputs_embeds is not None:

From c73887ef26bff04e8cea9863025dc775c1372da0 Mon Sep 17 00:00:00 2001
From: DarkLight1337 <tlleungac@connect.ust.hk>
Date: Fri, 12 Jul 2024 08:08:39 +0000
Subject: [PATCH 6/6] yapf

---
 vllm/model_executor/models/paligemma.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/vllm/model_executor/models/paligemma.py b/vllm/model_executor/models/paligemma.py
index 561769954dec0..8a2bacbd96b67 100644
--- a/vllm/model_executor/models/paligemma.py
+++ b/vllm/model_executor/models/paligemma.py
@@ -238,7 +238,9 @@ def _process_image_input(
 
         return self.multi_modal_projector(image_features)
 
-    def forward(self, input_ids: torch.Tensor, positions: torch.Tensor,
+    def forward(self,
+                input_ids: torch.Tensor,
+                positions: torch.Tensor,
                 kv_caches: List[torch.Tensor],
                 attn_metadata: AttentionMetadata,
                 intermediate_tensors: Optional[IntermediateTensors] = None,