From f565d34f2693d0f7d35be9b2e1aa7daadb751dc3 Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:04:28 +0800 Subject: [PATCH 1/7] enable mrope model --- vllm/multimodal/utils.py | 10 ++++--- vllm/worker/xpu_model_runner.py | 47 ++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3c801464383ad..bb919c6c6b708 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -59,9 +59,13 @@ async def async_fetch_image(image_url: str, By default, the image is converted into RGB format. """ if image_url.startswith('http'): - image_raw = await global_http_connection.async_get_bytes( - image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) - image = _load_image_from_bytes(image_raw) + try: + import requests + image = Image.open(requests.get(image_url, stream=True).raw) + except: + image_raw = await global_http_connection.async_get_bytes( + image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) + image = _load_image_from_bytes(image_raw) elif image_url.startswith('data:image'): image = _load_image_from_data_url(image_url) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index f9037625d4af9..89b8fe952918a 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -23,6 +23,7 @@ from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -121,6 +122,8 @@ def __init__(self, self.sliding_window = self.runner.sliding_window self.block_size = self.runner.block_size self.device = self.runner.device + # Multi-modal data support + self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -178,6 +181,40 @@ def _prepare_prompt( # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. 
input_positions.extend(list(range(computed_len, seq_len))) + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + mm_kwargs = self.multi_modal_input_mapper(mm_data) + multi_modal_inputs_list.append(mm_kwargs) + if self.runner.model_is_mrope and mm_data: + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + + hf_config = self.runner.model_config.hf_config + token_ids = seq_data.get_token_ids() + temp_mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config.spatial_merge_size, + context_len=0, + ) + seq_data.mrope_position_delta = mrope_position_delta + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + # msections = temp_mrope_input_positions + # for _seq_mrope_input_positions in msections: + mrope_input_positions[idx].extend( + temp_mrope_input_positions[idx]) + input_positions = mrope_input_positions + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -240,7 +277,6 @@ def _prepare_prompt( ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) - return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -410,6 +446,15 @@ def load_model(self) -> None: def vocab_size(self) -> int: return self.model_config.get_vocab_size() + @property + def model_is_mrope(self) -> bool: + """Detect if the model has "mrope" rope_scaling type. + mrope requires keep "rope_deltas" between prompt and decoding phases.""" + rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + if rope_scaling is None: + return False + return rope_scaling.get("type", None) == "mrope" + @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. 
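
Note (illustrative sketch, not part of the patch): patch 1 wires M-RoPE position handling into the XPU model runner. The new model_is_mrope property only inspects hf_config.rope_scaling, and when it reports the "mrope" type the runner builds three position streams (temporal, height, width) via MRotaryEmbedding.get_input_positions instead of the flat range(computed_len, seq_len). A minimal, self-contained sketch of that detection logic follows; the config dicts are assumed, illustrative values, not taken from the patch.

def is_mrope(rope_scaling):
    # hf_config.rope_scaling can be None (no scaling) or a plain dict,
    # mirroring the check added to xpu_model_runner.py in this patch.
    if not rope_scaling:
        return False
    return rope_scaling.get("type") == "mrope"

# Illustrative values only; a Qwen2-VL-style config reports type "mrope".
print(is_mrope({"type": "mrope", "mrope_section": [16, 24, 24]}))  # True
print(is_mrope({"type": "linear", "factor": 2.0}))                 # False
print(is_mrope(None))                                              # False
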
From 3c22db17021b75741ec2c0d1b22126815a02404d Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:05:08 +0800 Subject: [PATCH 2/7] update minicpm --- vllm/model_executor/models/minicpmv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f8be9490ee55d..ac3f9f143c6b8 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -361,7 +361,7 @@ def __init__( self.vpm.embeddings.embed_dim) self.embed_dim = self.config.hidden_size self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) - self.resampler.to(device="cuda", dtype=param_dtype) + #self.resampler.to(device="cuda", dtype=param_dtype) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -746,7 +746,7 @@ def get_vision_hidden_states(self, patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True return self.get_vision_embedding(all_pixel_values.type(dtype), - patch_attn_mask, tgt_sizes) + patch_attn_mask.to(device), tgt_sizes.to(device)) def is_default_weight_loading(self, name: str) -> bool: return "resampler" in name @@ -839,9 +839,9 @@ def get_vision_hidden_states(self, for i in range(B): patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True vision_embedding = self.vpm( - all_pixel_values.type(dtype), + all_pixel_values.type(dtype).to(device), patch_attention_mask=patch_attn_mask, - tgt_sizes=tgt_sizes, + tgt_sizes=tgt_sizes.to(device), ).last_hidden_state return self.resampler(vision_embedding, tgt_sizes) From 97d89f9d240c6809ba0aa239390e3bab1dffc6b3 Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:05:17 +0800 Subject: [PATCH 3/7] update utils --- vllm/entrypoints/chat_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f1ce2c36fcceb..a545877c4c47c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -350,7 +350,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], # NOTE: For now we always add missing placeholders at the front of # the prompt. This may change to be customizable in the future. - return "\n".join(missing_placeholders + [text_prompt]) + #return "\n".join(missing_placeholders + [text_prompt]) + return "".join(missing_placeholders + [text_prompt]) # No need to validate using Pydantic again @@ -398,7 +399,6 @@ def _parse_chat_message_content_parts( if mm_placeholder_counts: text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_counts, text_prompt) - return [ConversationMessage(role=role, content=text_prompt)] From f96541b1ee612ba1c4c14276b55ce4210faecb3c Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:09:44 +0800 Subject: [PATCH 4/7] update qwen2_vl --- vllm/model_executor/models/qwen2_vl.py | 145 +++++++++++++------------ 1 file changed, 75 insertions(+), 70 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 179399a12a3d5..50db215847cf5 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -200,37 +200,37 @@ def __init__( quant_config=quant_config) # Detect attention implementation. 
- selected_backend: Optional[_Backend] = get_global_forced_attn_backend() - if selected_backend is None: - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.get_device_capability()[0] >= 8 - if device_available: - from transformers.utils import is_flash_attn_2_available - - if is_flash_attn_2_available(): - self._use_flash_attn = True - else: - logger.warning( - "Current Qwen2-VL implementation has a bug with " - "`vllm-flash-attn` inside vision module, so we use " - "xformers backend instead. You can run `pip install " - "flash-attn to use flash-attention backend.") - self._use_flash_attn = False - else: - self._use_flash_attn = False - else: - if selected_backend == _Backend.FLASH_ATTN: - self._use_flash_attn = True - elif selected_backend == _Backend.XFORMERS: - self._use_flash_attn = False - else: - raise RuntimeError( - f"Qwen2-VL does not support {selected_backend} backend now." - ) + # selected_backend: Optional[_Backend] = get_global_forced_attn_backend() + # if selected_backend is None: + # backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + # if backend_by_env_var is not None: + # selected_backend = backend_name_to_enum(backend_by_env_var) + # if selected_backend is None: + # # For Volta and Turing GPUs, use xformers instead. + # device_available = current_platform.get_device_capability()[0] >= 8 + # if device_available: + # from transformers.utils import is_flash_attn_2_available + + # if is_flash_attn_2_available(): + # self._use_flash_attn = True + # else: + # logger.warning( + # "Current Qwen2-VL implementation has a bug with " + # "`vllm-flash-attn` inside vision module, so we use " + # "xformers backend instead. You can run `pip install " + # "flash-attn to use flash-attention backend.") + # self._use_flash_attn = False + # else: + # self._use_flash_attn = False + # else: + # if selected_backend == _Backend.FLASH_ATTN: + # self._use_flash_attn = True + # elif selected_backend == _Backend.XFORMERS: + # self._use_flash_attn = False + # else: + # raise RuntimeError( + # f"Qwen2-VL does not support {selected_backend} backend now." + # ) def forward( self, @@ -258,42 +258,49 @@ def forward( if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - - if self._use_flash_attn: - # from vllm_flash_attn.flash_attn_interface import ( - # flash_attn_varlen_func) - from flash_attn import flash_attn_varlen_func - - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] - - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - output = flash_attn_varlen_func(q, - k, - v, - cu_seqlens_q=cu_seqlens, - cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen, - max_seqlen_k=max_seqlen, - dropout_p=0, - causal=False) - - context_layer = rearrange(output, - "(b s) ... 
-> b s ...", - b=batch_size) - else: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, - kv_seqlen=None) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() - - output, _ = self.proj(context_layer) + query = q.movedim(1, 2) + key = k.movedim(1, 2) + value = v.movedim(1, 2) + + seq_lens = [] + for i in range(1, len(cu_seqlens)): + seq_lens.append(cu_seqlens[i]-cu_seqlens[i-1]) + att_masks = [None] * len(seq_lens) + + num_tokens = q.shape[0] * q.shape[1] + output = torch.empty( + (num_tokens, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), + dtype=query.dtype, device=query.device) + start = 0 + from vllm.attention.backends.ipex_attn import use_sdp_causal + for seq_len, mask in zip(seq_lens, + att_masks): + end = start + seq_len + if use_sdp_causal(self.hidden_size_per_attention_head, query) and False: + import xe_addons + sub_out = xe_addons.sdp_causal( + query[:, :, start:end, :].contiguous(), + key[:, :, start:end, :].contiguous(), + value[:, :, start:end, :].contiguous(), + mask) + print(f"sub_out: {sub_out.shape}") + sub_out = sub_out.squeeze(0).movedim( + 0, 1) + else: + sub_out = torch.nn.functional.scaled_dot_product_attention( + query[:, :, start:end, :], + key[:, :, start:end, :], + value[:, :, start:end, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=False, + scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( + 0, 1) + output[start:end, :, :] = sub_out + start = end + output = output.view(-1, batch_size, self.hidden_size_per_attention_head * self.num_attention_heads_per_partition) + + output, _ = self.proj(output) return output @@ -518,9 +525,7 @@ def forward( grid_thw: torch.Tensor, ) -> torch.Tensor: # patchify - x = x.to(device=self.device, dtype=self.dtype) x = self.patch_embed(x) - # compute position embedding rotary_pos_emb = self.rot_pos_emb(grid_thw) @@ -926,7 +931,7 @@ def _parse_and_validate_video_input( def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].to(torch.float16) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds From d0a03a9e6601ef2457690f36b11295b8928cf523 Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:28:13 +0800 Subject: [PATCH 5/7] update --- vllm/model_executor/models/minicpmv.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 17dc3bfc6d38d..b0751f2d800b8 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -745,13 +745,8 @@ def get_vision_hidden_states(self, for i in range(B): patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True -<<<<<<< HEAD - return self.get_vision_embedding(all_pixel_values.type(dtype), - patch_attn_mask.to(device), tgt_sizes.to(device)) -======= return self.get_vision_embedding(all_pixel_values.type(dtype).to(device), patch_attn_mask, tgt_sizes.to(device)) ->>>>>>> upstream/061_test_0924 def is_default_weight_loading(self, name: str) -> bool: return "resampler" in name From e34a17b6a8416fbfb0b39d0930c2943303423425 Mon Sep 17 
00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 15:19:00 +0800 Subject: [PATCH 6/7] update --- vllm/model_executor/models/qwen2_vl.py | 30 ++++++++------------------ 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 50db215847cf5..8f7ebede86e33 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -272,30 +272,18 @@ def forward( (num_tokens, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), dtype=query.dtype, device=query.device) start = 0 - from vllm.attention.backends.ipex_attn import use_sdp_causal for seq_len, mask in zip(seq_lens, att_masks): end = start + seq_len - if use_sdp_causal(self.hidden_size_per_attention_head, query) and False: - import xe_addons - sub_out = xe_addons.sdp_causal( - query[:, :, start:end, :].contiguous(), - key[:, :, start:end, :].contiguous(), - value[:, :, start:end, :].contiguous(), - mask) - print(f"sub_out: {sub_out.shape}") - sub_out = sub_out.squeeze(0).movedim( - 0, 1) - else: - sub_out = torch.nn.functional.scaled_dot_product_attention( - query[:, :, start:end, :], - key[:, :, start:end, :], - value[:, :, start:end, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=False, - scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( - 0, 1) + sub_out = torch.nn.functional.scaled_dot_product_attention( + query[:, :, start:end, :], + key[:, :, start:end, :], + value[:, :, start:end, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=False, + scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( + 0, 1) output[start:end, :, :] = sub_out start = end output = output.view(-1, batch_size, self.hidden_size_per_attention_head * self.num_attention_heads_per_partition) From 559d9f25a58487e90878e5b1e3ba20350595d35a Mon Sep 17 00:00:00 2001 From: hzjane Date: Thu, 10 Oct 2024 17:07:56 +0800 Subject: [PATCH 7/7] enable parallel multimodal input --- vllm/worker/xpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 8d94f4432152d..0c96297407148 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -63,6 +63,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) @@ -91,6 +92,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict,
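
Note (illustrative sketch, not part of the patch series): patches 4 and 6 replace the flash-attn / xformers paths in the Qwen2-VL vision attention with plain torch.nn.functional.scaled_dot_product_attention applied independently to each cu_seqlens segment, which is equivalent to block-diagonal (per-image) attention. The toy, self-contained loop below shows that idea with assumed shapes (batch size 1, 2 heads, head_dim 8); names and sizes are made up for illustration.

import torch

heads, head_dim = 2, 8
cu_seqlens = torch.tensor([0, 4, 10])        # two images: 4 and 6 patch tokens
total = int(cu_seqlens[-1])
# [1, heads, total_tokens, head_dim], analogous to q.movedim(1, 2) in the patch
q = torch.randn(1, heads, total, head_dim)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = torch.empty(total, heads, head_dim)
start = 0
for end in cu_seqlens[1:].tolist():
    # Attend only within one image's patch tokens; no explicit mask is needed
    # because each segment is processed in isolation.
    sub = torch.nn.functional.scaled_dot_product_attention(
        q[:, :, start:end, :], k[:, :, start:end, :], v[:, :, start:end, :],
        dropout_p=0.0, is_causal=False, scale=head_dim ** -0.5)
    # [1, heads, seq, head_dim] -> [seq, heads, head_dim]
    out[start:end] = sub.squeeze(0).movedim(0, 1)
    start = end
print(out.shape)  # torch.Size([10, 2, 8])
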