From f565d34f2693d0f7d35be9b2e1aa7daadb751dc3 Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:04:28 +0800 Subject: [PATCH 1/7] enable mrope model --- vllm/multimodal/utils.py | 10 ++++--- vllm/worker/xpu_model_runner.py | 47 ++++++++++++++++++++++++++++++++- 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py index 3c801464383ad..bb919c6c6b708 100644 --- a/vllm/multimodal/utils.py +++ b/vllm/multimodal/utils.py @@ -59,9 +59,13 @@ async def async_fetch_image(image_url: str, By default, the image is converted into RGB format. """ if image_url.startswith('http'): - image_raw = await global_http_connection.async_get_bytes( - image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) - image = _load_image_from_bytes(image_raw) + try: + import requests + image = Image.open(requests.get(image_url, stream=True).raw) + except: + image_raw = await global_http_connection.async_get_bytes( + image_url, timeout=VLLM_IMAGE_FETCH_TIMEOUT) + image = _load_image_from_bytes(image_raw) elif image_url.startswith('data:image'): image = _load_image_from_data_url(image_url) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index f9037625d4af9..89b8fe952918a 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -23,6 +23,7 @@ from vllm.sequence import IntermediateTensors, SequenceGroupMetadata from vllm.utils import CudaMemoryProfiler, make_tensor_with_pad from vllm.worker.model_runner import AttentionMetadata, SamplingMetadata +from vllm.model_executor.layers.rotary_embedding import MRotaryEmbedding from vllm.worker.model_runner_base import ( ModelRunnerBase, ModelRunnerInputBase, ModelRunnerInputBuilderBase, _add_attn_metadata_broadcastable_dict, @@ -121,6 +122,8 @@ def __init__(self, self.sliding_window = self.runner.sliding_window self.block_size = self.runner.block_size self.device = self.runner.device + # Multi-modal data support + self.multi_modal_input_mapper = self.runner.multi_modal_input_mapper def add_seq_group(self, seq_group_metadata: SequenceGroupMetadata): self.seq_group_metadata_list.append(seq_group_metadata) @@ -178,6 +181,40 @@ def _prepare_prompt( # NOTE(woosuk): Here we assume that the first token in the prompt # is always the first token in the sequence. 
input_positions.extend(list(range(computed_len, seq_len))) + mm_data = seq_group_metadata.multi_modal_data + if mm_data: + mm_kwargs = self.multi_modal_input_mapper(mm_data) + multi_modal_inputs_list.append(mm_kwargs) + if self.runner.model_is_mrope and mm_data: + image_grid_thw = mm_kwargs.get("image_grid_thw", None) + video_grid_thw = mm_kwargs.get("video_grid_thw", None) + assert image_grid_thw is not None or video_grid_thw is not None, ( + "mrope embedding type requires multi-modal input mapper " + "returns 'image_grid_thw' or 'video_grid_thw'.") + + hf_config = self.runner.model_config.hf_config + token_ids = seq_data.get_token_ids() + temp_mrope_input_positions, mrope_position_delta = \ + MRotaryEmbedding.get_input_positions( + token_ids, + image_grid_thw=image_grid_thw, + video_grid_thw=video_grid_thw, + image_token_id=hf_config.image_token_id, + video_token_id=hf_config.video_token_id, + vision_start_token_id=hf_config.vision_start_token_id, + vision_end_token_id=hf_config.vision_end_token_id, + spatial_merge_size=hf_config.vision_config.spatial_merge_size, + context_len=0, + ) + seq_data.mrope_position_delta = mrope_position_delta + mrope_input_positions = [[] for _ in range(3)] + for idx in range(3): + # msections = temp_mrope_input_positions + # for _seq_mrope_input_positions in msections: + mrope_input_positions[idx].extend( + temp_mrope_input_positions[idx]) + input_positions = mrope_input_positions + if seq_group_metadata.block_tables is None: # During memory profiling, the block tables are not initialized @@ -240,7 +277,6 @@ def _prepare_prompt( ) multi_modal_kwargs = MultiModalInputs.batch(multi_modal_inputs_list) - return (input_tokens, input_positions, attn_metadata, seq_lens, multi_modal_kwargs) @@ -410,6 +446,15 @@ def load_model(self) -> None: def vocab_size(self) -> int: return self.model_config.get_vocab_size() + @property + def model_is_mrope(self) -> bool: + """Detect if the model has "mrope" rope_scaling type. + mrope requires keep "rope_deltas" between prompt and decoding phases.""" + rope_scaling = getattr(self.model_config.hf_config, "rope_scaling", {}) + if rope_scaling is None: + return False + return rope_scaling.get("type", None) == "mrope" + @torch.inference_mode() def profile_run(self) -> None: # Enable top-k sampling to reflect the accurate memory usage. 
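
Note (illustrative sketch, not part of the patch): patch 1 wires M-RoPE position handling into the XPU model runner. The new model_is_mrope property only inspects hf_config.rope_scaling, and when it reports the "mrope" type the runner builds three position streams (temporal, height, width) via MRotaryEmbedding.get_input_positions instead of the flat range(computed_len, seq_len). A minimal, self-contained sketch of that detection logic follows; the config dicts are assumed, illustrative values, not taken from the patch.

def is_mrope(rope_scaling):
    # hf_config.rope_scaling can be None (no scaling) or a plain dict,
    # mirroring the check added to xpu_model_runner.py in this patch.
    if not rope_scaling:
        return False
    return rope_scaling.get("type") == "mrope"

# Illustrative values only; a Qwen2-VL-style config reports type "mrope".
print(is_mrope({"type": "mrope", "mrope_section": [16, 24, 24]}))  # True
print(is_mrope({"type": "linear", "factor": 2.0}))                 # False
print(is_mrope(None))                                              # False
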
From 3c22db17021b75741ec2c0d1b22126815a02404d Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:05:08 +0800 Subject: [PATCH 2/7] update minicpm --- vllm/model_executor/models/minicpmv.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index f8be9490ee55d..ac3f9f143c6b8 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -361,7 +361,7 @@ def __init__( self.vpm.embeddings.embed_dim) self.embed_dim = self.config.hidden_size self.resampler = self.init_resampler(self.embed_dim, self.vision_dim) - self.resampler.to(device="cuda", dtype=param_dtype) + #self.resampler.to(device="cuda", dtype=param_dtype) self.lm_head = ParallelLMHead(config.vocab_size, config.hidden_size, quant_config=quant_config) @@ -746,7 +746,7 @@ def get_vision_hidden_states(self, patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True return self.get_vision_embedding(all_pixel_values.type(dtype), - patch_attn_mask, tgt_sizes) + patch_attn_mask.to(device), tgt_sizes.to(device)) def is_default_weight_loading(self, name: str) -> bool: return "resampler" in name @@ -839,9 +839,9 @@ def get_vision_hidden_states(self, for i in range(B): patch_attn_mask[i, 0, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True vision_embedding = self.vpm( - all_pixel_values.type(dtype), + all_pixel_values.type(dtype).to(device), patch_attention_mask=patch_attn_mask, - tgt_sizes=tgt_sizes, + tgt_sizes=tgt_sizes.to(device), ).last_hidden_state return self.resampler(vision_embedding, tgt_sizes) From 97d89f9d240c6809ba0aa239390e3bab1dffc6b3 Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:05:17 +0800 Subject: [PATCH 3/7] update utils --- vllm/entrypoints/chat_utils.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py index f1ce2c36fcceb..a545877c4c47c 100644 --- a/vllm/entrypoints/chat_utils.py +++ b/vllm/entrypoints/chat_utils.py @@ -350,7 +350,8 @@ def _get_full_multimodal_text_prompt(placeholder_counts: Dict[str, int], # NOTE: For now we always add missing placeholders at the front of # the prompt. This may change to be customizable in the future. - return "\n".join(missing_placeholders + [text_prompt]) + #return "\n".join(missing_placeholders + [text_prompt]) + return "".join(missing_placeholders + [text_prompt]) # No need to validate using Pydantic again @@ -398,7 +399,6 @@ def _parse_chat_message_content_parts( if mm_placeholder_counts: text_prompt = _get_full_multimodal_text_prompt(mm_placeholder_counts, text_prompt) - return [ConversationMessage(role=role, content=text_prompt)] From f96541b1ee612ba1c4c14276b55ce4210faecb3c Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:09:44 +0800 Subject: [PATCH 4/7] update qwen2_vl --- vllm/model_executor/models/qwen2_vl.py | 145 +++++++++++++------------ 1 file changed, 75 insertions(+), 70 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 179399a12a3d5..50db215847cf5 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -200,37 +200,37 @@ def __init__( quant_config=quant_config) # Detect attention implementation. 
- selected_backend: Optional[_Backend] = get_global_forced_attn_backend() - if selected_backend is None: - backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND - if backend_by_env_var is not None: - selected_backend = backend_name_to_enum(backend_by_env_var) - if selected_backend is None: - # For Volta and Turing GPUs, use xformers instead. - device_available = current_platform.get_device_capability()[0] >= 8 - if device_available: - from transformers.utils import is_flash_attn_2_available - - if is_flash_attn_2_available(): - self._use_flash_attn = True - else: - logger.warning( - "Current Qwen2-VL implementation has a bug with " - "`vllm-flash-attn` inside vision module, so we use " - "xformers backend instead. You can run `pip install " - "flash-attn to use flash-attention backend.") - self._use_flash_attn = False - else: - self._use_flash_attn = False - else: - if selected_backend == _Backend.FLASH_ATTN: - self._use_flash_attn = True - elif selected_backend == _Backend.XFORMERS: - self._use_flash_attn = False - else: - raise RuntimeError( - f"Qwen2-VL does not support {selected_backend} backend now." - ) + # selected_backend: Optional[_Backend] = get_global_forced_attn_backend() + # if selected_backend is None: + # backend_by_env_var: Optional[str] = envs.VLLM_ATTENTION_BACKEND + # if backend_by_env_var is not None: + # selected_backend = backend_name_to_enum(backend_by_env_var) + # if selected_backend is None: + # # For Volta and Turing GPUs, use xformers instead. + # device_available = current_platform.get_device_capability()[0] >= 8 + # if device_available: + # from transformers.utils import is_flash_attn_2_available + + # if is_flash_attn_2_available(): + # self._use_flash_attn = True + # else: + # logger.warning( + # "Current Qwen2-VL implementation has a bug with " + # "`vllm-flash-attn` inside vision module, so we use " + # "xformers backend instead. You can run `pip install " + # "flash-attn to use flash-attention backend.") + # self._use_flash_attn = False + # else: + # self._use_flash_attn = False + # else: + # if selected_backend == _Backend.FLASH_ATTN: + # self._use_flash_attn = True + # elif selected_backend == _Backend.XFORMERS: + # self._use_flash_attn = False + # else: + # raise RuntimeError( + # f"Qwen2-VL does not support {selected_backend} backend now." + # ) def forward( self, @@ -258,42 +258,49 @@ def forward( if rotary_pos_emb is not None: q = apply_rotary_pos_emb_vision(q, rotary_pos_emb) k = apply_rotary_pos_emb_vision(k, rotary_pos_emb) - - if self._use_flash_attn: - # from vllm_flash_attn.flash_attn_interface import ( - # flash_attn_varlen_func) - from flash_attn import flash_attn_varlen_func - - q, k, v = [rearrange(x, "b s ... -> (b s) ...") for x in [q, k, v]] - - max_seqlen = (cu_seqlens[1:] - cu_seqlens[:-1]).max().item() - output = flash_attn_varlen_func(q, - k, - v, - cu_seqlens_q=cu_seqlens, - cu_seqlens_k=cu_seqlens, - max_seqlen_q=max_seqlen, - max_seqlen_k=max_seqlen, - dropout_p=0, - causal=False) - - context_layer = rearrange(output, - "(b s) ... 
-> b s ...", - b=batch_size) - else: - from xformers import ops as xops - from xformers.ops.fmha.attn_bias import BlockDiagonalMask - - seqlens = (cu_seqlens[1:] - cu_seqlens[:-1]).tolist() - attn_bias = BlockDiagonalMask.from_seqlens(q_seqlen=seqlens, - kv_seqlen=None) - - context_layer = xops.memory_efficient_attention_forward( - q, k, v, attn_bias=attn_bias, p=0, scale=None) - context_layer = rearrange(context_layer, - "b s h d -> s b (h d)").contiguous() - - output, _ = self.proj(context_layer) + query = q.movedim(1, 2) + key = k.movedim(1, 2) + value = v.movedim(1, 2) + + seq_lens = [] + for i in range(1, len(cu_seqlens)): + seq_lens.append(cu_seqlens[i]-cu_seqlens[i-1]) + att_masks = [None] * len(seq_lens) + + num_tokens = q.shape[0] * q.shape[1] + output = torch.empty( + (num_tokens, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), + dtype=query.dtype, device=query.device) + start = 0 + from vllm.attention.backends.ipex_attn import use_sdp_causal + for seq_len, mask in zip(seq_lens, + att_masks): + end = start + seq_len + if use_sdp_causal(self.hidden_size_per_attention_head, query) and False: + import xe_addons + sub_out = xe_addons.sdp_causal( + query[:, :, start:end, :].contiguous(), + key[:, :, start:end, :].contiguous(), + value[:, :, start:end, :].contiguous(), + mask) + print(f"sub_out: {sub_out.shape}") + sub_out = sub_out.squeeze(0).movedim( + 0, 1) + else: + sub_out = torch.nn.functional.scaled_dot_product_attention( + query[:, :, start:end, :], + key[:, :, start:end, :], + value[:, :, start:end, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=False, + scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( + 0, 1) + output[start:end, :, :] = sub_out + start = end + output = output.view(-1, batch_size, self.hidden_size_per_attention_head * self.num_attention_heads_per_partition) + + output, _ = self.proj(output) return output @@ -518,9 +525,7 @@ def forward( grid_thw: torch.Tensor, ) -> torch.Tensor: # patchify - x = x.to(device=self.device, dtype=self.dtype) x = self.patch_embed(x) - # compute position embedding rotary_pos_emb = self.rot_pos_emb(grid_thw) @@ -926,7 +931,7 @@ def _parse_and_validate_video_input( def _process_image_input(self, image_input: Qwen2VLImageInputs) -> torch.Tensor: - pixel_values = image_input["pixel_values"].type(self.visual.dtype) + pixel_values = image_input["pixel_values"].to(torch.float16) image_embeds = self.visual(pixel_values, grid_thw=image_input["image_grid_thw"]) return image_embeds From d0a03a9e6601ef2457690f36b11295b8928cf523 Mon Sep 17 00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 14:28:13 +0800 Subject: [PATCH 5/7] update --- vllm/model_executor/models/minicpmv.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/vllm/model_executor/models/minicpmv.py b/vllm/model_executor/models/minicpmv.py index 17dc3bfc6d38d..b0751f2d800b8 100644 --- a/vllm/model_executor/models/minicpmv.py +++ b/vllm/model_executor/models/minicpmv.py @@ -745,13 +745,8 @@ def get_vision_hidden_states(self, for i in range(B): patch_attn_mask[i, :tgt_sizes[i][0] * tgt_sizes[i][1]] = True -<<<<<<< HEAD - return self.get_vision_embedding(all_pixel_values.type(dtype), - patch_attn_mask.to(device), tgt_sizes.to(device)) -======= return self.get_vision_embedding(all_pixel_values.type(dtype).to(device), patch_attn_mask, tgt_sizes.to(device)) ->>>>>>> upstream/061_test_0924 def is_default_weight_loading(self, name: str) -> bool: return "resampler" in name From e34a17b6a8416fbfb0b39d0930c2943303423425 Mon Sep 17 
00:00:00 2001 From: hzjane Date: Wed, 9 Oct 2024 15:19:00 +0800 Subject: [PATCH 6/7] update --- vllm/model_executor/models/qwen2_vl.py | 30 ++++++++------------------ 1 file changed, 9 insertions(+), 21 deletions(-) diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py index 50db215847cf5..8f7ebede86e33 100644 --- a/vllm/model_executor/models/qwen2_vl.py +++ b/vllm/model_executor/models/qwen2_vl.py @@ -272,30 +272,18 @@ def forward( (num_tokens, self.num_attention_heads_per_partition, self.hidden_size_per_attention_head), dtype=query.dtype, device=query.device) start = 0 - from vllm.attention.backends.ipex_attn import use_sdp_causal for seq_len, mask in zip(seq_lens, att_masks): end = start + seq_len - if use_sdp_causal(self.hidden_size_per_attention_head, query) and False: - import xe_addons - sub_out = xe_addons.sdp_causal( - query[:, :, start:end, :].contiguous(), - key[:, :, start:end, :].contiguous(), - value[:, :, start:end, :].contiguous(), - mask) - print(f"sub_out: {sub_out.shape}") - sub_out = sub_out.squeeze(0).movedim( - 0, 1) - else: - sub_out = torch.nn.functional.scaled_dot_product_attention( - query[:, :, start:end, :], - key[:, :, start:end, :], - value[:, :, start:end, :], - attn_mask=mask, - dropout_p=0.0, - is_causal=False, - scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( - 0, 1) + sub_out = torch.nn.functional.scaled_dot_product_attention( + query[:, :, start:end, :], + key[:, :, start:end, :], + value[:, :, start:end, :], + attn_mask=mask, + dropout_p=0.0, + is_causal=False, + scale= self.hidden_size_per_attention_head**-0.5).squeeze(0).movedim( + 0, 1) output[start:end, :, :] = sub_out start = end output = output.view(-1, batch_size, self.hidden_size_per_attention_head * self.num_attention_heads_per_partition) From 559d9f25a58487e90878e5b1e3ba20350595d35a Mon Sep 17 00:00:00 2001 From: hzjane Date: Thu, 10 Oct 2024 17:07:56 +0800 Subject: [PATCH 7/7] enable parallel multimodal input --- vllm/worker/xpu_model_runner.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/vllm/worker/xpu_model_runner.py b/vllm/worker/xpu_model_runner.py index 8d94f4432152d..0c96297407148 100644 --- a/vllm/worker/xpu_model_runner.py +++ b/vllm/worker/xpu_model_runner.py @@ -63,6 +63,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) @@ -91,6 +92,7 @@ def as_broadcastable_tensor_dict(self) -> Dict[str, Any]: tensor_dict = { "input_tokens": self.input_tokens, "input_positions": self.input_positions, + "multi_modal_kwargs": self.multi_modal_kwargs, } _add_attn_metadata_broadcastable_dict(tensor_dict, self.attn_metadata) _add_sampling_metadata_broadcastable_dict(tensor_dict,
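
Note (illustrative sketch, not part of the patch series): patches 4 and 6 replace the flash-attn / xformers paths in the Qwen2-VL vision attention with plain torch.nn.functional.scaled_dot_product_attention applied independently to each cu_seqlens segment, which is equivalent to block-diagonal (per-image) attention. The toy, self-contained loop below shows that idea with assumed shapes (batch size 1, 2 heads, head_dim 8); names and sizes are made up for illustration.

import torch

heads, head_dim = 2, 8
cu_seqlens = torch.tensor([0, 4, 10])        # two images: 4 and 6 patch tokens
total = int(cu_seqlens[-1])
# [1, heads, total_tokens, head_dim], analogous to q.movedim(1, 2) in the patch
q = torch.randn(1, heads, total, head_dim)
k = torch.randn_like(q)
v = torch.randn_like(q)

out = torch.empty(total, heads, head_dim)
start = 0
for end in cu_seqlens[1:].tolist():
    # Attend only within one image's patch tokens; no explicit mask is needed
    # because each segment is processed in isolation.
    sub = torch.nn.functional.scaled_dot_product_attention(
        q[:, :, start:end, :], k[:, :, start:end, :], v[:, :, start:end, :],
        dropout_p=0.0, is_causal=False, scale=head_dim ** -0.5)
    # [1, heads, seq, head_dim] -> [seq, heads, head_dim]
    out[start:end] = sub.squeeze(0).movedim(0, 1)
    start = end
print(out.shape)  # torch.Size([10, 2, 8])
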