diff --git a/python/mlc_llm/conversation_template/phi.py b/python/mlc_llm/conversation_template/phi.py
index 81a694b1f4..41f4620bee 100644
--- a/python/mlc_llm/conversation_template/phi.py
+++ b/python/mlc_llm/conversation_template/phi.py
@@ -41,7 +41,7 @@ Conversation(
         name="phi-3-vision",
         system_template=f"{MessagePlaceholders.SYSTEM.value}",
-        system_message="\n",
+        system_message="",
         roles={"user": "<|user|>", "assistant": "<|assistant|>"},
         seps=["<|end|>\n"],
         role_content_sep="\n",
diff --git a/python/mlc_llm/model/llava/llava_model.py b/python/mlc_llm/model/llava/llava_model.py
index d564c998d7..e9466927e9 100644
--- a/python/mlc_llm/model/llava/llava_model.py
+++ b/python/mlc_llm/model/llava/llava_model.py
@@ -155,7 +155,7 @@ def embed(self, input_ids: Tensor) -> Tensor:
         return self.language_model.embed(input_ids)
 
     def image_preprocess(self, pixel_values: Tensor) -> Tensor:
-        pixel_values = permute_dims(pixel_values, axes=(0, 2, 3, 1))  # NCHW -> NHWC
+        # pixel_values shape is NHWC
         pixel_values = self.image_processor.resize(
             pixel_values, {"shortest_edge": self.config.vision_config.image_size}
         )
@@ -256,7 +256,7 @@ def get_default_spec(self):
             },
             "image_embed": {
                 "pixel_values": nn.spec.Tensor(
-                    [1, 3, "image_height", "image_width"],
+                    [1, "image_height", "image_width", 3],
                     "uint8",
                 ),
                 "$": {
diff --git a/python/mlc_llm/model/model_preset.py b/python/mlc_llm/model/model_preset.py
index ea4c0abb38..8c97bf2a00 100644
--- a/python/mlc_llm/model/model_preset.py
+++ b/python/mlc_llm/model/model_preset.py
@@ -608,6 +608,155 @@
         "attention_bias": False,
         "vocab_size": 32064,
     },
+    "phi-3_5-vision": {
+        "_name_or_path": "Phi-3.5-vision-instruct",
+        "architectures": ["Phi3VForCausalLM"],
+        "attention_dropout": 0.0,
+        "auto_map": {
+            "AutoConfig": "configuration_phi3_v.Phi3VConfig",
+            "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM",
+        },
+        "bos_token_id": 1,
+        "embd_layer": {
+            "embedding_cls": "image",
+            "hd_transform_order": "sub_glb",
+            "projection_cls": "mlp",
+            "use_hd_transform": True,
+            "with_learnable_separator": True,
+        },
+        "embd_pdrop": 0.0,
+        "eos_token_id": 2,
+        "hidden_act": "silu",
+        "hidden_size": 3072,
+        "img_processor": {
+            "image_dim_out": 1024,
+            "model_name": "openai/clip-vit-large-patch14-336",
+            "name": "clip_vision_model",
+            "num_img_tokens": 144,
+        },
+        "initializer_range": 0.02,
+        "intermediate_size": 8192,
+        "max_position_embeddings": 131072,
+        "model_type": "phi3_v",
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 32,
+        "original_max_position_embeddings": 4096,
+        "pad_token_id": 32000,
+        "resid_pdrop": 0.0,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": {
+            "long_factor": [
+                1.0800000429153442,
+                1.1100000143051147,
+                1.1399999856948853,
+                1.340000033378601,
+                1.5899999141693115,
+                1.600000023841858,
+                1.6200000047683716,
+                2.620000123977661,
+                3.2300000190734863,
+                3.2300000190734863,
+                4.789999961853027,
+                7.400000095367432,
+                7.700000286102295,
+                9.09000015258789,
+                12.199999809265137,
+                17.670000076293945,
+                24.46000099182129,
+                28.57000160217285,
+                30.420001983642578,
+                30.840002059936523,
+                32.590003967285156,
+                32.93000411987305,
+                42.320003509521484,
+                44.96000289916992,
+                50.340003967285156,
+                50.45000457763672,
+                57.55000305175781,
+                57.93000411987305,
+                58.21000289916992,
+                60.1400032043457,
+                62.61000442504883,
+                62.62000274658203,
+                62.71000289916992,
+                63.1400032043457,
+                63.1400032043457,
+                63.77000427246094,
+                63.93000411987305,
+                63.96000289916992,
+                63.970001220703125,
+                64.02999877929688,
+                64.06999969482422,
+                64.08000183105469,
+                64.12000274658203,
+                64.41000366210938,
+                64.4800033569336,
+                64.51000213623047,
+                64.52999877929688,
+                64.83999633789062,
+            ],
+            "short_factor": [
+                1.08,
+                1.1,
+                1.1300000000000001,
+                1.2800000000000002,
+                1.3100000000000003,
+                1.4500000000000004,
+                1.4500000000000004,
+                1.9500000000000008,
+                2.030000000000001,
+                2.4299999999999926,
+                2.5699999999999896,
+                2.9499999999999815,
+                3.729999999999965,
+                3.869999999999962,
+                4.189999999999955,
+                4.43999999999995,
+                4.6399999999999455,
+                4.979999999999938,
+                5.159999999999934,
+                5.279999999999932,
+                5.759999999999922,
+                5.889999999999919,
+                5.889999999999919,
+                5.969999999999917,
+                6.089999999999915,
+                6.2799999999999105,
+                6.7699999999999,
+                6.8899999999998975,
+                7.109999999999893,
+                7.129999999999892,
+                7.179999999999891,
+                7.289999999999889,
+                7.339999999999888,
+                7.559999999999883,
+                7.619999999999882,
+                7.69999999999988,
+                7.879999999999876,
+                7.879999999999876,
+                7.879999999999876,
+                7.939999999999875,
+                7.949999999999875,
+                7.979999999999874,
+                8.19999999999987,
+                8.439999999999864,
+                8.469999999999864,
+                8.589999999999861,
+                8.809999999999857,
+                8.999999999999853,
+            ],
+            "type": "su",
+        },
+        "rope_theta": 10000.0,
+        "sliding_window": 262144,
+        "tie_word_embeddings": False,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.38.1",
+        "use_cache": True,
+        "vocab_size": 32064,
+        "_attn_implementation": "flash_attention_2",
+    },
     "qwen": {
         "architectures": ["QWenLMHeadModel"],
         "auto_map": {
diff --git a/python/mlc_llm/model/phi3v/phi3v_model.py b/python/mlc_llm/model/phi3v/phi3v_model.py
index 880fdc803c..814d92bec1 100644
--- a/python/mlc_llm/model/phi3v/phi3v_model.py
+++ b/python/mlc_llm/model/phi3v/phi3v_model.py
@@ -219,7 +219,6 @@ def embed(self, input_ids: Tensor):
 
     # pylint: disable=protected-access
     def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
-        pixel_values = op.permute_dims(pixel_values, axes=(0, 2, 3, 1))  # NCHW -> NHWC
         pixel_values = self.image_processor.resize(pixel_values, params={"hd_transform": 336})
         new_h = tir.Var("new_h", "int64")
         new_w = tir.Var("new_w", "int64")
@@ -280,7 +279,7 @@ def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
         return combined_image
 
     def image_embed(self, pixel_values: Tensor) -> Tensor:
-        n, c, h, w = pixel_values.shape  # pylint: disable=unused-variable
+        n, h, w, c = pixel_values.shape  # pylint: disable=unused-variable
         pixel_values = self.image_preprocess(pixel_values)
         pixel_values = pixel_values.astype(self.dtype)
         return self.vision_embed_tokens(pixel_values, h, w)
@@ -321,7 +320,7 @@ def get_default_spec(self):
             },
             "image_embed": {
-                "pixel_values": nn.spec.Tensor([1, 3, "image_height", "image_width"], "uint8"),
+                "pixel_values": nn.spec.Tensor([1, "image_height", "image_width", 3], "uint8"),
                 "$": {
                     "param_mode": "packed",
                     "effect_mode": "none",
diff --git a/python/mlc_llm/serve/data.py b/python/mlc_llm/serve/data.py
index 3cecd2c7bc..a12d3ff7a0 100644
--- a/python/mlc_llm/serve/data.py
+++ b/python/mlc_llm/serve/data.py
@@ -108,9 +108,11 @@ def from_url(url: str, config: Dict) -> "ImageData":
             raise ValueError(f"Unsupported image URL format: {url}")
 
         # image_embed_size = ImageData.get_embed_size(config)
+        # TODO: fix these hard-coded values for phi3.5-vision and llava  # pylint: disable=fixme
         image_embed_size = 576
-        image_tensor = np.transpose(image_tensor, (2, 0, 1))
-        image_tensor = np.expand_dims(image_tensor, axis=0)
+        if config["model_type"] == "phi3_v":
+            image_embed_size = 1921
+        image_tensor = np.expand_dims(image_tensor, axis=0)  # HWC -> NHWC
         image_features = tvm.nd.array(image_tensor)
         image_data = ImageData(image_features, image_embed_size)
         return image_data
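Note (not part of the patch): a minimal sketch of the NHWC calling convention the changes above move to, assuming a decoded RGB image already held as an H x W x 3 uint8 array; the 336x336 size is only an illustrative placeholder.

    import numpy as np

    # Stand-in for a decoded RGB image in HWC layout (height, width, channels).
    image = np.zeros((336, 336, 3), dtype=np.uint8)

    # With the updated "image_embed" specs the model takes the raw image in NHWC
    # layout, so the caller only adds a batch dimension instead of transposing
    # to NCHW first.
    pixel_values = np.expand_dims(image, axis=0)  # HWC -> NHWC
    assert pixel_values.shape == (1, 336, 336, 3)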