[Vision] Make image_embed kernel input layout NHWC to eliminate permute_dim (mlc-ai#2933)

* [Vision][Fix] Make image_embed kernel input NHWC to eliminate redundant permute_dim
* Fix embed_size
Deelvin · Sep 23, 2024 · cb333ea
1 parent 980589f
commit cb333ea
Show file tree

Hide file tree

Showing 5 changed files with 158 additions and 8 deletions.
diff --git a/python/mlc_llm/conversation_template/phi.py b/python/mlc_llm/conversation_template/phi.py
@@ -41,7 +41,7 @@
     Conversation(
         name="phi-3-vision",
         system_template=f"{MessagePlaceholders.SYSTEM.value}",
-        system_message="\n",
+        system_message="",
         roles={"user": "<|user|>", "assistant": "<|assistant|>"},
         seps=["<|end|>\n"],
         role_content_sep="\n",

diff --git a/python/mlc_llm/model/llava/llava_model.py b/python/mlc_llm/model/llava/llava_model.py
@@ -155,7 +155,7 @@ def embed(self, input_ids: Tensor) -> Tensor:
         return self.language_model.embed(input_ids)
 
     def image_preprocess(self, pixel_values: Tensor) -> Tensor:
-        pixel_values = permute_dims(pixel_values, axes=(0, 2, 3, 1))  # NCHW -> NHWC
+        # pixel_values shape is NHWC
         pixel_values = self.image_processor.resize(
             pixel_values, {"shortest_edge": self.config.vision_config.image_size}
         )
@@ -256,7 +256,7 @@ def get_default_spec(self):
             },
             "image_embed": {
                 "pixel_values": nn.spec.Tensor(
-                    [1, 3, "image_height", "image_width"],
+                    [1, "image_height", "image_width", 3],
                     "uint8",
                 ),
                 "$": {

diff --git a/python/mlc_llm/model/model_preset.py b/python/mlc_llm/model/model_preset.py
@@ -608,6 +608,155 @@
         "attention_bias": False,
         "vocab_size": 32064,
     },
+    "phi-3_5-vision": {
+        "_name_or_path": "Phi-3.5-vision-instruct",
+        "architectures": ["Phi3VForCausalLM"],
+        "attention_dropout": 0.0,
+        "auto_map": {
+            "AutoConfig": "configuration_phi3_v.Phi3VConfig",
+            "AutoModelForCausalLM": "modeling_phi3_v.Phi3VForCausalLM",
+        },
+        "bos_token_id": 1,
+        "embd_layer": {
+            "embedding_cls": "image",
+            "hd_transform_order": "sub_glb",
+            "projection_cls": "mlp",
+            "use_hd_transform": True,
+            "with_learnable_separator": True,
+        },
+        "embd_pdrop": 0.0,
+        "eos_token_id": 2,
+        "hidden_act": "silu",
+        "hidden_size": 3072,
+        "img_processor": {
+            "image_dim_out": 1024,
+            "model_name": "openai/clip-vit-large-patch14-336",
+            "name": "clip_vision_model",
+            "num_img_tokens": 144,
+        },
+        "initializer_range": 0.02,
+        "intermediate_size": 8192,
+        "max_position_embeddings": 131072,
+        "model_type": "phi3_v",
+        "num_attention_heads": 32,
+        "num_hidden_layers": 32,
+        "num_key_value_heads": 32,
+        "original_max_position_embeddings": 4096,
+        "pad_token_id": 32000,
+        "resid_pdrop": 0.0,
+        "rms_norm_eps": 1e-05,
+        "rope_scaling": {
+            "long_factor": [
+                1.0800000429153442,
+                1.1100000143051147,
+                1.1399999856948853,
+                1.340000033378601,
+                1.5899999141693115,
+                1.600000023841858,
+                1.6200000047683716,
+                2.620000123977661,
+                3.2300000190734863,
+                3.2300000190734863,
+                4.789999961853027,
+                7.400000095367432,
+                7.700000286102295,
+                9.09000015258789,
+                12.199999809265137,
+                17.670000076293945,
+                24.46000099182129,
+                28.57000160217285,
+                30.420001983642578,
+                30.840002059936523,
+                32.590003967285156,
+                32.93000411987305,
+                42.320003509521484,
+                44.96000289916992,
+                50.340003967285156,
+                50.45000457763672,
+                57.55000305175781,
+                57.93000411987305,
+                58.21000289916992,
+                60.1400032043457,
+                62.61000442504883,
+                62.62000274658203,
+                62.71000289916992,
+                63.1400032043457,
+                63.1400032043457,
+                63.77000427246094,
+                63.93000411987305,
+                63.96000289916992,
+                63.970001220703125,
+                64.02999877929688,
+                64.06999969482422,
+                64.08000183105469,
+                64.12000274658203,
+                64.41000366210938,
+                64.4800033569336,
+                64.51000213623047,
+                64.52999877929688,
+                64.83999633789062,
+            ],
+            "short_factor": [
+                1.08,
+                1.1,
+                1.1300000000000001,
+                1.2800000000000002,
+                1.3100000000000003,
+                1.4500000000000004,
+                1.4500000000000004,
+                1.9500000000000008,
+                2.030000000000001,
+                2.4299999999999926,
+                2.5699999999999896,
+                2.9499999999999815,
+                3.729999999999965,
+                3.869999999999962,
+                4.189999999999955,
+                4.43999999999995,
+                4.6399999999999455,
+                4.979999999999938,
+                5.159999999999934,
+                5.279999999999932,
+                5.759999999999922,
+                5.889999999999919,
+                5.889999999999919,
+                5.969999999999917,
+                6.089999999999915,
+                6.2799999999999105,
+                6.7699999999999,
+                6.8899999999998975,
+                7.109999999999893,
+                7.129999999999892,
+                7.179999999999891,
+                7.289999999999889,
+                7.339999999999888,
+                7.559999999999883,
+                7.619999999999882,
+                7.69999999999988,
+                7.879999999999876,
+                7.879999999999876,
+                7.879999999999876,
+                7.939999999999875,
+                7.949999999999875,
+                7.979999999999874,
+                8.19999999999987,
+                8.439999999999864,
+                8.469999999999864,
+                8.589999999999861,
+                8.809999999999857,
+                8.999999999999853,
+            ],
+            "type": "su",
+        },
+        "rope_theta": 10000.0,
+        "sliding_window": 262144,
+        "tie_word_embeddings": False,
+        "torch_dtype": "bfloat16",
+        "transformers_version": "4.38.1",
+        "use_cache": True,
+        "vocab_size": 32064,
+        "_attn_implementation": "flash_attention_2",
+    },
     "qwen": {
         "architectures": ["QWenLMHeadModel"],
         "auto_map": {

diff --git a/python/mlc_llm/model/phi3v/phi3v_model.py b/python/mlc_llm/model/phi3v/phi3v_model.py
@@ -219,7 +219,6 @@ def embed(self, input_ids: Tensor):
 
     # pylint: disable=protected-access
     def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
-        pixel_values = op.permute_dims(pixel_values, axes=(0, 2, 3, 1))  # NCHW -> NHWC
         pixel_values = self.image_processor.resize(pixel_values, params={"hd_transform": 336})
         new_h = tir.Var("new_h", "int64")
         new_w = tir.Var("new_w", "int64")
@@ -280,7 +279,7 @@ def image_preprocess(self, pixel_values: Tensor, num_crops=16) -> Tensor:
         return combined_image
 
     def image_embed(self, pixel_values: Tensor) -> Tensor:
-        n, c, h, w = pixel_values.shape  # pylint: disable=unused-variable
+        n, h, w, c = pixel_values.shape  # pylint: disable=unused-variable
         pixel_values = self.image_preprocess(pixel_values)
         pixel_values = pixel_values.astype(self.dtype)
         return self.vision_embed_tokens(pixel_values, h, w)
@@ -321,7 +320,7 @@ def get_default_spec(self):
                 },
             },
             "image_embed": {
-                "pixel_values": nn.spec.Tensor([1, 3, "image_height", "image_width"], "uint8"),
+                "pixel_values": nn.spec.Tensor([1, "image_height", "image_width", 3], "uint8"),
                 "$": {
                     "param_mode": "packed",
                     "effect_mode": "none",

diff --git a/python/mlc_llm/serve/data.py b/python/mlc_llm/serve/data.py
@@ -108,9 +108,11 @@ def from_url(url: str, config: Dict) -> "ImageData":
             raise ValueError(f"Unsupported image URL format: {url}")
 
         # image_embed_size = ImageData.get_embed_size(config)
+        # TODO: fix these hard-coded values for phi3.5-vision and llava # pylint: disable=fixme
         image_embed_size = 576
-        image_tensor = np.transpose(image_tensor, (2, 0, 1))
-        image_tensor = np.expand_dims(image_tensor, axis=0)
+        if config["model_type"] == "phi3_v":
+            image_embed_size = 1921
+        image_tensor = np.expand_dims(image_tensor, axis=0)  # HWC -> NHWC
         image_features = tvm.nd.array(image_tensor)
         image_data = ImageData(image_features, image_embed_size)
         return image_data