diff --git a/deploy/qwen2_vl/README.md b/deploy/qwen2_vl/README.md
new file mode 100644
index 000000000..2eb092f4d
--- /dev/null
+++ b/deploy/qwen2_vl/README.md
@@ -0,0 +1,54 @@
+# Qwen2-VL
+
+## 1. Model Introduction
+
+[Qwen2-VL](https://qwenlm.github.io/blog/qwen2-vl/) is a large-scale vision-language model. It takes images, text, bounding boxes and videos as input, and produces text and bounding boxes as output. This repository provides the Paddle versions of the `Qwen2-VL-2B-Instruct` and `Qwen2-VL-7B-Instruct` models.
+
+## 2. Environment Setup
+- **python >= 3.10**
+- **paddlepaddle-gpu (develop build required)**
+```
+# Installation example
+python -m pip install paddlepaddle-gpu==0.0.0.post118 -f https://www.paddlepaddle.org.cn/whl/linux/gpu/develop.html
+```
+- **paddlenlp**
+```
+# Installation example
+git submodule update --init --recursive
+cd PaddleNLP
+git reset --hard e91c2d3d634b12769c30aa419ddf931c20b7ca9f
+pip install -e .
+cd csrc
+python setup_cuda.py install
+```
+
+> Note:
+* Make sure the dependencies above are installed, otherwise the scripts cannot run. In addition, install the custom ops under paddlemix/external_ops with `python setup.py install`. If the ops are still not found after installation, set PYTHONPATH accordingly.
+* flash_attn is enabled by default and requires an A100/A800 or H20 GPU.
+
+## 3. High-Performance Inference
+
+In the Qwen2-VL inference optimization, the vision model still uses the network implementation from PaddleMIX, while the language model part calls the high-performance Qwen2 language model from PaddleNLP, giving a high-performance Qwen2-VL inference pipeline.
+
+### a. High-performance inference with text & a single image
+```bash
+python deploy/qwen2_vl/single_image_infer.py \
+    --model_name_or_path Qwen/Qwen2-VL-2B-Instruct \
+    --dtype bfloat16 \
+    --benchmark 1
+```
+
+- Performance measured on an NVIDIA A100-SXM4-80GB:
+
+- Qwen2-VL-2B-Instruct
+
+| Paddle Inference | PyTorch | Paddle dygraph |
+| ---------------- | ------- | -------------- |
+| 1.44 s | 2.35 s | 5.215 s |
+
+- Qwen2-VL-7B-Instruct
+
+| Paddle Inference | PyTorch | Paddle dygraph |
+| ---------------- | ------- | -------------- |
+| 1.73 s | 4.4 s | 6.339 s |
diff --git a/deploy/qwen2_vl/single_image_infer.py b/deploy/qwen2_vl/single_image_infer.py
new file mode 100644
index 000000000..76b679a99
--- /dev/null
+++ b/deploy/qwen2_vl/single_image_infer.py
@@ -0,0 +1,280 @@
+# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import datetime
+from dataclasses import dataclass, field
+
+import numpy as np
+import paddle
+from paddlenlp.generation import GenerationConfig
+from paddlenlp.trainer import PdArgumentParser
+from paddlenlp.transformers import (
+    AutoConfig,
+    AutoInferenceModelForCausalLM,
+    Qwen2Tokenizer,
+)
+from paddlenlp.trl import llm_utils
+
+from paddlemix.models.qwen2_vl.modeling_qwen2_vl import (
+    Qwen2RotaryEmbedding,
+    Qwen2VLForConditionalGeneration,
+)
+from paddlemix.processors.qwen2_vl_processing import (
+    Qwen2VLImageProcessor,
+    Qwen2VLProcessor,
+    process_vision_info,
+)
+
+MODEL_NAME = "Qwen/Qwen2-VL-2B-Instruct"
+# MODEL_NAME = "Qwen/Qwen2-VL-7B-Instruct"
+vl_model = Qwen2VLForConditionalGeneration.from_pretrained(MODEL_NAME, dtype="bfloat16")
+
+# NOTE: (zhoukangkang、changwenbin) Because we only use the visual model here,
+# in order to reduce video memory, we delete the language model.
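+# The deleted language model is re-attached later in this script as PaddleNLP's
+# high-performance Qwen2 inference model (see `vl_model.model = fast_llm_model` below).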
+del vl_model.model +paddle.device.cuda.empty_cache() + +image_processor = Qwen2VLImageProcessor() +tokenizer = Qwen2Tokenizer.from_pretrained(MODEL_NAME) +processor = Qwen2VLProcessor(image_processor, tokenizer) + +# min_pixels = 256*28*28 # 200704 +# max_pixels = 1280*28*28 # 1003520 +# processor = Qwen2VLProcessor(image_processor, tokenizer, min_pixels=min_pixels, max_pixels=max_pixels) + +messages = [ + { + "role": "user", + "content": [ + { + "type": "image", + "image": "paddlemix/demo_images/examples_image1.jpg", + }, + {"type": "text", "text": "Describe this image."}, + ], + } +] + +# Preparation for inference +image_inputs, video_inputs = process_vision_info(messages) + +question = "Describe this image." +image_pad_token = "<|vision_start|><|image_pad|><|vision_end|>" +text = f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{image_pad_token}{question}<|im_end|>\n<|im_start|>assistant\n" + + +@dataclass +class PredictorArgument: + # NOTE: (zhoukangkang、changwenbin) + # These parameters are all copied from https://github.com/PaddlePaddle/PaddleNLP/blob/develop/llm/predict/predictor.py + # For simplicity and ease of use, only the necessary parameters are retained here. + # If you want to know the exact meaning of these parameters, please refer to the link above. + + model_name_or_path: str = field(default=None, metadata={"help": "The directory of model."}) + src_length = 1024 + min_length = 2 + max_length = 200 + top_k = 0 + top_p = 0.0 + temperature = 0.95 + repetition_penalty = 1.0 + dtype: str = field(default=None, metadata={"help": "Model dtype"}) + decode_strategy = "sampling" + mode = "dynamic" + inference_model = True + quant_type = "" + benchmark: bool = field( + default=False, + metadata={ + "help": "If benchmark set as `True`, we will force model decode to max_length, which is helpful to compute throughput. 
" + }, + ) + use_fake_parameter = False + block_attn = True + block_size = 64 + cachekv_int8_type = None + append_attn = True + total_max_length = 4096 + speculate_method = None + + +@dataclass +class ModelArgument: + model_type: str = field( + default=None, + metadata={"help": "the type of the model, which can be one of ['gpt-3', 'ernie-3.5-se', 'llama-img2txt']"}, + ) + + +def init_llm_model_inputs(vision_model_inputs, inputs_embeds, arg_config: PredictorArgument): + assert len(inputs_embeds.shape) == 3 + batch_size = inputs_embeds.shape[0] + + model_inputs = {} + model_inputs["input_ids"] = paddle.zeros(shape=[batch_size, arg_config.total_max_length], dtype="int64") + model_inputs["inputs_embeds"] = inputs_embeds + + # I dislike write (arg_config.total_max_length + arg_config.block_size -1 ) // arg_config.block_size + assert arg_config.total_max_length % arg_config.block_size == 0 + + model_inputs["top_p"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.top_p, dtype="float32") + model_inputs["temperature"] = paddle.full( + shape=[batch_size, 1], fill_value=arg_config.temperature, dtype="float32" + ) + model_inputs["eos_token_id"] = paddle.to_tensor( + np.array(llm_utils.get_eos_token_id(tokenizer, generation_config)).reshape(-1, 1).astype("int64") + ) + model_inputs["penalty_score"] = paddle.full( + shape=[batch_size, 1], fill_value=arg_config.repetition_penalty, dtype="float32" + ) + model_inputs["frequency_score"] = paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype="float32") + model_inputs["presence_score"] = paddle.full(shape=[batch_size, 1], fill_value=0.0, dtype="float32") + model_inputs["min_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.min_length, dtype="int64") + model_inputs["max_length"] = paddle.full(shape=[batch_size, 1], fill_value=arg_config.max_length, dtype="int64") + + position_ids, _ = vl_model.get_rope_index( + config.vision_config["spatial_merge_size"], + config.image_token_id, + config.video_token_id, + config.vision_start_token_id, + vision_model_inputs.get("input_ids"), + vision_model_inputs.get("image_grid_thw"), + vision_model_inputs.get("video_grid_thw", None), + vision_model_inputs.get("attention_mask"), + ) + position_start = position_ids[0][0][-1].item() + position_end = 4096 - position_ids.shape[-1] + position_start + position_value = ( + paddle.arange(position_start, position_end).reshape([1, 1, -1]).expand([position_ids.shape[0], 1, -1]) + ) + position_ids = paddle.concat([position_ids, position_value], axis=-1) + + head_dim = config.hidden_size // config.num_attention_heads + qwen2_Embedding = Qwen2RotaryEmbedding(head_dim, 4096, config.rope_theta) + cos = qwen2_Embedding.cos_cached + sin = qwen2_Embedding.sin_cached + + # NOTE: (zhoukangkang、changwenbin) Copied from PaddleMIX/paddlemix/models/qwen2_vl/modeling_qwen2_vl.py, + # for calculating M-ROPE. 
+ cos = cos[position_ids] + sin = sin[position_ids] + mrope_section = config.rope_scaling["mrope_section"] * 2 + cos = paddle.concat(x=[m[i % 3] for i, m in enumerate(cos.split(mrope_section, axis=-1))], axis=-1) + sin = paddle.concat(x=[m[i % 3] for i, m in enumerate(sin.split(mrope_section, axis=-1))], axis=-1) + + rope_emb = paddle.stack([cos, sin], axis=0) + rope_emb = rope_emb.reshape([rope_emb.shape[0], 1, rope_emb.shape[2], 1, rope_emb.shape[-1]]) + model_inputs["rope_emb"] = rope_emb + + model_inputs["bad_tokens"] = paddle.to_tensor([-1], dtype="int64") + model_inputs["is_block_step"] = paddle.full(shape=[batch_size], fill_value=False, dtype="bool") + + cache_kvs_shape = fast_llm_model.get_cache_kvs_shape(fast_llm_model.config, batch_size) + cachekv_dtype = config.dtype if arg_config.cachekv_int8_type is None else "uint8" + model_inputs["cache_kvs"] = [paddle.zeros(shape, dtype=cachekv_dtype) for shape in cache_kvs_shape] + + block_nums = arg_config.total_max_length // arg_config.block_size + model_inputs["block_tables"] = paddle.arange(block_nums, dtype="int32").tile([batch_size, 1]) + + seq_lens = inputs_embeds.shape[1] + model_inputs["seq_lens_this_time"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1)) + model_inputs["seq_lens_encoder"] = paddle.to_tensor(np.array(seq_lens).astype("int32").reshape(-1, 1)) + model_inputs["seq_lens_decoder"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int32") + model_inputs["step_idx"] = paddle.full(shape=[batch_size, 1], fill_value=0, dtype="int64") + model_inputs["not_need_stop"] = paddle.full(shape=[1], fill_value=True, dtype="bool") + model_inputs["stop_flags"] = paddle.full(shape=[batch_size, 1], fill_value=False, dtype="bool") + model_inputs["stop_nums"] = paddle.full(shape=[1], fill_value=batch_size, dtype="int64") + model_inputs["pre_ids"] = paddle.full(shape=[batch_size, arg_config.max_length], fill_value=-1, dtype="int64") + model_inputs["next_tokens"] = paddle.full(shape=[batch_size, 1], fill_value=-1, dtype="int64") + + return model_inputs + + +parser = PdArgumentParser((PredictorArgument, ModelArgument)) +predictor_args, model_args = parser.parse_args_into_dataclasses() + +paddle.set_default_dtype(predictor_args.dtype) +config = AutoConfig.from_pretrained(MODEL_NAME) + +# NOTE: (changwenbin) This is for using the inference optimization of paddlenlp qwen2. 
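+# Overriding model_type makes AutoInferenceModelForCausalLM resolve to PaddleNLP's
+# high-performance Qwen2 inference implementation; the vision tower above still comes from PaddleMIX.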
+config.model_type = "qwen2" +generation_config = GenerationConfig.from_pretrained(MODEL_NAME) +fast_llm_model = AutoInferenceModelForCausalLM.from_pretrained( + MODEL_NAME, + config=config, + predictor_args=predictor_args, + model_args=model_args, + dtype=predictor_args.dtype, + tensor_parallel_degree=1, + tensor_parallel_rank=0, +) +fast_llm_model.eval() + +vl_model.model = fast_llm_model + + +def run_model(): + + vision_model_inputs = processor( + text=[text], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pd", + ) + inputs_embeds = vl_model.vision_forward(**vision_model_inputs) + llm_model_inputs = init_llm_model_inputs(vision_model_inputs, inputs_embeds, arg_config=predictor_args) + generated_text = "" + while llm_model_inputs["not_need_stop"]: + generated_ids = fast_llm_model.generate(**llm_model_inputs) # already trimmed in paddle + llm_model_inputs["input_ids"] = generated_ids + llm_model_inputs["inputs_embeds"] = None + new_text_piece = processor.batch_decode( + generated_ids[0], skip_special_tokens=True, clean_up_tokenization_spaces=False + )[0] + if new_text_piece == "<|im_end|>": + break + generated_text += new_text_piece + return generated_text + + +if predictor_args.benchmark: + print(f"Benchmarking {MODEL_NAME} ...") + warm_up = 3 + repeat_times = 10 + sumtime = 0.0 + times = repeat_times + warm_up + for i in range(times): + if i > 2: + paddle.device.synchronize() + starttime = datetime.datetime.now() + generated_text = run_model() + if i > 2: + paddle.device.synchronize() + endtime = datetime.datetime.now() + print("Final output_text:\n", generated_text) + + if i > 2: + duringtime = endtime - starttime + duringtime = duringtime.seconds * 1000 + duringtime.microseconds / 1000.0 + sumtime += duringtime + print(f"Single {MODEL_NAME} end to end time : ", duringtime, "ms") + inference_global_mem = paddle.device.cuda.memory_reserved() / (1024**3) + print(f"Inference used CUDA memory : {inference_global_mem:.3f} GiB") + + print(f"Single {MODEL_NAME} ave end to end time : ", sumtime / repeat_times, "ms") + +else: + generated_text = run_model() + print("Final output_text:\n", generated_text) diff --git a/paddlemix/MULLM_WebUI/README.md b/paddlemix/MULLM_WebUI/README.md new file mode 100644 index 000000000..4520e6d2c --- /dev/null +++ b/paddlemix/MULLM_WebUI/README.md @@ -0,0 +1,92 @@ +# PaddleMIX MULLM WebUI + +## 1. 简介 +PaddleMIX MULLM_WebUI 是一个基于PaddleMIX套件的交互式平台,主要支持多模态理解任务的模型微调与推理功能。MULLM_WebUI 提供了丰富的可视化操作界面,支持用户进行模型微调、推理等操作。 +![overview](./fig/overview.jpg) + +#### 支持模型 +| Model |Model Size |Inference | SFT | LoRA | +|-------|------------|-------|---|-----| +| qwen2_vl|2B/7B| ✅ | ✅ | ✅ || + +>* ✅: Supported +>* 🚧: In Progress +>* ❌: Not Supported + +## 2. 安装 +* 安装Paddle和PaddleMIX依赖 + +* 安装PaddleMIX MULLM WebUI依赖 +``` +pip install -r paddlemix/MULLM_WebUI/requirements.txt +``` + +## 3. 
快速使用 + +### 3.1 启动 +``` +CUDA_VISIBLE_DEVICES=0 \ +GRADIO_SHARE=1 \ +GRADIO_SERVER_NAME=0.0.0.0 \ +GRADIO_ANALYTICS_ENABLED=0 \ +GRADIO_SERVER_PORT=8260 python paddlemix/MULLM_WebUI/run_web.py +``` +### 3.2 使用教程 +#### 3.2.1 新增数据集 + +* 下载 [Pokemon](https://huggingface.co/datasets/llamafactory/pokemon-gpt4o-captions/tree/main) 数据集。Pokemon-gpt4o-captions 是一个基于精灵宝可梦的中英双语视觉问答数据集,其问答结果由gpt4o生成。其中中文问答数据共计833条,数据集大小80.8M。 +* 放置中文数据集文件到 `./data/pokemon_gpt4o_zh/pokemon_gpt4o_zh.parquet` + +* 运行转换数据集脚本 +``` +python paddlemix/MULLM_WebUI/scripts/convert_dataset.py \ + --data_dir ./data \ + --dataset_dir pokemon_gpt4o_zh \ + --file_name ./data/pokemon_gpt4o_zh/pokemon_gpt4o_zh.parquet +``` +> 注:目前MULLM WebUI只支持单卡微调,为了达到更佳的训练效果,建议自己构建数据集或者按照[qwen2_vl ](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/paddlemix/examples/qwen2_vl)样例中提供的脚本进行微调。 +#### 3.2.2 模型微调 +1) 模型选择 + +![模型选择](./fig/train_1.jpg) + + +2) 超参数设置 +![超参数设置](./fig/train_2.jpg) + + +3) LoRA参数设置与模型训练 +![模型训练](./fig/train_3.jpg) + +#### 3.2.3 模型推理 + +1) 模型加载 +![模型加载](./fig/chat_1.jpg) + + +2) 多模态理解 +![多模态理解](./fig/chat_2.jpg) + +## 4. 使用展示 + + +1) 模型微调 +![模型微调样例](./fig/example_train.jpg) + + +2) 模型推理 +![模型推理样例](./fig/example_chat.jpg) + +## 参考文献 + +```BibTeX +@inproceedings{zheng2024llamafactory, + title={LlamaFactory: Unified Efficient Fine-Tuning of 100+ Language Models}, + author={Yaowei Zheng and Richong Zhang and Junhao Zhang and Yanhan Ye and Zheyan Luo and Zhangchi Feng and Yongqiang Ma}, + booktitle={Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 3: System Demonstrations)}, + address={Bangkok, Thailand}, + publisher={Association for Computational Linguistics}, + year={2024}, + url={http://arxiv.org/abs/2403.13372} +} +``` diff --git a/paddlemix/MULLM_WebUI/__init__.py b/paddlemix/MULLM_WebUI/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/paddlemix/MULLM_WebUI/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlemix/MULLM_WebUI/chatter.py b/paddlemix/MULLM_WebUI/chatter.py new file mode 100644 index 000000000..200e55f72 --- /dev/null +++ b/paddlemix/MULLM_WebUI/chatter.py @@ -0,0 +1,250 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
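+
+# WebChatModel (below) wraps model loading/unloading and streaming multi-round chat
+# for the MULLM WebUI inference tab.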
+ +import os +from threading import Thread +from typing import TYPE_CHECKING, Dict, Generator, List, Optional, Tuple + +import gradio as gr +import paddle +from paddlenlp.generation import TextIteratorStreamer +from paddlenlp.peft import LoRAModel +from paddlenlp.transformers import Qwen2Tokenizer +from paddlenlp.utils.import_utils import import_module + +from ..processors.qwen2_vl_processing import ( + Qwen2VLImageProcessor, + Qwen2VLProcessor, + process_vision_info, +) +from .common import ChatState, change_checkbox, chat_ready, get_save_dir +from .extras.constants import FINAL_CHECKPOINT_NAME, MODEL_MAPPING +from .locales import ALERTS, LOCALES + +if TYPE_CHECKING: + from .manager import Manager + + +class WebChatModel: + def __init__(self, manager: "Manager", demo_mode: bool = False, lazy_init: bool = True) -> None: + self.manager = manager + self.demo_mode = demo_mode + self.engine = None + self.processor = None + self.tokenizer = None + self.terminators = ["<|im_end|>"] + # self.min_pixels = 256 * 28 * 28 # 200704 + # self.max_pixels = 1280 * 28 * 28 # 1003520 + + @property + def loaded(self) -> bool: + return self.engine is not None + + def load_model(self, data) -> Generator[str, None, None]: + engine_cls = self.get_model(data) + get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] + lang, model_name, model_path = get("top.lang"), get("top.model_name"), get("top.model_path") + finetuning_type = get("top.finetuning_type") + infer_dtype = get("infer.infer_dtype") + checkpoint_path = get("top.checkpoint_path") + state_checkbox_group = get("infer.state_checkbox_group") + selected_ckpt = get("infer.ckpt_box") + if selected_ckpt == FINAL_CHECKPOINT_NAME: + ckpt_path = os.path.join(get_save_dir(model_name, finetuning_type), checkpoint_path) + elif selected_ckpt != "": + ckpt_path = os.path.join(get_save_dir(model_name, finetuning_type), checkpoint_path, selected_ckpt) + else: + ckpt_path = "" + + error = "" + yield ALERTS["info_loading"][lang], state_checkbox_group + + if self.loaded: + error = ALERTS["err_exists"][lang] + yield error, change_checkbox(state_checkbox_group, True, choice_type=LOCALES["model_tag"][lang]) + return + elif not model_name: + error = ALERTS["err_no_model"][lang] + elif not model_path: + error = ALERTS["err_no_path"][lang] + + self.engine = engine_cls.from_pretrained(model_path, dtype=infer_dtype) + self.processor = self.get_processor(model_path) + + # load lora + if ckpt_path != "": + self.engine = LoRAModel.from_pretrained(model=self.engine, lora_path=ckpt_path) + + if error: + gr.Warning(error) + yield error + return + + yield ALERTS["info_loaded"][lang], change_checkbox( + state_checkbox_group, True, choice_type=LOCALES["model_tag"][lang] + ) + + def unload_model(self, data) -> Generator[str, None, None]: + lang = data[self.manager.get_elem_by_id("top.lang")] + state_checkbox_group = data[self.manager.get_elem_by_id("infer.state_checkbox_group")] + + if not self.loaded: + yield ALERTS["info_unload_error"][lang], state_checkbox_group + return + + yield ALERTS["info_unloading"][lang], state_checkbox_group + self.engine = None + state_checkbox_group.remove(LOCALES["model_tag"][lang]) + paddle.device.cuda.empty_cache() + yield ALERTS["info_unloaded"][lang], state_checkbox_group + + def multi_round_chat( + self, + lang, + chatbot, + messages, + question_box, + question_type, + image, + video, + chat_checkbox, + max_new_tokens, + top_p, + temperature, + seed, + info_box, + ) -> Tuple[List[List[Optional[str]]], List[Dict[str, str]], str]: + chat_state = { 
+ "model": LOCALES["model_tag"][lang], + "image": LOCALES["image_tag"][lang], + "video": LOCALES["video_tag"][lang], + "question": LOCALES["question_tag"][lang], + } + + check_result = chat_ready(chat_checkbox, chat_state) + if check_result == ChatState.MISSING_QUESTION: + yield chatbot, messages, gr.update(value=question_box), gr.update( + value=ALERTS["info_query"][lang] + ), gr.update(interactive=True) + return + if check_result == ChatState.MISSING_MODEL: + yield chatbot, messages, gr.update(value=question_box), gr.update( + value=ALERTS["info_upload_model"][lang] + ), gr.update(interactive=True) + return + if check_result == ChatState.MISSING_FILE: + yield chatbot, messages, gr.update(value=question_box), gr.update( + value=ALERTS["info_upload_file"][lang] + ), gr.update(interactive=True) + return + msg = { + "role": "user", + "content": [], + } + last_img_inp = None + last_video_inp = None + + # find last image and video input + for m in messages[::-1]: + for content in m["content"]: + if "video" in content.keys(): + last_video_inp = content["video"] + + if "image" in content.keys(): + last_img_inp = content["image"] + if last_img_inp is not None and last_video_inp is not None: + break + + if image is not None and image == last_img_inp: + image = None + + if video is not None and video == last_video_inp: + video = None + + if question_type == "image" and image is not None: + msg["content"].append({"type": "image", "image": image}) + + if question_type == "video" and video is not None: + msg["content"].append({"type": "video", "video": video, "fps": 1, "max_pixels": 360 * 420}) + + chatbot += [[question_box, None]] + msg["content"].append({"type": "text", "text": f"{question_box}"}) + + messages.append(msg) + paddle.seed(seed=seed) + generate_cfg = dict( + max_new_tokens=max_new_tokens, + top_p=top_p, + temperature=temperature, + ) + response = "" + res = self.generate(messages, generate_cfg) + for text in res: + response += text + yield chatbot + [[None, response]], messages + [ + {"role": "assistant", "content": [{"type": "text", "text": response}]} + ], gr.update(value=question_box), gr.update(value=ALERTS["info_generating"][lang]), gr.update( + interactive=False + ) + + yield chatbot + [[None, response]], messages + [ + {"role": "assistant", "content": [{"type": "text", "text": response}]} + ], gr.update(value=question_box), gr.update(value=ALERTS["info_generated"][lang]), gr.update(interactive=True) + + def get_model(self, data): + get = lambda elem_id: data[self.manager.get_elem_by_id(elem_id)] + model_name = get("top.model_name") + model_module = import_module(f"paddlemix.models.{MODEL_MAPPING[model_name]}") + + return model_module + + def get_processor(self, model_path): + image_processor = Qwen2VLImageProcessor() + tokenizer = Qwen2Tokenizer.from_pretrained(model_path) + # processor = Qwen2VLProcessor(image_processor, tokenizer,min_pixels=self.min_pixels, max_pixels=self.max_pixels) + processor = Qwen2VLProcessor(image_processor, tokenizer) + self.tokenizer = tokenizer + + return processor + + def generate(self, messages, generate_cfg): + image_inputs, video_inputs = process_vision_info(messages) + text = self.processor.tokenizer.apply_chat_template(messages, tokenize=False) + inputs = self.processor( + text=[text], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pd", + ) + inputs = self.processor( + text=[text], + images=image_inputs, + videos=video_inputs, + padding=True, + return_tensors="pd", + ) + + streamer = 
TextIteratorStreamer(tokenizer=self.tokenizer, skip_special_tokens=True) + generation_kwargs = { + "streamer": streamer, + } + + generation_kwargs.update(generate_cfg) + generation_kwargs.update(inputs) + + thread = Thread(target=self.engine.generate, kwargs=generation_kwargs) + """Class Method: *.start, can not convert, please check whether it is torch.Tensor.*/Optimizer.*/nn.Module.*/torch.distributions.Distribution.*/torch.autograd.function.FunctionCtx.*/torch.profiler.profile.*/torch.autograd.profiler.profile.*, and convert manually""" + thread.start() + return streamer diff --git a/paddlemix/MULLM_WebUI/common.py b/paddlemix/MULLM_WebUI/common.py new file mode 100644 index 000000000..84cf9037d --- /dev/null +++ b/paddlemix/MULLM_WebUI/common.py @@ -0,0 +1,648 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +import sys +from dataclasses import dataclass +from datetime import datetime +from enum import Enum +from typing import ( + TYPE_CHECKING, + Any, + Dict, + List, + Literal, + Optional, + Sequence, + Tuple, + Union, +) + +import gradio as gr +import numpy as np +import paddle +from paddlenlp.transformers.utils import cached_file +from paddlenlp.utils.import_utils import is_datasets_available + +from ..utils.log import logger +from .extras.constants import ( + DATA_CONFIG, + DEFAULT_TEMPLATE, + FILEEXT2TYPE, + FINAL_CHECKPOINT_NAME, + PEFT_METHODS, + STAGES_USE_PAIR_DATA, + SUPPORTED_MODELS, + TRAINING_STAGES, +) +from .extras.data import align_dataset, has_tokenized_data, merge_dataset, split_dataset +from .extras.packages import use_modelscope, use_openmind +from .extras.preprocess import get_preprocess_and_print_func +from .locales import LOCALES + +if is_datasets_available(): + from datasets import DatasetDict, load_dataset, load_from_disk + +if TYPE_CHECKING: + from datasets import Dataset, DatasetModule, IterableDataset + from paddlenlp.trainer import Seq2SeqTrainingArguments + from paddlenlp.transformers import PretrainedTokenizer, ProcessorMixin + + from .extras.args import DataArguments, ModelArguments + from .extras.template import Template + +DEFAULT_CACHE_DIR = "cache" +DEFAULT_CONFIG_DIR = "config" +DEFAULT_DATA_DIR = "data" +DEFAULT_SAVE_DIR = "saves" +USER_CONFIG = "user_config.yaml" + + +class ChatState(Enum): + READY = 1 + MISSING_QUESTION = -1 + MISSING_MODEL = -2 + MISSING_IMAGE = -3 + MISSING_VIDEO = -4 + MISSING_FILE = -5 + + +@dataclass +class DatasetAttr: + r""" + Dataset attributes. 
+ """ + + # basic configs + load_from: Literal["hf_hub", "ms_hub", "om_hub", "script", "file"] + dataset_name: str + formatting: Literal["alpaca", "sharegpt"] = "alpaca" + ranking: bool = False + # extra configs + subset: Optional[str] = None + split: str = "train" + folder: Optional[str] = None + num_samples: Optional[int] = None + # common columns + system: Optional[str] = None + tools: Optional[str] = None + images: Optional[str] = None + videos: Optional[str] = None + # rlhf columns + chosen: Optional[str] = None + rejected: Optional[str] = None + kto_tag: Optional[str] = None + # alpaca columns + prompt: Optional[str] = "instruction" + query: Optional[str] = "input" + response: Optional[str] = "output" + history: Optional[str] = None + # sharegpt columns + messages: Optional[str] = "conversations" + # sharegpt tags + role_tag: Optional[str] = "from" + content_tag: Optional[str] = "value" + user_tag: Optional[str] = "human" + assistant_tag: Optional[str] = "gpt" + observation_tag: Optional[str] = "observation" + function_tag: Optional[str] = "function_call" + system_tag: Optional[str] = "system" + + def __repr__(self) -> str: + return self.dataset_name + + def set_attr(self, key: str, obj: Dict[str, Any], default: Optional[Any] = None) -> None: + setattr(self, key, obj.get(key, default)) + + +def get_save_dir(*paths: str) -> os.PathLike: + r""" + Gets the path to saved model checkpoints. + """ + if os.path.sep in paths[-1]: + logger(30, "Found complex path, some features may be not available.") + return paths[-1] + + paths = (path.replace(" ", "").strip() for path in paths) + return os.path.join(DEFAULT_SAVE_DIR, *paths) + + +def get_model_path(model_name: str) -> str: + r""" + Gets the model path according to the model name. + """ + model_path: str = SUPPORTED_MODELS.get(model_name) + return model_path + + +def get_model_info(model_name: str) -> Tuple[str, str]: + r""" + Gets the necessary information of this model. + + Returns: + model_path (str) + template (str) + """ + return get_model_path(model_name), get_template(model_name) + + +def get_template(model_name: str) -> str: + r""" + Gets the template name if the model is a chat model. + """ + return DEFAULT_TEMPLATE.get(model_name, "default") + + +def list_checkpoints(model_name: str, finetuning_type: str) -> "gr.Dropdown": + r""" + Lists all available checkpoints. + """ + checkpoints = [] + if model_name: + save_dir = get_save_dir(model_name, finetuning_type) + if save_dir and os.path.isdir(save_dir): + for checkpoint in os.listdir(save_dir): + if os.path.isdir(os.path.join(save_dir, checkpoint)): + checkpoints.append(checkpoint) + if finetuning_type in PEFT_METHODS: + yield gr.Dropdown(value=None, choices=checkpoints, multiselect=False) + return + else: + yield gr.Dropdown(value=None, choices=checkpoints, multiselect=False) + return + + +def load_dataset_info(dataset_dir: str) -> Dict[str, Dict[str, Any]]: + r""" + Loads dataset_info.json. + """ + if dataset_dir == "ONLINE" or dataset_dir.startswith("REMOTE:"): + logger(20, f"dataset_dir is {dataset_dir}, using online dataset.") + return {} + + try: + with open(os.path.join(dataset_dir, DATA_CONFIG), encoding="utf-8") as f: + return json.load(f) + except Exception as err: + logger(30, f"Cannot open {os.path.join(dataset_dir, DATA_CONFIG)} due to {str(err)}.") + return {} + + +def list_datasets(dataset_dir: str = None, training_stage: str = list(TRAINING_STAGES.keys())[0]) -> "gr.Dropdown": + r""" + Lists all available datasets in the dataset dir for the training stage. 
+ """ + dataset_info = load_dataset_info(dataset_dir if dataset_dir is not None else DEFAULT_DATA_DIR) + ranking = TRAINING_STAGES[training_stage] in STAGES_USE_PAIR_DATA + datasets = [k for k, v in dataset_info.items() if v.get("ranking", False) == ranking] + return gr.Dropdown(choices=datasets) + + +def get_dataset_list(dataset_names: Optional[Sequence[str]], dataset_dir: str) -> List["DatasetAttr"]: + r""" + Gets the attributes of the datasets. + """ + if dataset_names is None: + dataset_names = [] + + if dataset_dir == "ONLINE": + dataset_info = None + else: + if dataset_dir.startswith("REMOTE:"): + config_path = cached_file(path_or_repo_id=dataset_dir[7:], filename=DATA_CONFIG, repo_type="dataset") + else: + config_path = os.path.join(dataset_dir, DATA_CONFIG) + + try: + with open(config_path) as f: + dataset_info = json.load(f) + except Exception as err: + if len(dataset_names) != 0: + raise ValueError(f"Cannot open {config_path} due to {str(err)}.") + + dataset_info = None + + dataset_list: List["DatasetAttr"] = [] + for name in dataset_names: + if dataset_info is None: # dataset_dir is ONLINE + if use_modelscope(): + load_from = "ms_hub" + elif use_openmind(): + load_from = "om_hub" + else: + load_from = "hf_hub" + dataset_attr = DatasetAttr(load_from, dataset_name=name) + dataset_list.append(dataset_attr) + continue + + if name not in dataset_info: + raise ValueError(f"Undefined dataset {name} in {DATA_CONFIG}.") + + has_hf_url = "hf_hub_url" in dataset_info[name] + has_ms_url = "ms_hub_url" in dataset_info[name] + has_om_url = "om_hub_url" in dataset_info[name] + + if has_hf_url or has_ms_url or has_om_url: + if has_ms_url and (use_modelscope() or not has_hf_url): + dataset_attr = DatasetAttr("ms_hub", dataset_name=dataset_info[name]["ms_hub_url"]) + elif has_om_url and (use_openmind() or not has_hf_url): + dataset_attr = DatasetAttr("om_hub", dataset_name=dataset_info[name]["om_hub_url"]) + else: + dataset_attr = DatasetAttr("hf_hub", dataset_name=dataset_info[name]["hf_hub_url"]) + elif "script_url" in dataset_info[name]: + dataset_attr = DatasetAttr("script", dataset_name=dataset_info[name]["script_url"]) + else: + dataset_attr = DatasetAttr("file", dataset_name=dataset_info[name]["file_name"]) + + dataset_attr.set_attr("formatting", dataset_info[name], default="alpaca") + dataset_attr.set_attr("ranking", dataset_info[name], default=False) + dataset_attr.set_attr("subset", dataset_info[name]) + dataset_attr.set_attr("split", dataset_info[name], default="train") + dataset_attr.set_attr("folder", dataset_info[name]) + dataset_attr.set_attr("num_samples", dataset_info[name]) + + if "columns" in dataset_info[name]: + column_names = ["system", "tools", "images", "videos", "chosen", "rejected", "kto_tag"] + if dataset_attr.formatting == "alpaca": + column_names.extend(["prompt", "query", "response", "history"]) + else: + column_names.extend(["messages"]) + + for column_name in column_names: + dataset_attr.set_attr(column_name, dataset_info[name]["columns"]) + + if dataset_attr.formatting == "sharegpt" and "tags" in dataset_info[name]: + tag_names = ( + "role_tag", + "content_tag", + "user_tag", + "assistant_tag", + "observation_tag", + "function_tag", + "system_tag", + ) + for tag in tag_names: + dataset_attr.set_attr(tag, dataset_info[name]["tags"]) + + dataset_list.append(dataset_attr) + + return dataset_list + + +def change_checkbox(checkbox, x, lang=None, tag=None, choice_type=None): + if choice_type is None: + choice_type = LOCALES[tag][lang] + if (x == "" or x is None) and 
choice_type in checkbox: + checkbox.remove(choice_type) + elif (x != "" or (tag != "question_tag" and x is not None)) and choice_type not in checkbox: + checkbox.append(choice_type) + return checkbox + + +def chat_ready(checkbox, state): + + if state["model"] not in checkbox: + return ChatState.MISSING_MODEL + + if state["question"] not in checkbox: + return ChatState.MISSING_QUESTION + + if state["image"] not in checkbox and state["video"] not in checkbox: + return ChatState.MISSING_FILE + + return ChatState.READY + + +# train +def get_device_count(): + return paddle.device.cuda.device_count() + + +# dataset +def _load_single_dataset( + dataset_attr: "DatasetAttr", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", +) -> Union["Dataset", "IterableDataset"]: + r""" + Loads a single dataset and aligns it to the standard format. + """ + logger(20, f"Loading dataset {dataset_attr}...") + data_path, data_name, data_dir, data_files = None, None, None, None + if dataset_attr.load_from in ["hf_hub", "ms_hub", "om_hub"]: + data_path = dataset_attr.dataset_name + data_name = dataset_attr.subset + data_dir = dataset_attr.folder + + elif dataset_attr.load_from == "script": + data_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name) + data_name = dataset_attr.subset + data_dir = dataset_attr.folder + + elif dataset_attr.load_from == "file": + data_files = [] + local_path = os.path.join(data_args.dataset_dir, dataset_attr.dataset_name) + if os.path.isdir(local_path): # is directory + for file_name in os.listdir(local_path): + data_files.append(os.path.join(local_path, file_name)) + elif os.path.isfile(local_path): # is file + data_files.append(local_path) + else: + raise ValueError(f"File {local_path} not found.") + + data_path = FILEEXT2TYPE.get(os.path.splitext(data_files[0])[-1][1:], None) + if data_path is None: + raise ValueError("Allowed file types: {}.".format(",".join(FILEEXT2TYPE.keys()))) + + if any(data_path != FILEEXT2TYPE.get(os.path.splitext(data_file)[-1][1:], None) for data_file in data_files): + raise ValueError("File types should be identical.") + else: + raise NotImplementedError(f"Unknown load type: {dataset_attr.load_from}.") + + if dataset_attr.load_from == "ms_hub": + # require_version("modelscope>=1.11.0", "To fix: pip install modelscope>=1.11.0") + from modelscope import MsDataset # type: ignore + from modelscope.utils.config_ds import MS_DATASETS_CACHE # type: ignore + + cache_dir = model_args.cache_dir or MS_DATASETS_CACHE + dataset = MsDataset.load( + dataset_name=data_path, + subset_name=data_name, + data_dir=data_dir, + data_files=data_files, + split=dataset_attr.split, + cache_dir=cache_dir, + token=model_args.ms_hub_token, + use_streaming=data_args.streaming, + ) + if isinstance(dataset, MsDataset): + dataset = dataset.to_hf_dataset() + + elif dataset_attr.load_from == "om_hub": + # require_version("openmind>=0.8.0", "To fix: pip install openmind>=0.8.0") + from openmind import OmDataset # type: ignore + from openmind.utils.hub import OM_DATASETS_CACHE # type: ignore + + cache_dir = model_args.cache_dir or OM_DATASETS_CACHE + dataset = OmDataset.load_dataset( + path=data_path, + name=data_name, + data_dir=data_dir, + data_files=data_files, + split=dataset_attr.split, + cache_dir=cache_dir, + token=model_args.om_hub_token, + streaming=data_args.streaming, + ) + else: + dataset = load_dataset( + path=data_path, + name=data_name, + data_dir=data_dir, + data_files=data_files, + split=dataset_attr.split, 
+ cache_dir=model_args.cache_dir, + token=model_args.hf_hub_token, + streaming=data_args.streaming, + num_proc=data_args.preprocessing_num_workers, + ) + + if dataset_attr.num_samples is not None and not data_args.streaming: + target_num = dataset_attr.num_samples + indexes = np.random.permutation(len(dataset))[:target_num] # all samples should be included + target_num -= len(indexes) + if target_num > 0: + expand_indexes = np.random.choice(len(dataset), target_num) + indexes = np.concatenate((indexes, expand_indexes), axis=0) + + assert len(indexes) == dataset_attr.num_samples, "Sample num mismatched." + dataset = dataset.select(indexes) + logger(20, f"Sampled {dataset_attr.num_samples} examples from dataset {dataset_attr}.") + + if data_args.max_samples is not None: # truncate dataset + max_samples = min(data_args.max_samples, len(dataset)) + dataset = dataset.select(range(max_samples)) + + return align_dataset(dataset, dataset_attr, data_args, training_args) + + +def _get_merged_dataset( + dataset_names: Optional[Sequence[str]], + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["pt", "sft", "rm", "ppo", "kto"], +) -> Optional[Union["Dataset", "IterableDataset"]]: + r""" + Gets the merged datasets in the standard format. + """ + if dataset_names is None: + return None + + datasets = [] + for dataset_attr in get_dataset_list(dataset_names, data_args.dataset_dir): + if (stage == "rm" and dataset_attr.ranking is False) or (stage != "rm" and dataset_attr.ranking is True): + raise ValueError("The dataset is not applicable in the current training stage.") + + datasets.append(_load_single_dataset(dataset_attr, model_args, data_args, training_args)) + + return merge_dataset(datasets, data_args, seed=training_args.seed) + + +def _get_preprocessed_dataset( + dataset: Optional[Union["Dataset", "IterableDataset"]], + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["sft"], + template: "Template", + tokenizer: "PretrainedTokenizer", + processor: Optional["ProcessorMixin"] = None, + is_eval: bool = False, +) -> Optional[Union["Dataset", "IterableDataset"]]: + r""" + Preprocesses the dataset, including format checking and tokenization. 
+ """ + if dataset is None: + return None + + preprocess_func, print_function = get_preprocess_and_print_func( + data_args, stage, template, tokenizer, processor, do_generate=(training_args.predict_with_generate and is_eval) + ) + column_names = list(next(iter(dataset)).keys()) + kwargs = {} + if not data_args.streaming: + # kwargs = dict( + # num_proc=data_args.preprocessing_num_workers, + # load_from_cache_file=(not data_args.overwrite_cache) or (training_args.local_process_index != 0), + # desc="Running tokenizer on dataset", + # ) + kwargs = dict( + num_proc=1, + load_from_cache_file=False, + desc="Running tokenizer on dataset", + ) + + dataset = dataset.map( + preprocess_func, + batched=True, + batch_size=data_args.preprocessing_batch_size, + remove_columns=column_names, + **kwargs, + ) + + if training_args.should_log: + try: + print("eval example:" if is_eval else "training example:") + print_function(next(iter(dataset))) + except StopIteration: + if stage == "pt": + raise RuntimeError("Cannot find sufficient samples, consider increasing dataset size.") + else: + raise RuntimeError("Cannot find valid samples, check `data/README.md` for the data format.") + + return dataset + + +def get_dataset( + template: "Template", + model_args: "ModelArguments", + data_args: "DataArguments", + training_args: "Seq2SeqTrainingArguments", + stage: Literal["sft"], + tokenizer: "PretrainedTokenizer", + processor: Optional["ProcessorMixin"] = None, +) -> "DatasetModule": + r""" + Gets the train dataset and optionally gets the evaluation dataset. + """ + # Load tokenized dataset + if data_args.tokenized_path is not None: + if has_tokenized_data(data_args.tokenized_path): + logger(30, "Loading dataset from disk will ignore other data arguments.") + dataset_dict: "DatasetDict" = load_from_disk(data_args.tokenized_path) + logger(20, f"Loaded tokenized dataset from {data_args.tokenized_path}.") + + dataset_module: Dict[str, "Dataset"] = {} + if "train" in dataset_dict: + dataset_module["train_dataset"] = dataset_dict["train"] + + if "validation" in dataset_dict: + dataset_module["eval_dataset"] = dataset_dict["validation"] + + if data_args.streaming: + dataset_module = {k: v.to_iterable_dataset() for k, v in dataset_module.items()} + + return dataset_module + + if data_args.streaming: + raise ValueError("Turn off `streaming` when saving dataset to disk.") + + # Load and preprocess dataset + with training_args.main_process_first(desc="load dataset"): + dataset = _get_merged_dataset(data_args.dataset, model_args, data_args, training_args, stage) + eval_dataset = _get_merged_dataset(data_args.eval_dataset, model_args, data_args, training_args, stage) + + with training_args.main_process_first(desc="pre-process dataset"): + dataset = _get_preprocessed_dataset( + dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=False + ) + eval_dataset = _get_preprocessed_dataset( + eval_dataset, data_args, training_args, stage, template, tokenizer, processor, is_eval=True + ) + + if data_args.val_size > 1e-6: + dataset_dict = split_dataset(dataset, data_args, seed=training_args.seed) + else: + dataset_dict = {} + if dataset is not None: + if data_args.streaming: + dataset = dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) + + dataset_dict["train"] = dataset + + if eval_dataset is not None: + if data_args.streaming: + eval_dataset = eval_dataset.shuffle(buffer_size=data_args.buffer_size, seed=training_args.seed) + + dataset_dict["validation"] = eval_dataset + + 
dataset_dict = DatasetDict(dataset_dict) + + if data_args.tokenized_path is not None: + if training_args.should_save: + dataset_dict.save_to_disk(data_args.tokenized_path) + logger(20, f"Tokenized dataset saved at {data_args.tokenized_path}.") + logger(20, f"Please restart the training with `tokenized_path: {data_args.tokenized_path}`.") + + sys.exit(0) + + dataset_module = {} + if "train" in dataset_dict: + dataset_module["train_dataset"] = dataset_dict["train"] + + if "validation" in dataset_dict: + dataset_module["eval_dataset"] = dataset_dict["validation"] + + return dataset_module + + +def list_config_paths(current_time: str) -> "gr.Dropdown": + r""" + Lists all the saved configuration files. + """ + config_files = [f"{current_time}.yaml"] + if os.path.isdir(DEFAULT_CONFIG_DIR): + for file_name in os.listdir(DEFAULT_CONFIG_DIR): + if file_name.endswith(".yaml") and file_name not in config_files: + config_files.append(file_name) + + return gr.Dropdown(choices=config_files) + + +def get_time() -> str: + r""" + Gets current date and time. + """ + return datetime.now().strftime(r"%Y-%m-%d-%H-%M-%S") + + +def list_output_dirs(model_name: Optional[str], finetuning_type: str, current_time: str) -> "gr.Dropdown": + r""" + Lists all the directories that can resume from. + """ + output_dirs = [f"train_{current_time}"] + if model_name: + save_dir = get_save_dir(model_name, finetuning_type) + if save_dir and os.path.isdir(save_dir): + for output in os.listdir(save_dir): + output_dirs.append(output) + return gr.Dropdown(choices=output_dirs) + + +def list_checkpoint_item(model_name, finetune_type, checkpoint_path): + items = [] + if checkpoint_path == "" or not isinstance(checkpoint_path, str): + return gr.update(choices=items) + cur_path = os.path.join(get_save_dir(model_name, finetune_type), checkpoint_path) + if not os.path.exists(cur_path): + return gr.update(choices=items) + for ckpt in os.listdir(cur_path): + if "checkpoint" in ckpt: + items.append(ckpt) + elif "lora_model_state.pdparams" in ckpt: + items.append(FINAL_CHECKPOINT_NAME) + items.sort() + return gr.update(choices=items) diff --git a/paddlemix/MULLM_WebUI/components/__init__.py b/paddlemix/MULLM_WebUI/components/__init__.py new file mode 100644 index 000000000..986fec615 --- /dev/null +++ b/paddlemix/MULLM_WebUI/components/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from .chatbot import create_chat_box +from .infer import create_infer_tab +from .top import create_top +from .train import create_train_tab + +__all__ = [ + "create_chat_box", + "create_infer_tab", + "create_top", + "create_train_tab", +] diff --git a/paddlemix/MULLM_WebUI/components/chatbot.py b/paddlemix/MULLM_WebUI/components/chatbot.py new file mode 100644 index 000000000..13239b4e0 --- /dev/null +++ b/paddlemix/MULLM_WebUI/components/chatbot.py @@ -0,0 +1,101 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Dict, Tuple + +from ..extras.packages import is_gradio_available + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + from ..engine import Engine + + +def create_chat_box( + engine: "Engine", visible: bool = False +) -> Tuple["Component", "Component", Dict[str, "Component"]]: + with gr.Column(visible=visible) as chat_box: + chatbot = gr.Chatbot(show_copy_button=True) + messages = gr.State([]) + with gr.Row(): + with gr.Column(scale=4): + with gr.Row(): + with gr.Column(): + role = gr.Dropdown(choices=["user", "observation"], value="user") + # role = gr.Dropdown(choices=[Role.USER.value, Role.OBSERVATION.value], value="") + + system = gr.Textbox(show_label=False) + tools = gr.Textbox(show_label=False, lines=3) + + with gr.Column() as mm_box: + with gr.Tab("Image"): + image = gr.Image(sources=["upload"], type="pil") + + with gr.Tab("Video"): + video = gr.Video(sources=["upload"]) + + query = gr.Textbox(show_label=False, lines=8) + submit_btn = gr.Button(variant="primary") + + with gr.Column(scale=1): + max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1) + top_p = gr.Slider(minimum=0.01, maximum=1.0, value=0.7, step=0.01) + temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01) + clear_btn = gr.Button() + + tools.input(inputs=[tools, engine.manager.get_elem_by_id("top.lang")]) + + submit_btn.click(engine.chatter.append, [chatbot, messages, role, query], [chatbot, messages, query],).then( + engine.chatter.stream, + [chatbot, messages, system, tools, image, video, max_new_tokens, top_p, temperature], + [chatbot, messages], + ) + clear_btn.click(lambda: ([], []), outputs=[chatbot, messages]) + + return ( + chatbot, + messages, + dict( + chat_box=chat_box, + role=role, + system=system, + tools=tools, + mm_box=mm_box, + image=image, + video=video, + query=query, + submit_btn=submit_btn, + max_new_tokens=max_new_tokens, + top_p=top_p, + temperature=temperature, + clear_btn=clear_btn, + ), + ) + + +def enable_chat_btn(checkbox): + if "Question" in checkbox and "Model" in checkbox and ("Video" in checkbox or "Image" in checkbox): + return gr.update(interactive=True) + else: + return gr.update(interactive=False) + + +def enable_checkpoint_box(checkpoint_path): + if isinstance(checkpoint_path, str) and checkpoint_path != "": + return gr.update(visible=True) + return gr.update(visible=False) diff --git a/paddlemix/MULLM_WebUI/components/data.py b/paddlemix/MULLM_WebUI/components/data.py new file mode 100644 index 000000000..aa54d4902 --- /dev/null +++ b/paddlemix/MULLM_WebUI/components/data.py @@ -0,0 +1,119 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import os +from typing import TYPE_CHECKING, Any, Dict, List, Tuple + +from ..extras.constants import DATA_CONFIG +from ..extras.packages import is_gradio_available + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + +PAGE_SIZE = 2 + + +def prev_page(page_index: int) -> int: + return page_index - 1 if page_index > 0 else page_index + + +def next_page(page_index: int, total_num: int) -> int: + return page_index + 1 if (page_index + 1) * PAGE_SIZE < total_num else page_index + + +def can_preview(dataset_dir: str, dataset: list) -> "gr.Button": + try: + with open(os.path.join(dataset_dir, DATA_CONFIG), encoding="utf-8") as f: + dataset_info = json.load(f) + except Exception: + return gr.Button(interactive=False) + + if len(dataset) == 0 or "file_name" not in dataset_info[dataset[0]]: + return gr.Button(interactive=False) + + data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]) + if os.path.isfile(data_path) or (os.path.isdir(data_path) and os.listdir(data_path)): + return gr.Button(interactive=True) + else: + return gr.Button(interactive=False) + + +def _load_data_file(file_path: str) -> List[Any]: + with open(file_path, encoding="utf-8") as f: + if file_path.endswith(".json"): + return json.load(f) + elif file_path.endswith(".jsonl"): + return [json.loads(line) for line in f] + else: + return list(f) + + +def get_preview(dataset_dir: str, dataset: list, page_index: int) -> Tuple[int, list, "gr.Column"]: + with open(os.path.join(dataset_dir, DATA_CONFIG), encoding="utf-8") as f: + dataset_info = json.load(f) + + data_path = os.path.join(dataset_dir, dataset_info[dataset[0]]["file_name"]) + if os.path.isfile(data_path): + data = _load_data_file(data_path) + else: + data = [] + for file_name in os.listdir(data_path): + data.extend(_load_data_file(os.path.join(data_path, file_name))) + + return len(data), data[PAGE_SIZE * page_index : PAGE_SIZE * (page_index + 1)], gr.Column(visible=True) + + +def create_preview_box(dataset_dir: "gr.Textbox", dataset: "gr.Dropdown") -> Dict[str, "Component"]: + data_preview_btn = gr.Button(interactive=False, scale=1) + with gr.Column(visible=False, elem_classes="modal-box") as preview_box: + with gr.Row(): + preview_count = gr.Number(value=0, interactive=False, precision=0) + page_index = gr.Number(value=0, interactive=False, precision=0) + + with gr.Row(): + prev_btn = gr.Button() + next_btn = gr.Button() + close_btn = gr.Button() + + with gr.Row(): + preview_samples = gr.JSON() + + dataset.change(can_preview, [dataset_dir, dataset], [data_preview_btn], queue=False).then( + lambda: 0, outputs=[page_index], queue=False + ) + data_preview_btn.click( + get_preview, [dataset_dir, dataset, page_index], [preview_count, preview_samples, preview_box], queue=False + ) + prev_btn.click(prev_page, [page_index], [page_index], queue=False).then( + get_preview, [dataset_dir, dataset, page_index], [preview_count, preview_samples, preview_box], queue=False + ) + next_btn.click(next_page, [page_index, preview_count], [page_index], queue=False).then( + get_preview, 
[dataset_dir, dataset, page_index], [preview_count, preview_samples, preview_box], queue=False + ) + close_btn.click(lambda: gr.Column(visible=False), outputs=[preview_box], queue=False) + return dict( + data_preview_btn=data_preview_btn, + preview_count=preview_count, + page_index=page_index, + prev_btn=prev_btn, + next_btn=next_btn, + close_btn=close_btn, + preview_samples=preview_samples, + ) diff --git a/paddlemix/MULLM_WebUI/components/infer.py b/paddlemix/MULLM_WebUI/components/infer.py new file mode 100644 index 000000000..f4aa8afd0 --- /dev/null +++ b/paddlemix/MULLM_WebUI/components/infer.py @@ -0,0 +1,153 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Dict + +from ..common import change_checkbox, list_checkpoint_item +from ..extras.packages import is_gradio_available +from .chatbot import enable_checkpoint_box + +if is_gradio_available(): + import gradio as gr + + +if TYPE_CHECKING: + from gradio.components import Component + + from ..engine import Engine + + +def create_infer_tab(engine: "Engine") -> Dict[str, "Component"]: + input_elems = engine.manager.get_base_elems() + checkpoint_path: "gr.Dropdown" = engine.manager.get_elem_by_id("top.checkpoint_path") + + elem_dict = dict() + + with gr.Row(): + infer_dtype = gr.Dropdown(choices=["float16", "bfloat16", "float32"], value="float16") + ckpt_box = gr.Dropdown(value="", visible=False) + + with gr.Row(): + load_btn = gr.Button() + unload_btn = gr.Button() + + with gr.Row(): + with gr.Column(): + with gr.Tab("Image"): + image = gr.Image(type="pil", sources=["upload", "webcam", "clipboard"]) + + with gr.Tab("Video"): + video = gr.Video(sources=["upload"]) + + state_checkbox_group = gr.CheckboxGroup(value=[], interactive=False) + info_box = gr.Textbox(show_label=True, interactive=False) + + with gr.Column(scale=1): + question_box = gr.Textbox(value="", interactive=True) + question_type = gr.Dropdown(choices=["image", "video"], value="image") + seed_box = gr.Textbox(value=42, interactive=True) + max_new_tokens = gr.Slider(minimum=8, maximum=4096, value=512, step=1) + top_p = gr.Slider(minimum=0.01, maximum=1.0, value=0.7, step=0.01) + temperature = gr.Slider(minimum=0.01, maximum=1.5, value=0.95, step=0.01) + chat_btn = gr.Button() + clear_btn = gr.Button() + + chatbot = gr.Chatbot(show_copy_button=True) + messages = gr.State([]) + + input_elems.update({image}) + input_elems.update({video}) + input_elems.update({chatbot}) + input_elems.update({question_box, info_box}) + input_elems.update({messages}) + input_elems.update({infer_dtype, ckpt_box}) + input_elems.update({state_checkbox_group}) + input_elems.update({question_type, max_new_tokens, top_p, temperature, seed_box}) + elem_dict.update( + dict( + infer_dtype=infer_dtype, + ckpt_box=ckpt_box, + load_btn=load_btn, + unload_btn=unload_btn, + info_box=info_box, + question_box=question_box, + seed_box=seed_box, + question_type=question_type, + max_new_tokens=max_new_tokens, + 
top_p=top_p, + temperature=temperature, + chat_btn=chat_btn, + clear_btn=clear_btn, + state_checkbox_group=state_checkbox_group, + image=image, + video=video, + chatbot=chatbot, + messages=messages, + ) + ) + + # Button + load_btn.click(engine.chatter.load_model, input_elems, [info_box, state_checkbox_group]) + unload_btn.click(engine.chatter.unload_model, input_elems, [info_box, state_checkbox_group]) + + clear_btn.click(lambda: ([], [], "", None, None), outputs=[chatbot, messages, question_box, image, video]) + + chat_btn.click( + engine.chatter.multi_round_chat, + inputs=[ + engine.manager._id_to_elem["top.lang"], + chatbot, + messages, + question_box, + question_type, + image, + video, + state_checkbox_group, + max_new_tokens, + top_p, + temperature, + seed_box, + info_box, + ], + outputs=[chatbot, messages, question_box, info_box, chat_btn], + ) + question_box.change( + change_checkbox, + inputs=[state_checkbox_group, question_box, engine.manager._id_to_elem["top.lang"], gr.State("question_tag")], + outputs=state_checkbox_group, + every=3, + ) + + image.change( + change_checkbox, + inputs=[state_checkbox_group, image, engine.manager._id_to_elem["top.lang"], gr.State("image_tag")], + outputs=state_checkbox_group, + ) + video.change( + change_checkbox, + inputs=[state_checkbox_group, video, engine.manager._id_to_elem["top.lang"], gr.State("video_tag")], + outputs=state_checkbox_group, + ) + checkpoint_path.change( + list_checkpoint_item, + [ + engine.manager._id_to_elem["top.model_name"], + engine.manager._id_to_elem["top.finetuning_type"], + engine.manager._id_to_elem["top.checkpoint_path"], + ], + [ckpt_box], + queue=False, + ).then(enable_checkpoint_box, inputs=[checkpoint_path], outputs=[ckpt_box], show_progress=False) + + return elem_dict diff --git a/paddlemix/MULLM_WebUI/components/top.py b/paddlemix/MULLM_WebUI/components/top.py new file mode 100644 index 000000000..bd73d2801 --- /dev/null +++ b/paddlemix/MULLM_WebUI/components/top.py @@ -0,0 +1,57 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
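+# Top bar used by the other tabs: language, model name and path, fine-tuning
+# method, checkpoint path and prompt template. Changing `model_name` first calls
+# `get_model_info` to refresh `model_path` and `template`, then `list_checkpoints`
+# to refresh `checkpoint_path`; focusing `checkpoint_path` re-lists the
+# checkpoints for the currently selected model and fine-tuning method.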
+ +from typing import TYPE_CHECKING, Dict + +import gradio as gr + +from ..common import get_model_info, list_checkpoints +from ..extras.constants import METHODS +from ..extras.template import TEMPLATES + +if TYPE_CHECKING: + from gradio.components import Component + + +def create_top() -> Dict[str, "Component"]: + # available_models = list(SUPPORTED_MODELS.keys()) + ["Custom"] + available_models = ["Qwen2-VL-2B-Instruct", "Qwen2-VL-7B-Instruct"] + with gr.Row(): + lang = gr.Dropdown(choices=["en", "zh"], scale=1, value="en") + model_name = gr.Dropdown(choices=available_models, scale=3, value="Qwen2-VL-2B-Instruct") + model_path = gr.Textbox(scale=3) + + with gr.Row(): + finetuning_type = gr.Dropdown(choices=METHODS, value="lora", scale=1) + checkpoint_path = gr.Dropdown(scale=6, value="") + + with gr.Row(): + template = gr.Dropdown(choices=list(TEMPLATES.keys()), value="qwen2_vl", scale=2) + model_name.change(get_model_info, [model_name], [model_path, template], queue=False).then( + list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False + ) + checkpoint_path.focus(list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False) + + finetuning_type.change(inputs=[finetuning_type], outputs=[finetuning_type]).then( + list_checkpoints, [model_name, finetuning_type], [checkpoint_path], queue=False + ) + + return dict( + lang=lang, + model_name=model_name, + model_path=model_path, + template=template, + finetuning_type=finetuning_type, + checkpoint_path=checkpoint_path, + ) diff --git a/paddlemix/MULLM_WebUI/components/train.py b/paddlemix/MULLM_WebUI/components/train.py new file mode 100644 index 000000000..10ec44818 --- /dev/null +++ b/paddlemix/MULLM_WebUI/components/train.py @@ -0,0 +1,232 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
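+# SFT training tab. Each group of widgets created below is registered twice:
+# in `input_elems`, the set of components whose values are forwarded to
+# `engine.runner`, and in `elem_dict`, the name -> component mapping returned to
+# the caller. `start_btn` launches `engine.runner.run_train_v2`, `stop_btn` calls
+# `set_abort`, and the hidden `resume_btn` re-attaches `engine.runner.monitor` to
+# stream `output_box`, `progress_bar` and `loss_viewer`.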
+ +from typing import TYPE_CHECKING, Dict + +import gradio as gr +from paddlenlp.trainer.trainer_utils import SchedulerType + +from ..common import ( + get_device_count, + list_checkpoints, + list_config_paths, + list_datasets, + list_output_dirs, +) +from ..components.data import create_preview_box +from ..extras.constants import DEFAULT_DATA_DIR, TRAINING_STAGES + +if TYPE_CHECKING: + from gradio.components import Component + + from ..engine import Engine + + +def create_train_tab(engine: "Engine") -> Dict[str, "Component"]: + input_elems = engine.manager.get_base_elems() + elem_dict = dict() + + with gr.Row(): + training_stage = gr.Dropdown( + choices=list(TRAINING_STAGES.keys()), value=list(TRAINING_STAGES.keys())[0], scale=1 + ) + dataset_dir = gr.Textbox(value=DEFAULT_DATA_DIR, scale=1) + dataset = gr.Dropdown(multiselect=True, allow_custom_value=True, scale=4) + preview_elems = create_preview_box(dataset_dir, dataset) + + input_elems.update({training_stage, dataset_dir, dataset}) + elem_dict.update(dict(training_stage=training_stage, dataset_dir=dataset_dir, dataset=dataset, **preview_elems)) + + with gr.Row(): + learning_rate = gr.Textbox(value="5e-5") + num_train_epochs = gr.Textbox(value="3.0") + max_grad_norm = gr.Textbox(value="1.0") + max_samples = gr.Textbox(value="100000") + compute_type = gr.Dropdown(choices=["bf16", "fp16", "fp32", "pure_bf16"], value="bf16") + + input_elems.update({learning_rate, num_train_epochs, max_grad_norm, max_samples, compute_type}) + elem_dict.update( + dict( + learning_rate=learning_rate, + num_train_epochs=num_train_epochs, + max_grad_norm=max_grad_norm, + max_samples=max_samples, + compute_type=compute_type, + ) + ) + + with gr.Row(): + cutoff_len = gr.Slider(minimum=4, maximum=131072, value=2048, step=1) + batch_size = gr.Slider(minimum=1, maximum=1024, value=1, step=1) + gradient_accumulation_steps = gr.Slider(minimum=1, maximum=1024, value=8, step=1) + val_size = gr.Slider(minimum=0, maximum=1, value=0, step=0.001) + lr_scheduler_type = gr.Dropdown(choices=[scheduler.value for scheduler in SchedulerType], value="constant") + + input_elems.update({cutoff_len, batch_size, gradient_accumulation_steps, val_size, lr_scheduler_type}) + elem_dict.update( + dict( + cutoff_len=cutoff_len, + batch_size=batch_size, + gradient_accumulation_steps=gradient_accumulation_steps, + val_size=val_size, + lr_scheduler_type=lr_scheduler_type, + ) + ) + + with gr.Accordion(open=False) as extra_tab: + with gr.Row(): + logging_steps = gr.Slider(minimum=1, maximum=1000, value=5, step=5) + save_steps = gr.Slider(minimum=10, maximum=5000, value=100, step=10) + eval_steps = gr.Slider(minimum=10, maximum=5000, value=100, step=10) + warmup_steps = gr.Slider(minimum=0, maximum=5000, value=0, step=1) + extra_args = gr.Textbox(value='{"optim": "adamw"}') + + input_elems.update( + { + logging_steps, + save_steps, + eval_steps, + warmup_steps, + extra_args, + } + ) + elem_dict.update( + dict( + extra_tab=extra_tab, + logging_steps=logging_steps, + eval_steps=eval_steps, + save_steps=save_steps, + warmup_steps=warmup_steps, + extra_args=extra_args, + ) + ) + + with gr.Accordion(open=False) as lora_tab: + with gr.Row(): + lora_rank = gr.Slider(minimum=1, maximum=1024, value=32, step=1) + lora_alpha = gr.Slider(minimum=1, maximum=2048, value=32, step=1) + lora_dropout = gr.Slider(minimum=0, maximum=1, value=0, step=0.01) + loraplus_lr_ratio = gr.Slider(minimum=0, maximum=64, value=1, step=0.01) + + with gr.Row(): + use_rslora = gr.Checkbox() + use_pissa = gr.Checkbox() + + 
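+    # Register the LoRA widgets, like the groups above, in both `input_elems`
+    # (values forwarded to the runner) and `elem_dict` (name -> component).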
input_elems.update( + { + lora_rank, + lora_alpha, + lora_dropout, + loraplus_lr_ratio, + use_rslora, + use_pissa, + } + ) + elem_dict.update( + dict( + lora_tab=lora_tab, + lora_rank=lora_rank, + lora_alpha=lora_alpha, + lora_dropout=lora_dropout, + loraplus_lr_ratio=loraplus_lr_ratio, + use_rslora=use_rslora, + use_pissa=use_pissa, + ) + ) + + with gr.Row(): + arg_save_btn = gr.Button() + arg_load_btn = gr.Button() + start_btn = gr.Button(variant="primary") + stop_btn = gr.Button(variant="stop") + + with gr.Row(): + with gr.Column(scale=3): + with gr.Row(): + current_time = gr.Textbox(visible=False, interactive=False) + output_dir = gr.Dropdown(allow_custom_value=True) + config_path = gr.Dropdown(allow_custom_value=True) + + with gr.Row(): + device_count = gr.Textbox(value=str(get_device_count() or 1), interactive=False) + + with gr.Row(): + resume_btn = gr.Checkbox(visible=False, interactive=False) + progress_bar = gr.Slider(visible=False, interactive=False) + + with gr.Row(): + output_box = gr.Textbox(interactive=False) + + with gr.Column(scale=1): + loss_viewer = gr.Plot() + + input_elems.update({output_dir, config_path, output_box}) + elem_dict.update( + dict( + arg_save_btn=arg_save_btn, + arg_load_btn=arg_load_btn, + start_btn=start_btn, + stop_btn=stop_btn, + current_time=current_time, + output_dir=output_dir, + config_path=config_path, + device_count=device_count, + resume_btn=resume_btn, + progress_bar=progress_bar, + output_box=output_box, + loss_viewer=loss_viewer, + ) + ) + output_elems = [output_box, progress_bar, loss_viewer] + + start_btn.click(engine.runner.run_train_v2, input_elems, output_elems) + stop_btn.click(engine.runner.set_abort) + resume_btn.change(engine.runner.monitor, outputs=output_elems, concurrency_limit=None) + + lang = engine.manager.get_elem_by_id("top.lang") + model_name: "gr.Dropdown" = engine.manager.get_elem_by_id("top.model_name") + finetuning_type: "gr.Dropdown" = engine.manager.get_elem_by_id("top.finetuning_type") + + arg_save_btn.click(engine.runner.save_args, input_elems, output_elems, concurrency_limit=None) + arg_load_btn.click( + engine.runner.load_args, [lang, config_path], list(input_elems) + [output_box], concurrency_limit=None + ) + + dataset.focus(list_datasets, [dataset_dir, training_stage], [dataset], queue=False) + model_name.change( + list_checkpoints, + [ + model_name, + finetuning_type, + ], + [output_dir], + queue=False, + ) + finetuning_type.change(list_checkpoints, [model_name, finetuning_type], [output_dir], queue=False) + output_dir.change( + list_output_dirs, + [model_name, finetuning_type, current_time], + [output_dir], + concurrency_limit=None, + queue=False, + ) + output_dir.input( + engine.runner.check_output_dir, + [lang, model_name, finetuning_type, output_dir], + [output_box], + concurrency_limit=None, + ) + config_path.change(list_config_paths, [current_time], [config_path], queue=False) + + return elem_dict diff --git a/paddlemix/MULLM_WebUI/css.py b/paddlemix/MULLM_WebUI/css.py new file mode 100644 index 000000000..13794cf47 --- /dev/null +++ b/paddlemix/MULLM_WebUI/css.py @@ -0,0 +1,41 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +CSS = r""" +.duplicate-button { + margin: auto !important; + color: white !important; + background: black !important; + border-radius: 100vh !important; +} + +.modal-box { + position: fixed !important; + top: 50%; + left: 50%; + transform: translate(-50%, -50%); /* center horizontally */ + max-width: 1000px; + max-height: 750px; + overflow-y: auto; + background-color: var(--input-background-fill); + flex-wrap: nowrap !important; + border: 2px solid black !important; + z-index: 1000; + padding: 10px; +} + +.dark .modal-box { + border: 2px solid white !important; +} +""" diff --git a/paddlemix/MULLM_WebUI/engine.py b/paddlemix/MULLM_WebUI/engine.py new file mode 100644 index 000000000..bc1dd3128 --- /dev/null +++ b/paddlemix/MULLM_WebUI/engine.py @@ -0,0 +1,73 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import TYPE_CHECKING, Any, Dict + +from .chatter import WebChatModel +from .common import get_time +from .locales import LOCALES +from .manager import Manager +from .runner import Runner + +if TYPE_CHECKING: + from gradio.components import Component + + +class Engine: + def __init__(self, demo_mode: bool = False, pure_chat: bool = False) -> None: + self.demo_mode = demo_mode + self.pure_chat = pure_chat + self.manager = Manager() + self.runner = Runner(self.manager, demo_mode) + self.chatter = WebChatModel(self.manager, demo_mode, lazy_init=(not pure_chat)) + + def _update_component(self, input_dict: Dict[str, Dict[str, Any]]) -> Dict["Component", "Component"]: + r""" + Gets the dict to update the components. 
+ """ + output_dict: Dict["Component", "Component"] = {} + for elem_id, elem_attr in input_dict.items(): + elem = self.manager.get_elem_by_id(elem_id) + output_dict[elem] = elem.__class__(**elem_attr) + + return output_dict + + def resume(self): + init_dict = { + "top.lang": {"value": "zh"}, + "top.model_name": {"value": "Qwen2-VL-2B-Instruct"}, + "top.model_path": {"value": "Qwen/Qwen2-VL-2B-Instruct"}, + } + + if not self.pure_chat: + current_time = get_time() + init_dict["train.current_time"] = {"value": current_time} + init_dict["train.output_dir"] = {"value": f"train_{current_time}"} + init_dict["train.config_path"] = {"value": f"{current_time}.yaml"} + + yield self._update_component(init_dict) + + if self.runner.running and not self.pure_chat: + yield {elem: elem.__class__(value=value) for elem, value in self.runner.running_data.items()} + if self.runner.do_train: + yield self._update_component({"train.resume_btn": {"value": True}}) + else: + yield self._update_component({"eval.resume_btn": {"value": True}}) + + def change_lang(self, lang: str): + return { + elem: elem.__class__(**LOCALES[elem_name][lang]) + for elem_name, elem in self.manager.get_elem_iter() + if elem_name in LOCALES + } diff --git a/paddlemix/MULLM_WebUI/extras/__init__.py b/paddlemix/MULLM_WebUI/extras/__init__.py new file mode 100644 index 000000000..fd05a9208 --- /dev/null +++ b/paddlemix/MULLM_WebUI/extras/__init__.py @@ -0,0 +1,13 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. diff --git a/paddlemix/MULLM_WebUI/extras/args.py b/paddlemix/MULLM_WebUI/extras/args.py new file mode 100644 index 000000000..6a546d6e9 --- /dev/null +++ b/paddlemix/MULLM_WebUI/extras/args.py @@ -0,0 +1,872 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +from dataclasses import dataclass, field, fields +from typing import Any, Dict, List, Literal, Optional, Tuple + +import paddle +from paddlenlp.trainer import ( + PdArgumentParser, + Seq2SeqTrainingArguments, + get_last_checkpoint, +) +from typing_extensions import Self +from yaml import safe_dump, safe_load + +from ...utils.log import logger +from .constants import CHECKPOINT_NAMES +from .training import get_current_device + + +@dataclass +class DataArguments: + r""" + Arguments pertaining to what data we are going to input our model for training and evaluation. 
+ """ + + template: Optional[str] = field( + default=None, + metadata={"help": "Which template to use for constructing prompts in training and inference."}, + ) + dataset: Optional[str] = field( + default=None, + metadata={"help": "The name of dataset(s) to use for training. Use commas to separate multiple datasets."}, + ) + eval_dataset: Optional[str] = field( + default=None, + metadata={"help": "The name of dataset(s) to use for evaluation. Use commas to separate multiple datasets."}, + ) + dataset_dir: str = field( + default="data", + metadata={"help": "Path to the folder containing the datasets."}, + ) + image_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to the folder containing the images or videos. Defaults to `dataset_dir`."}, + ) + cutoff_len: int = field( + default=2048, + metadata={"help": "The cutoff length of the tokenized inputs in the dataset."}, + ) + train_on_prompt: bool = field( + default=False, + metadata={"help": "Whether or not to disable the mask on the prompt."}, + ) + mask_history: bool = field( + default=False, + metadata={"help": "Whether or not to mask the history and train on the last turn only."}, + ) + streaming: bool = field( + default=False, + metadata={"help": "Enable dataset streaming."}, + ) + buffer_size: int = field( + default=16384, + metadata={"help": "Size of the buffer to randomly sample examples from in dataset streaming."}, + ) + mix_strategy: Literal["concat", "interleave_under", "interleave_over"] = field( + default="concat", + metadata={"help": "Strategy to use in dataset mixing (concat/interleave) (undersampling/oversampling)."}, + ) + interleave_probs: Optional[str] = field( + default=None, + metadata={"help": "Probabilities to sample data from datasets. Use commas to separate multiple datasets."}, + ) + overwrite_cache: bool = field( + default=False, + metadata={"help": "Overwrite the cached training and evaluation sets."}, + ) + preprocessing_batch_size: int = field( + default=1000, + metadata={"help": "The number of examples in one group in pre-processing."}, + ) + preprocessing_num_workers: Optional[int] = field( + default=None, + metadata={"help": "The number of processes to use for the pre-processing."}, + ) + max_samples: Optional[int] = field( + default=None, + metadata={"help": "For debugging purposes, truncate the number of examples for each dataset."}, + ) + eval_num_beams: Optional[int] = field( + default=None, + metadata={"help": "Number of beams to use for evaluation. This argument will be passed to `model.generate`"}, + ) + ignore_pad_token_for_loss: bool = field( + default=True, + metadata={"help": "Whether or not to ignore the tokens corresponding to the pad label in loss computation."}, + ) + val_size: float = field( + default=0.0, + metadata={"help": "Size of the development set, should be an integer or a float in range `[0,1)`."}, + ) + packing: Optional[bool] = field( + default=None, + metadata={"help": "Enable sequences packing in training. Will automatically enable in pre-training."}, + ) + neat_packing: bool = field( + default=False, + metadata={"help": "Enable sequence packing without cross-attention."}, + ) + tool_format: Optional[str] = field( + default=None, + metadata={"help": "Tool format to use for constructing function calling examples."}, + ) + tokenized_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Path to save or load the tokenized datasets. " + "If tokenized_path not exists, it will save the tokenized datasets. 
" + "If tokenized_path exists, it will load the tokenized datasets." + ) + }, + ) + + def __post_init__(self): + def split_arg(arg): + if isinstance(arg, str): + return [item.strip() for item in arg.split(",")] + return arg + + self.dataset = split_arg(self.dataset) + self.eval_dataset = split_arg(self.eval_dataset) + + if self.image_dir is None: + self.image_dir = self.dataset_dir + + if self.dataset is None and self.val_size > 1e-6: + raise ValueError("Cannot specify `val_size` if `dataset` is None.") + + if self.eval_dataset is not None and self.val_size > 1e-6: + raise ValueError("Cannot specify `val_size` if `eval_dataset` is not None.") + + if self.interleave_probs is not None: + if self.mix_strategy == "concat": + raise ValueError("`interleave_probs` is only valid for interleaved mixing.") + + self.interleave_probs = list(map(float, split_arg(self.interleave_probs))) + if self.dataset is not None and len(self.dataset) != len(self.interleave_probs): + raise ValueError("The length of dataset and interleave probs should be identical.") + + if self.eval_dataset is not None and len(self.eval_dataset) != len(self.interleave_probs): + raise ValueError("The length of eval dataset and interleave probs should be identical.") + + if self.streaming and self.val_size > 1e-6 and self.val_size < 1: + raise ValueError("Streaming mode should have an integer val size.") + + if self.streaming and self.max_samples is not None: + raise ValueError("`max_samples` is incompatible with `streaming`.") + + if self.mask_history and self.train_on_prompt: + raise ValueError("`mask_history` is incompatible with `train_on_prompt`.") + + +@dataclass +class ProcessorArguments: + r""" + Arguments pertaining to the image processor. + """ + + image_resolution: int = field( + default=512 * 512, + metadata={"help": "Keeps the number of pixels of image below this resolution."}, + ) + video_resolution: int = field( + default=128 * 128, + metadata={"help": "Keeps the number of pixels of video below this resolution."}, + ) + video_fps: float = field( + default=2.0, + metadata={"help": "The frames to sample per second for video inputs."}, + ) + video_maxlen: int = field( + default=64, + metadata={"help": "The maximum number of sampled frames for video inputs."}, + ) + + +@dataclass +class ExportArguments: + r""" + Arguments pertaining to the model export. 
+ """ + + export_dir: Optional[str] = field( + default=None, + metadata={"help": "Path to the directory to save the exported model."}, + ) + export_size: int = field( + default=1, + metadata={"help": "The file shard size (in GB) of the exported model."}, + ) + export_device: Literal["cpu", "auto"] = field( + default="cpu", + metadata={"help": "The device used in model export, use `auto` to accelerate exporting."}, + ) + export_quantization_bit: Optional[int] = field( + default=None, + metadata={"help": "The number of bits to quantize the exported model."}, + ) + export_quantization_dataset: Optional[str] = field( + default=None, + metadata={"help": "Path to the dataset or dataset name to use in quantizing the exported model."}, + ) + export_quantization_nsamples: int = field( + default=128, + metadata={"help": "The number of samples used for quantization."}, + ) + export_quantization_maxlen: int = field( + default=1024, + metadata={"help": "The maximum length of the model inputs used for quantization."}, + ) + export_legacy_format: bool = field( + default=False, + metadata={"help": "Whether or not to save the `.bin` files instead of `.safetensors`."}, + ) + export_hub_model_id: Optional[str] = field( + default=None, + metadata={"help": "The name of the repository if push the model to the Hugging Face hub."}, + ) + + +@dataclass +class VllmArguments: + r""" + Arguments pertaining to the vLLM worker. + """ + + vllm_maxlen: int = field( + default=4096, + metadata={"help": "Maximum sequence (prompt + response) length of the vLLM engine."}, + ) + vllm_gpu_util: float = field( + default=0.9, + metadata={"help": "The fraction of GPU memory in (0,1) to be used for the vLLM engine."}, + ) + vllm_enforce_eager: bool = field( + default=False, + metadata={"help": "Whether or not to disable CUDA graph in the vLLM engine."}, + ) + vllm_max_lora_rank: int = field( + default=32, + metadata={"help": "Maximum rank of all LoRAs in the vLLM engine."}, + ) + vllm_config: Optional[str] = field( + default=None, + metadata={"help": "Config to initialize the vllm engine. Please use JSON strings."}, + ) + + +@dataclass +class ModelArguments(ProcessorArguments, ExportArguments, VllmArguments): + r""" + Arguments pertaining to which model/config/tokenizer we are going to fine-tune or infer. + """ + + model_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": "Path to the model weight or identifier from huggingface.co/models or modelscope.cn/models." + }, + ) + adapter_name_or_path: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Path to the adapter weight or identifier from huggingface.co/models. " + "Use commas to separate multiple adapters." 
+ ) + }, + ) + adapter_folder: Optional[str] = field( + default=None, + metadata={"help": "The folder containing the adapter weights to load."}, + ) + cache_dir: Optional[str] = field( + default=None, + metadata={"help": "Where to store the pre-trained models downloaded from huggingface.co or modelscope.cn."}, + ) + use_fast_tokenizer: bool = field( + default=True, + metadata={"help": "Whether or not to use one of the fast tokenizer (backed by the tokenizers library)."}, + ) + resize_vocab: bool = field( + default=False, + metadata={"help": "Whether or not to resize the tokenizer vocab and the embedding layers."}, + ) + split_special_tokens: bool = field( + default=False, + metadata={"help": "Whether or not the special tokens should be split during the tokenization process."}, + ) + new_special_tokens: Optional[str] = field( + default=None, + metadata={"help": "Special tokens to be added into the tokenizer. Use commas to separate multiple tokens."}, + ) + model_revision: str = field( + default="main", + metadata={"help": "The specific model version to use (can be a branch name, tag name or commit id)."}, + ) + low_cpu_mem_usage: bool = field( + default=True, + metadata={"help": "Whether or not to use memory-efficient model loading."}, + ) + rope_scaling: Optional[Literal["linear", "dynamic"]] = field( + default=None, + metadata={"help": "Which scaling strategy should be adopted for the RoPE embeddings."}, + ) + flash_attn: Literal["auto", "disabled", "sdpa", "fa2"] = field( + default="auto", + metadata={"help": "Enable FlashAttention for faster training and inference."}, + ) + shift_attn: bool = field( + default=False, + metadata={"help": "Enable shift short attention (S^2-Attn) proposed by LongLoRA."}, + ) + mixture_of_depths: Optional[Literal["convert", "load"]] = field( + default=None, + metadata={"help": "Convert the model to mixture-of-depths (MoD) or load the MoD model."}, + ) + use_unsloth: bool = field( + default=False, + metadata={"help": "Whether or not to use unsloth's optimization for the LoRA training."}, + ) + use_unsloth_gc: bool = field( + default=False, + metadata={"help": "Whether or not to use unsloth's gradient checkpointing."}, + ) + enable_liger_kernel: bool = field( + default=False, + metadata={"help": "Whether or not to enable liger kernel for faster training."}, + ) + moe_aux_loss_coef: Optional[float] = field( + default=None, + metadata={"help": "Coefficient of the auxiliary router loss in mixture-of-experts model."}, + ) + disable_gradient_checkpointing: bool = field( + default=False, + metadata={"help": "Whether or not to disable gradient checkpointing."}, + ) + upcast_layernorm: bool = field( + default=False, + metadata={"help": "Whether or not to upcast the layernorm weights in fp32."}, + ) + upcast_lmhead_output: bool = field( + default=False, + metadata={"help": "Whether or not to upcast the output of lm_head in fp32."}, + ) + train_from_scratch: bool = field( + default=False, + metadata={"help": "Whether or not to randomly initialize the model weights."}, + ) + infer_backend: Literal["huggingface", "vllm"] = field( + default="huggingface", + metadata={"help": "Backend engine used at inference."}, + ) + offload_folder: str = field( + default="offload", + metadata={"help": "Path to offload model weights."}, + ) + use_cache: bool = field( + default=True, + metadata={"help": "Whether or not to use KV cache in generation."}, + ) + infer_dtype: Literal["auto", "float16", "bfloat16", "float32"] = field( + default="auto", + metadata={"help": "Data type for model 
weights and activations at inference."}, + ) + hf_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with Hugging Face Hub."}, + ) + ms_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with ModelScope Hub."}, + ) + om_hub_token: Optional[str] = field( + default=None, + metadata={"help": "Auth token to log in with Modelers Hub."}, + ) + print_param_status: bool = field( + default=False, + metadata={"help": "For debugging purposes, print the status of the parameters in the model."}, + ) + compute_dtype: Optional[paddle.dtype] = field( + default=None, + init=False, + metadata={"help": "Paddle data type for computing model outputs, derived from `fp/bf16`. Do not specify it."}, + ) + device_map: Optional[str] = field( + default=None, + init=False, + metadata={"help": "Device map for model placement, derived from training stage. Do not specify it."}, + ) + model_max_length: Optional[int] = field( + default=None, + init=False, + metadata={"help": "The maximum input length for model, derived from `cutoff_len`. Do not specify it."}, + ) + block_diag_attn: bool = field( + default=False, + init=False, + metadata={"help": "Whether use block diag attention or not, derived from `neat_packing`. Do not specify it."}, + ) + + def __post_init__(self): + if self.model_name_or_path is None: + raise ValueError("Please provide `model_name_or_path`.") + + if self.split_special_tokens and self.use_fast_tokenizer: + raise ValueError("`split_special_tokens` is only supported for slow tokenizers.") + + if self.adapter_name_or_path is not None: # support merging multiple lora weights + self.adapter_name_or_path = [path.strip() for path in self.adapter_name_or_path.split(",")] + + if self.new_special_tokens is not None: # support multiple special tokens + self.new_special_tokens = [token.strip() for token in self.new_special_tokens.split(",")] + + if self.export_quantization_bit is not None and self.export_quantization_dataset is None: + raise ValueError("Quantization dataset is necessary for exporting.") + + # if isinstance(self.vllm_config, str) and self.vllm_config.startswith("{"): + # self.vllm_config = _convert_str_dict(json.loads(self.vllm_config)) + + @classmethod + def copyfrom(cls, source: "Self", **kwargs) -> "Self": + init_args, lazy_args = {}, {} + for attr in fields(source): + if attr.init: + init_args[attr.name] = getattr(source, attr.name) + else: + lazy_args[attr.name] = getattr(source, attr.name) + + init_args.update(kwargs) + result = cls(**init_args) + for name, value in lazy_args.items(): + setattr(result, name, value) + + return result + + +@dataclass +class GeneratingArguments: + r""" + Arguments pertaining to specify the decoding parameters. + """ + + do_sample: bool = field( + default=True, + metadata={"help": "Whether or not to use sampling, use greedy decoding otherwise."}, + ) + temperature: float = field( + default=0.95, + metadata={"help": "The value used to modulate the next token probabilities."}, + ) + top_p: float = field( + default=0.7, + metadata={ + "help": "The smallest set of most probable tokens with probabilities that add up to top_p or higher are kept." + }, + ) + top_k: int = field( + default=50, + metadata={"help": "The number of highest probability vocabulary tokens to keep for top-k filtering."}, + ) + num_beams: int = field( + default=1, + metadata={"help": "Number of beams for beam search. 
1 means no beam search."}, + ) + max_length: int = field( + default=1024, + metadata={"help": "The maximum length the generated tokens can have. It can be overridden by max_new_tokens."}, + ) + max_new_tokens: int = field( + default=1024, + metadata={"help": "The maximum numbers of tokens to generate, ignoring the number of tokens in the prompt."}, + ) + repetition_penalty: float = field( + default=1.0, + metadata={"help": "The parameter for repetition penalty. 1.0 means no penalty."}, + ) + length_penalty: float = field( + default=1.0, + metadata={"help": "Exponential penalty to the length that is used with beam-based generation."}, + ) + default_system: Optional[str] = field( + default=None, + metadata={"help": "Default system message to use in chat completion."}, + ) + + +# finetune +@dataclass +class FreezeArguments: + r""" + Arguments pertaining to the freeze (partial-parameter) training. + """ + + freeze_trainable_layers: int = field( + default=2, + metadata={ + "help": ( + "The number of trainable layers for freeze (partial-parameter) fine-tuning. " + "Positive numbers mean the last n layers are set as trainable, " + "negative numbers mean the first n layers are set as trainable." + ) + }, + ) + freeze_trainable_modules: str = field( + default="all", + metadata={ + "help": ( + "Name(s) of trainable modules for freeze (partial-parameter) fine-tuning. " + "Use commas to separate multiple modules. " + "Use `all` to specify all the available modules." + ) + }, + ) + freeze_extra_modules: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Name(s) of modules apart from hidden layers to be set as trainable " + "for freeze (partial-parameter) fine-tuning. " + "Use commas to separate multiple modules." + ) + }, + ) + + +@dataclass +class LoraArguments: + r""" + Arguments pertaining to the LoRA training. + """ + + additional_target: Optional[str] = field( + default=None, + metadata={ + "help": ( + "Name(s) of modules apart from LoRA layers to be set as trainable " + "and saved in the final checkpoint. " + "Use commas to separate multiple modules." + ) + }, + ) + lora_alpha: Optional[int] = field( + default=None, + metadata={"help": "The scale factor for LoRA fine-tuning (default: lora_rank * 2)."}, + ) + lora_dropout: float = field( + default=0.0, + metadata={"help": "Dropout rate for the LoRA fine-tuning."}, + ) + lora_rank: int = field( + default=8, + metadata={"help": "The intrinsic dimension for LoRA fine-tuning."}, + ) + lora_target: str = field( + default="all", + metadata={ + "help": ( + "Name(s) of target modules to apply LoRA. " + "Use commas to separate multiple modules. " + "Use `all` to specify all the linear modules." + ) + }, + ) + loraplus_lr_ratio: Optional[float] = field( + default=1.0, + metadata={"help": "LoRA plus learning rate ratio (lr_B / lr_A)."}, + ) + loraplus_lr_embedding: float = field( + default=1e-6, + metadata={"help": "LoRA plus learning rate for lora embedding layers."}, + ) + use_rslora: bool = field( + default=False, + metadata={"help": "Whether or not to use the rank stabilization scaling factor for LoRA layer."}, + ) + use_dora: bool = field( + default=False, + metadata={"help": "Whether or not to use the weight-decomposed lora method (DoRA)."}, + ) + pissa_init: bool = field( + default=False, + metadata={"help": "Whether or not to initialize a PiSSA adapter."}, + ) + pissa_iter: int = field( + default=16, + metadata={"help": "The number of iteration steps performed by FSVD in PiSSA. 
Use -1 to disable it."}, + ) + pissa_convert: bool = field( + default=False, + metadata={"help": "Whether or not to convert the PiSSA adapter to a normal LoRA adapter."}, + ) + create_new_adapter: bool = field( + default=False, + metadata={"help": "Whether or not to create a new adapter with randomly initialized weight."}, + ) + + +@dataclass +class GaloreArguments: + r""" + Arguments pertaining to the GaLore algorithm. + """ + + use_galore: bool = field( + default=False, + metadata={"help": "Whether or not to use the gradient low-Rank projection (GaLore)."}, + ) + galore_target: str = field( + default="all", + metadata={ + "help": ( + "Name(s) of modules to apply GaLore. Use commas to separate multiple modules. " + "Use `all` to specify all the linear modules." + ) + }, + ) + galore_rank: int = field( + default=16, + metadata={"help": "The rank of GaLore gradients."}, + ) + galore_update_interval: int = field( + default=200, + metadata={"help": "Number of steps to update the GaLore projection."}, + ) + galore_scale: float = field( + default=0.25, + metadata={"help": "GaLore scaling coefficient."}, + ) + galore_proj_type: Literal["std", "reverse_std", "right", "left", "full"] = field( + default="std", + metadata={"help": "Type of GaLore projection."}, + ) + galore_layerwise: bool = field( + default=False, + metadata={"help": "Whether or not to enable layer-wise update to further save memory."}, + ) + + +@dataclass +class FinetuningArguments(FreezeArguments, LoraArguments): + r""" + Arguments pertaining to which techniques we are going to fine-tuning with. + """ + + pure_bf16: bool = field( + default=False, + metadata={"help": "Whether or not to train model in purely bf16 precision (without AMP)."}, + ) + stage: Literal["pt", "sft", "rm", "ppo", "dpo", "kto"] = field( + default="sft", + metadata={"help": "Which stage will be performed in training."}, + ) + finetuning_type: Literal["lora", "freeze", "full"] = field( + default="lora", + metadata={"help": "Which fine-tuning method to use."}, + ) + use_llama_pro: bool = field( + default=False, + metadata={"help": "Whether or not to make only the parameters in the expanded blocks trainable."}, + ) + use_adam_mini: bool = field( + default=False, + metadata={"help": "Whether or not to use the Adam-mini optimizer."}, + ) + freeze_vision_tower: bool = field( + default=True, + metadata={"help": "Whether ot not to freeze vision tower in MLLM training."}, + ) + train_mm_proj_only: bool = field( + default=False, + metadata={"help": "Whether or not to train the multimodal projector for MLLM only."}, + ) + compute_accuracy: bool = field( + default=False, + metadata={"help": "Whether or not to compute the token-level accuracy at evaluation."}, + ) + plot_loss: bool = field( + default=False, + metadata={"help": "Whether or not to save the training loss curves."}, + ) + include_effective_tokens_per_second: bool = field( + default=False, + metadata={"help": "Whether or not to compute effective tokens per second."}, + ) + + def __post_init__(self): + def split_arg(arg): + if isinstance(arg, str): + return [item.strip() for item in arg.split(",")] + return arg + + self.freeze_trainable_modules: List[str] = split_arg(self.freeze_trainable_modules) + self.freeze_extra_modules: Optional[List[str]] = split_arg(self.freeze_extra_modules) + self.lora_alpha: int = self.lora_alpha or self.lora_rank * 2 + # self.lora_target: List[str] = split_arg(self.lora_target) + self.freeze_vision_tower = self.freeze_vision_tower or self.train_mm_proj_only + + assert 
self.finetuning_type in ["lora", "full"], "Invalid fine-tuning method." + # assert self.ref_model_quantization_bit in [None, 8, 4], "We only accept 4-bit or 8-bit quantization." + + if self.use_llama_pro and self.finetuning_type == "full": + raise ValueError("`use_llama_pro` is only valid for Freeze or LoRA training.") + + if self.train_mm_proj_only and self.finetuning_type != "full": + raise ValueError("`train_mm_proj_only` is only valid for full training.") + + if self.finetuning_type != "lora": + if self.loraplus_lr_ratio is not None: + raise ValueError("`loraplus_lr_ratio` is only valid for LoRA training.") + + if self.use_rslora: + raise ValueError("`use_rslora` is only valid for LoRA training.") + + if self.pissa_init: + raise ValueError("`pissa_init` is only valid for LoRA training.") + + +_TRAIN_ARGS = [ModelArguments, DataArguments, Seq2SeqTrainingArguments, FinetuningArguments, GeneratingArguments] +_TRAIN_CLS = Tuple[ModelArguments, DataArguments, Seq2SeqTrainingArguments, FinetuningArguments, GeneratingArguments] + + +def _parse_args(parser: "PdArgumentParser", args: Optional[Dict[str, Any]] = None) -> Tuple[Any]: + if args is not None: + return parser.parse_dict(args) + + if len(sys.argv) == 2 and (sys.argv[1].endswith(".yaml") or sys.argv[1].endswith(".yml")): + return parser.parse_yaml_file(os.path.abspath(sys.argv[1])) + + if len(sys.argv) == 2 and sys.argv[1].endswith(".json"): + return parser.parse_json_file(os.path.abspath(sys.argv[1])) + + (*parsed_args, unknown_args) = parser.parse_args_into_dataclasses(return_remaining_strings=True) + + if unknown_args: + print(parser.format_help()) + print(f"Got unknown args, potentially deprecated arguments: {unknown_args}") + raise ValueError(f"Some specified arguments are not used by the PdArgumentParser: {unknown_args}") + + return (*parsed_args,) + + +def _parse_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: + parser = PdArgumentParser(_TRAIN_ARGS) + return _parse_args(parser, args) + + +def get_train_args(args: Optional[Dict[str, Any]] = None) -> _TRAIN_CLS: + model_args, data_args, training_args, finetuning_args, generating_args = _parse_train_args(args) + + # Check arguments + if finetuning_args.stage != "pt" and data_args.template is None: + raise ValueError("Please specify which `template` to use.") + + if finetuning_args.stage != "sft": + if training_args.predict_with_generate: + raise ValueError("`predict_with_generate` cannot be set as True except SFT.") + + if data_args.neat_packing: + raise ValueError("`neat_packing` cannot be set as True except SFT.") + + if data_args.train_on_prompt or data_args.mask_history: + raise ValueError("`train_on_prompt` or `mask_history` cannot be set as True except SFT.") + + if finetuning_args.stage == "sft" and training_args.do_predict and not training_args.predict_with_generate: + raise ValueError("Please enable `predict_with_generate` to save model predictions.") + + if training_args.max_steps == -1 and data_args.streaming: + raise ValueError("Please specify `max_steps` in streaming mode.") + + if training_args.do_train and data_args.dataset is None: + raise ValueError("Please specify dataset for training.") + + if (training_args.do_eval or training_args.do_predict) and ( + data_args.eval_dataset is None and data_args.val_size < 1e-6 + ): + raise ValueError("Please specify dataset for evaluation.") + + if training_args.predict_with_generate: + if data_args.eval_dataset is None: + raise ValueError("Cannot use `predict_with_generate` if `eval_dataset` is None.") + + if 
finetuning_args.compute_accuracy: + raise ValueError("Cannot use `predict_with_generate` and `compute_accuracy` together.") + + if ( + training_args.resume_from_checkpoint is None + and training_args.do_train + and os.path.isdir(training_args.output_dir) + and not training_args.overwrite_output_dir + ): + last_checkpoint = get_last_checkpoint(training_args.output_dir) + if last_checkpoint is None and any( + os.path.isfile(os.path.join(training_args.output_dir, name)) for name in CHECKPOINT_NAMES + ): + raise ValueError("Output directory already exists and is not empty. Please set `overwrite_output_dir`.") + + if last_checkpoint is not None: + training_args.resume_from_checkpoint = last_checkpoint + logger(20, f"Resuming training from {training_args.resume_from_checkpoint}.") + logger(20, "Change `output_dir` or use `overwrite_output_dir` to avoid.") + + # Post-process model arguments + if training_args.bf16 or finetuning_args.pure_bf16: + model_args.compute_dtype = "bfloat16" + elif training_args.fp16: + model_args.compute_dtype = "float16" + else: + model_args.compute_dtype = "float32" + model_args.device_map = {"": get_current_device()} + model_args.model_max_length = data_args.cutoff_len + model_args.block_diag_attn = data_args.neat_packing + data_args.packing = data_args.packing if data_args.packing is not None else finetuning_args.stage == "pt" + + # Log on each process the small summary + logger( + 20, + "Process rank: {}, device: {}, compute dtype: {}".format( + training_args.local_rank, + training_args.device, + str(model_args.compute_dtype), + ), + ) + + # transformers.set_seed(training_args.seed) + + return model_args, data_args, training_args, finetuning_args, generating_args + + +def load_args(config_path: str) -> Optional[Dict[str, Any]]: + r""" + Loads saved arguments. + """ + try: + with open(config_path, encoding="utf-8") as f: + return safe_load(f) + except Exception: + return None + + +def save_args(config_path: str, config_dict: Dict[str, Any]): + r""" + Saves arguments. + """ + with open(config_path, "w", encoding="utf-8") as f: + safe_dump(config_dict, f) diff --git a/paddlemix/MULLM_WebUI/extras/callbacks.py b/paddlemix/MULLM_WebUI/extras/callbacks.py new file mode 100644 index 000000000..18484afbf --- /dev/null +++ b/paddlemix/MULLM_WebUI/extras/callbacks.py @@ -0,0 +1,205 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import json +import os +import sys +import time +from concurrent.futures import ThreadPoolExecutor +from datetime import timedelta +from typing import TYPE_CHECKING, Any, Dict, Optional + +from paddlenlp.trainer.trainer_callback import TrainerCallback +from paddlenlp.trainer.trainer_utils import has_length +from typing_extensions import override + +from ...utils.log import logger +from .constants import TRAINER_LOG +from .training import get_peak_memory + +if TYPE_CHECKING: + from paddlenlp.trainer.trainer_callback import ( + TrainerControl, + TrainerState, + TrainingArguments, + ) + + +class LogCallback(TrainerCallback): + r""" + A callback for logging training and evaluation status. + """ + + def __init__(self) -> None: + # Progress + self.start_time = 0 + self.cur_steps = 0 + self.max_steps = 0 + self.elapsed_time = "" + self.remaining_time = "" + self.thread_pool: Optional["ThreadPoolExecutor"] = None + # Status + self.aborted = False + self.do_train = False + # Web UI + self.webui_mode = True + # if self.webui_mode: + # signal.signal(signal.SIGABRT, self._set_abort) + # self.logger_handler = logging.LoggerHandler(os.environ.get("LLAMABOARD_WORKDIR")) + # logging.add_handler(self.logger_handler) + # transformers.logging.add_handler(self.logger_handler) + + def _set_abort(self, signum, frame) -> None: + self.aborted = True + + def _reset(self, max_steps: int = 0) -> None: + self.start_time = time.time() + self.cur_steps = 0 + self.max_steps = max_steps + self.elapsed_time = "" + self.remaining_time = "" + + def _timing(self, cur_steps: int) -> None: + cur_time = time.time() + elapsed_time = cur_time - self.start_time + avg_time_per_step = elapsed_time / cur_steps if cur_steps != 0 else 0 + remaining_time = (self.max_steps - cur_steps) * avg_time_per_step + self.cur_steps = cur_steps + self.elapsed_time = str(timedelta(seconds=int(elapsed_time))) + self.remaining_time = str(timedelta(seconds=int(remaining_time))) + + def _write_log(self, output_dir: str, logs: Dict[str, Any]) -> None: + with open(os.path.join(output_dir, TRAINER_LOG), "a", encoding="utf-8") as f: + f.write(json.dumps(logs) + "\n") + + def _create_thread_pool(self, output_dir: str) -> None: + os.makedirs(output_dir, exist_ok=True) + self.thread_pool = ThreadPoolExecutor(max_workers=1) + + def _close_thread_pool(self) -> None: + if self.thread_pool is not None: + self.thread_pool.shutdown(wait=True) + self.thread_pool = None + + @override + def on_init_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if ( + args.should_save + and os.path.exists(os.path.join(args.output_dir, TRAINER_LOG)) + and args.overwrite_output_dir + ): + logger(30, "Previous trainer log in this folder will be deleted.") + os.remove(os.path.join(args.output_dir, TRAINER_LOG)) + + @override + def on_train_begin(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if args.should_save: + self.do_train = True + self._reset(max_steps=state.max_steps) + self._create_thread_pool(output_dir=args.output_dir) + + @override + def on_train_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + self._close_thread_pool() + + @override + def on_substep_end(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if self.aborted: + control.should_epoch_stop = True + control.should_training_stop = True + + @override + def on_step_end(self, args: "TrainingArguments", state: "TrainerState", 
control: "TrainerControl", **kwargs): + if self.aborted: + control.should_epoch_stop = True + control.should_training_stop = True + + @override + def on_evaluate(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if not self.do_train: + self._close_thread_pool() + + @override + def on_predict(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if not self.do_train: + self._close_thread_pool() + + @override + def on_log(self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs): + if not args.should_save: + return + + self._timing(cur_steps=state.global_step) + logs = dict( + current_steps=self.cur_steps, + total_steps=self.max_steps, + loss=kwargs["logs"].get("loss"), + eval_loss=kwargs["logs"].get("eval_loss"), + predict_loss=kwargs["logs"].get("predict_loss"), + lr=kwargs["logs"].get("learning_rate"), + epoch=kwargs["logs"].get("epoch"), + percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100, + elapsed_time=self.elapsed_time, + remaining_time=self.remaining_time, + ) + # if state.num_input_tokens_seen: + # logs["throughput"] = round(state.num_input_tokens_seen / (time.time() - self.start_time), 2) + # logs["total_tokens"] = state.num_input_tokens_seen + + if os.environ.get("RECORD_VRAM", "0").lower() in ["true", "1"]: + vram_allocated, vram_reserved = get_peak_memory() + logs["vram_allocated"] = round(vram_allocated / (1024**3), 2) + logs["vram_reserved"] = round(vram_reserved / (1024**3), 2) + + logs = {k: v for k, v in logs.items() if v is not None} + if self.webui_mode and all(key in logs for key in ("loss", "lr", "epoch")): + log_str = f"'loss': {logs['loss']:.4f}, 'learning_rate': {logs['lr']:2.4e}, 'epoch': {logs['epoch']:.2f}" + for extra_key in ("reward", "accuracy", "throughput"): + if logs.get(extra_key): + log_str += f", '{extra_key}': {logs[extra_key]:.2f}" + + logger(30, "{" + log_str + "}") + + if self.thread_pool is not None: + self.thread_pool.submit(self._write_log, args.output_dir, logs) + + @override + def on_prediction_step( + self, args: "TrainingArguments", state: "TrainerState", control: "TrainerControl", **kwargs + ): + if self.do_train: + return + + if self.aborted: + sys.exit(0) + + if not args.should_save: + return + + eval_dataloader = kwargs.pop("eval_dataloader", None) + if has_length(eval_dataloader): + if self.max_steps == 0: + self._reset(max_steps=len(eval_dataloader)) + self._create_thread_pool(output_dir=args.output_dir) + + self._timing(cur_steps=self.cur_steps + 1) + if self.cur_steps % 5 == 0 and self.thread_pool is not None: + logs = dict( + current_steps=self.cur_steps, + total_steps=self.max_steps, + percentage=round(self.cur_steps / self.max_steps * 100, 2) if self.max_steps != 0 else 100, + elapsed_time=self.elapsed_time, + remaining_time=self.remaining_time, + ) + self.thread_pool.submit(self._write_log, args.output_dir, logs) diff --git a/paddlemix/MULLM_WebUI/extras/constants.py b/paddlemix/MULLM_WebUI/extras/constants.py new file mode 100644 index 000000000..05d636edb --- /dev/null +++ b/paddlemix/MULLM_WebUI/extras/constants.py @@ -0,0 +1,70 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os + +SUPPORTED_MODELS = { + "Qwen2-VL-2B-Instruct": "Qwen/Qwen2-VL-2B-Instruct", + "Qwen2-VL-7B-Instruct": "Qwen/Qwen2-VL-7B-Instruct", +} + + +MODEL_MAPPING = { + "Qwen2-VL-2B-Instruct": "Qwen2VLForConditionalGeneration", + "Qwen2-VL-7B-Instruct": "Qwen2VLForConditionalGeneration", +} + +DEFAULT_TEMPLATE = { + "Qwen2-VL-2B-Instruct": "qwen2_vl", + "Qwen2-VL-7B-Instruct": "qwen2_vl", +} + +METHODS = ["full", "lora"] + +# train +TRAINING_STAGES = { + "Supervised Fine-Tuning": "sft", +} + +STAGES_USE_PAIR_DATA = {} +PADDLEMIX_CONFIG = "config.yaml" + +DATA_CONFIG = "dataset_info.json" + +PEFT_METHODS = {"lora"} + +DEFAULT_DATA_DIR = "data" + +TRAINER_MAPPING = {} + +FILEEXT2TYPE = { + "arrow": "arrow", + "csv": "csv", + "json": "json", + "jsonl": "json", + "parquet": "parquet", + "txt": "text", +} + +IGNORE_INDEX = -100 + +IMAGE_PLACEHOLDER = os.environ.get("IMAGE_PLACEHOLDER", "") +VIDEO_PLACEHOLDER = os.environ.get("VIDEO_PLACEHOLDER", "