diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md
index e7fc57df9..96c5cce94 100644
--- a/inference/benchmarks/bertLarge/README.md
+++ b/inference/benchmarks/bertLarge/README.md
@@ -58,6 +58,25 @@ bert_reference_results_text_md5.txt
 
    - XTCL 2.1
 
+#### 2.3 天数智芯 MR-100
+
+- ##### 硬件环境
+    - 机器、加速卡型号: MR-100
+    
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04
+   - OS kernel版本: 5.15.0-89-generic
+   - 加速卡驱动版本：3.2.0
+   - Docker 版本：24.0.4
+   - 依赖软件版本：
+      - torch-1.13.1+corex.3.2.1
+      - onnxsim
+
+- 推理工具包
+
+   - IXRT: ixrt-0.8.0+corex.3.2.1
+
+
 ### 4. 运行情况（BERT-Large）
 
 * 指标列表
@@ -83,3 +102,5 @@ bert_reference_results_text_md5.txt
 | tensorrt | fp16      | 32 | 1283.9   | 257.3       | 260.4      | 408.3         | 418.1          | 45.3% | 0.600/0.638 | 17.4/40.0 |
 | tensorrt | fp32   | 32 | 1868.8   | 150.4       | 152.2      | 190.4         | 194.1       | 42.0% | 0.638/0.638 | 16.9/40.0 |
 | kunlunxin_xtcl| W32A16   | 32 |/ | /          | /       | /          | /          | / | 0.638/0.638| /|
+| iluvatar_ixrt| fp16  | 32 |/ | /          | /       | /          | /          | / | 0.599/0.638| /|
+
diff --git a/inference/benchmarks/bertLarge/pytorch/iluvatar_requirements.txt b/inference/benchmarks/bertLarge/pytorch/iluvatar_requirements.txt
new file mode 100644
index 000000000..ea980c9cd
--- /dev/null
+++ b/inference/benchmarks/bertLarge/pytorch/iluvatar_requirements.txt
@@ -0,0 +1,2 @@
+transformers
+onnxsim
\ No newline at end of file
diff --git a/inference/configs/bertLarge/vendor_config/iluvatar_configurations.yaml b/inference/configs/bertLarge/vendor_config/iluvatar_configurations.yaml
new file mode 100644
index 000000000..3f1c5fec7
--- /dev/null
+++ b/inference/configs/bertLarge/vendor_config/iluvatar_configurations.yaml
@@ -0,0 +1,5 @@
+ixrt_tmp_path: iluvatar_tmp/bertLarge.trt
+compiler: ixrt
+# no_validation: true
+has_dynamic_axis: false
+torchtrt_full_compile: true
\ No newline at end of file
diff --git a/inference/docker_images/iluvatar/pytorch/packages/README.md b/inference/docker_images/iluvatar/pytorch/packages/README.md
index 224dbe2af..9314892bc 100644
--- a/inference/docker_images/iluvatar/pytorch/packages/README.md
+++ b/inference/docker_images/iluvatar/pytorch/packages/README.md
@@ -2,7 +2,7 @@
 
 >联系邮箱: contact-us@iluvatar.com
 
-ixrt-0.7.0+corex.latest.version-cp310-cp310-linux_x86_64.whl
+ixrt-0.8.0+corex.latest.version-cp310-cp310-linux_x86_64.whl
 
 torchvision-0.14.1+corex.3.2.1.20231006.892-cp310-cp310-linux_x86_64.whl
 
diff --git a/inference/inference_engine/iluvatar/ixrt.py b/inference/inference_engine/iluvatar/ixrt.py
index 44fc85c4b..abb5cd17e 100644
--- a/inference/inference_engine/iluvatar/ixrt.py
+++ b/inference/inference_engine/iluvatar/ixrt.py
@@ -9,7 +9,6 @@
 import time
 import subprocess
 
-
 class InferModel:
 
     class HostDeviceMem(object):
@@ -66,27 +65,32 @@ def __init__(self, config, onnx_path, model):
 
     def build_engine(self, config, onnx_path):
         if config.exist_compiler_path is None:
-            trt_path = config.log_dir + "/" + config.ixrt_tmp_path
+            ixrt_path = config.log_dir + "/" + config.ixrt_tmp_path
 
-            dir_trt_path = os.path.dirname(trt_path)
+            dir_trt_path = os.path.dirname(ixrt_path)
             os.makedirs(dir_trt_path, exist_ok=True)
 
             time.sleep(10)
 
-            trtexec_cmd = "ixrtexec --onnx=" + onnx_path + " --save_engine=" + trt_path
+            onnxsim_cmd = f"onnxsim {onnx_path} {onnx_path}"
+
+            onnxsim_cmd = subprocess.Popen(onnxsim_cmd, shell=True)
+            onnxsim_cmd.wait()
+
+            ixrtexec_cmd = "ixrtexec --onnx=" + onnx_path + " --save_engine=" + ixrt_path
             if config.fp16:
-                trtexec_cmd += " --precision fp16"
+                ixrtexec_cmd += " --precision fp16"
             if config.has_dynamic_axis:
-                trtexec_cmd += " --minShapes=" + config.minShapes
-                trtexec_cmd += " --optShapes=" + config.optShapes
-                trtexec_cmd += " --maxShapes=" + config.maxShapes
+                ixrtexec_cmd += " --minShapes=" + config.minShapes
+                ixrtexec_cmd += " --optShapes=" + config.optShapes
+                ixrtexec_cmd += " --maxShapes=" + config.maxShapes
 
-            p = subprocess.Popen(trtexec_cmd, shell=True)
+            p = subprocess.Popen(ixrtexec_cmd, shell=True)
             p.wait()
         else:
-            trt_path = config.exist_compiler_path
+            ixrt_path = config.exist_compiler_path
 
-        with open(trt_path, "rb") as f:
+        with open(ixrt_path, "rb") as f:
             return self.runtime.deserialize_cuda_engine(f.read())
 
     def allocate_buffers(self, engine):
diff --git a/training/benchmarks/aquila2_7b/flagscale/README.md b/training/benchmarks/aquila2_7b/flagscale/README.md
index 27d5eb1b0..b75cdad0c 100644
--- a/training/benchmarks/aquila2_7b/flagscale/README.md
+++ b/training/benchmarks/aquila2_7b/flagscale/README.md
@@ -4,7 +4,9 @@ aquila2是北京人工智能研究院开源的语言模型，包含基础语言
 
 ## 模型配置及tokenizer准备
 
-本测试样例为预训练case，需要下载tokenizer，下载链接为https://github.com/FlagOpen/FlagScale/tree/main/examples/aquila/tokenizer。需要在data_dir下创建tokenizer目录，将上述链接中的三个文件下载到此目录中
+本测试样例为预训练case，需要下载tokenizer，下载链接为https://github.com/FlagOpen/FlagScale/tree/main/examples/aquila/tokenizer
+
+请使用FlagScale仓库ed55532这一commit版本中的tokenizer：需要在data_dir下创建tokenizer目录，并将上述链接中的三个文件下载到此目录中。
 
 ## 数据准备
 
@@ -14,4 +16,4 @@ https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.bin
 
 https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.idx
 
-将上述两个文件放置于data_dir下。
\ No newline at end of file
+将上述两个文件放置于data_dir下。
diff --git a/training/benchmarks/bert_hf/pytorch/train/trainer.py b/training/benchmarks/bert_hf/pytorch/train/trainer.py
index 78f40a686..f63e4cd8e 100755
--- a/training/benchmarks/bert_hf/pytorch/train/trainer.py
+++ b/training/benchmarks/bert_hf/pytorch/train/trainer.py
@@ -82,21 +82,7 @@ def train_one_epoch(self, train_dataloader, eval_dataloader):
             dist_pytorch.barrier(self.config.vendor)
             pure_start_time = time.time()
 
-            if scaler is not None:
-                with torch.cuda.amp.autocast(enabled=True):
-                    output = model(input_ids=input_ids, labels=labels)
-                    loss = output.loss
-
-                scaler.scale(loss).backward()
-                if step % self.config.gradient_accumulation_steps == 0:
-                    scaler.step(optimizer)
-                    scaler.update()
-            else:
-                output = model(input_ids=input_ids, labels=labels)
-                loss = output.loss
-                loss.backward()
-                if step % self.config.gradient_accumulation_steps == 0:
-                    optimizer.step()
+            loss = self.adapter.train_one_step(model, (input_ids, labels), optimizer, step, scaler)
 
             if step % self.config.log_freq == 0:
                 print("Train Step " + str(step) + "/" + str(len(data_loader)) +
diff --git a/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py b/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py
index ba8eaa585..19f8427cf 100755
--- a/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py
+++ b/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py
@@ -41,3 +41,24 @@ def create_grad_scaler():
     """create_grad_scaler for mixed precision training"""
     scaler = torch.cuda.amp.GradScaler() if config.amp else None
     return scaler
+
+
+def train_one_step(model, batch_data, optimizer, cur_step, scaler=None):
+    input_ids, labels = batch_data
+    if scaler:
+        with torch.cuda.amp.autocast(enabled=True):
+            output = model(input_ids=input_ids, labels=labels)
+            loss = output.loss
+
+        scaler.scale(loss).backward()
+        if cur_step % config.gradient_accumulation_steps == 0:
+            scaler.step(optimizer)
+            scaler.update()
+    else:
+        output = model(input_ids=input_ids, labels=labels)
+        loss = output.loss
+        loss.backward()
+        if cur_step % config.gradient_accumulation_steps == 0:
+            optimizer.step()
+
+    return loss
diff --git a/training/benchmarks/driver/dist_pytorch.py b/training/benchmarks/driver/dist_pytorch.py
index 6c824c422..2704dcfd5 100755
--- a/training/benchmarks/driver/dist_pytorch.py
+++ b/training/benchmarks/driver/dist_pytorch.py
@@ -149,6 +149,8 @@ def barrier(vendor="nvidia"):
     if torch.distributed.is_available() and torch.distributed.is_initialized():
         if vendor == "kunlunxin":
             torch.distributed.barrier()
+        elif vendor == "mthreads":
+            torch.distributed.barrier()
         else:
             torch.distributed.all_reduce(torch.cuda.FloatTensor(1))
             torch.cuda.synchronize()
@@ -172,6 +174,23 @@ def init_dist_training_env(config):
                                                  rank=rank,
                                                  world_size=world_size)
             config.n_device = torch.distributed.get_world_size()
+    elif config.vendor == "mthreads":
+        import torch_musa
+        if int(os.environ.get("WORLD_SIZE", 1)) <= 1:
+            config.device = torch.device("musa")
+            config.n_device = 1
+        else:
+            torch.musa.set_device(config.local_rank)
+            host_addr_full = 'tcp://' + os.environ[
+                "MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"]
+            rank = int(os.environ["RANK"])
+            world_size = int(os.environ["WORLD_SIZE"])
+            torch.distributed.init_process_group(backend=config.dist_backend,
+                                                 init_method=host_addr_full,
+                                                 rank=rank,
+                                                 world_size=world_size)
+            config.device = torch.device("musa", config.local_rank)
+            config.n_device = torch.distributed.get_world_size()
     else:  # nvidia
         if int(os.environ.get("WORLD_SIZE", 1)) <= 1:
             config.device = torch.device("cuda")
diff --git a/training/benchmarks/driver/helper.py b/training/benchmarks/driver/helper.py
index c8f406615..de513901e 100644
--- a/training/benchmarks/driver/helper.py
+++ b/training/benchmarks/driver/helper.py
@@ -74,6 +74,12 @@ def set_seed(self, seed: int, vendor: str = None):
         elif lower_vendor == "ascend":
             import mindspore
             mindspore.set_seed(seed)
+        elif lower_vendor == "mthreads":
+            import torch
+            import torch_musa
+            torch.manual_seed(seed)
+            torch.musa.manual_seed(seed)
+            torch.musa.manual_seed_all(seed)
         else:
             # TODO 其他厂商设置seed，在此扩展
             pass
diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py
index 10ae55779..fc730ac44 100644
--- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py
+++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py
@@ -10,6 +10,11 @@
 from importlib import import_module
 
 import torch
+try:
+    import torch_musa
+    DEVICE = 'musa'
+except:
+    DEVICE = 'cuda'
 from torch.utils.data import DataLoader
 from torch.utils.data.distributed import DistributedSampler
 
@@ -54,29 +59,32 @@ def get_argument_parser():
 
 def train(model_engine, dataloader):
     model_engine.train()
+    device = torch.device(f"{DEVICE}:{args.local_rank}")
     ave_loss = 0.0
     for step, data in enumerate(dataloader):
 
         fake_data = torch.tensor(data).long()
-        input_ids = fake_data.to(args.local_rank)
-        labels = fake_data.to(args.local_rank)
+        input_ids = fake_data.to(device)
+        labels = fake_data.to(device)
         loss = model_engine(input_ids=input_ids, labels=labels).loss
         model_engine.backward(loss)
         model_engine.step()
 
         ave_loss += loss
-        if step % 10 == 0 and args.local_rank == 0:
+        if step > 0 and step % 10 == 0 and args.local_rank == 0:
             print('Step {}/{}, Loss: {}'.format(step, len(dataloader),
                                                 ave_loss / 10))
             ave_loss = 0.0
 
 
-def get_deepspeed_engine(args, model_config_dir, flashattn):
+def get_deepspeed_engine(args, model_config_dir):
     with deepspeed.zero.Init(config_dict_or_path=args.deepspeed_config,
                              enabled=True,
                              mem_efficient_linear=False,
                              mpu=None):
-        model = get_llama_model(model_config_dir, flashattn)
+        model = get_llama_model(model_config_dir, args.flashattn)
+    if args.gradient_checkpointing_enable:
+        model.gradient_checkpointing_enable()
 
     model_engine, _, _, _ = deepspeed.initialize(
         args=args, model=model, model_parameters=model.parameters())
@@ -107,10 +115,12 @@ def get_metric(texts):
     theoryflops = getattr(module, 'theoryflops')
     epochs = getattr(module, 'epochs')
     flashattn = getattr(module, 'flashattn')
+    gradient_checkpointing_enable = getattr(module, 'gradient_checkpointing_enable', False)
+    args.flashattn = flashattn
+    args.gradient_checkpointing_enable = gradient_checkpointing_enable
 
     deepspeed.init_distributed()
-    model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"),
-                                        flashattn)
+    model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"))
     dataset = get_llama_dataset(args, seqlength, datafilename)
 
     logger = logging.getLogger("DeepSpeed")
@@ -138,4 +148,8 @@ def get_metric(texts):
             chip_tps = whole_tps / args.nproc * args.nnodes
             print("System tokens per second: ", whole_tps)
             print("Tokens/p/s: ", chip_tps)
+
+            TFLOPS = int(theoryflops/1000000000000)
+            print("Theory TFLOPS: ", TFLOPS)
+            print("Tokens/TFLOPS: ", chip_tps / TFLOPS)
             print("MFU: ", chip_tps * 7000000000.0 * 6 / theoryflops)
diff --git a/training/benchmarks/resnet50/pytorch/train/trainer.py b/training/benchmarks/resnet50/pytorch/train/trainer.py
index 52e7d6ae7..b07d90c68 100755
--- a/training/benchmarks/resnet50/pytorch/train/trainer.py
+++ b/training/benchmarks/resnet50/pytorch/train/trainer.py
@@ -82,22 +82,7 @@ def train_one_epoch(self, train_dataloader, eval_dataloader):
             pure_start_time = time.time()
             optimizer.zero_grad()
 
-            images, target = batch
-            if scaler is not None:
-                with torch.cuda.amp.autocast(enabled=True):
-                    output = model(images)
-                    loss = criterion(output, target)
-
-                scaler.scale(loss).backward()
-                scaler.step(optimizer)
-                scaler.update()
-            else:
-                output = model(images)
-
-                criterion = torch.nn.CrossEntropyLoss()
-                loss = criterion(output, target)
-                loss.backward()
-                optimizer.step()
+            loss = self.adapter.train_step(model, batch, optimizer, scaler)
 
             if step % self.config.log_freq == 0:
                 print("Train Step " + str(step) + "/" + str(len(data_loader)) +
diff --git a/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py b/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py
index ba8eaa585..d4b7b4708 100755
--- a/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py
+++ b/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py
@@ -41,3 +41,23 @@ def create_grad_scaler():
     """create_grad_scaler for mixed precision training"""
     scaler = torch.cuda.amp.GradScaler() if config.amp else None
     return scaler
+
+
+def train_step(model, batch, optimizer, scaler=None):
+    """train one step"""
+    images, target = batch
+    criterion = torch.nn.CrossEntropyLoss()
+    if scaler:
+        with torch.cuda.amp.autocast(enabled=True):
+            output = model(images)
+            loss = criterion(output, target)
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
+    else:
+        output = model(images)
+        loss = criterion(output, target)
+        loss.backward()
+        optimizer.step()
+
+    return loss
diff --git a/training/iluvatar/iluvatar_monitor.py b/training/iluvatar/iluvatar_monitor.py
index 7ba03907d..cc8de6c2d 100644
--- a/training/iluvatar/iluvatar_monitor.py
+++ b/training/iluvatar/iluvatar_monitor.py
@@ -231,7 +231,7 @@ def get_system_info():
     cmd = cmd + r"echo ;"
     
     cmd = cmd + r"echo Accelerator Model:;"
-    cmd = cmd + r"ixsmi -L;"
+    cmd = cmd + r"export PATH=/usr/local/corex/bin:$PATH; export LD_LIBRARY_PATH=/usr/local/corex/lib; ixsmi -L;"
     cmd = cmd + r"echo ;"
     
     cmd = cmd + r"echo Accelerator Driver version:;"
diff --git a/training/iluvatar/mobilenetv2-pytorch/README.md b/training/iluvatar/mobilenetv2-pytorch/README.md
index 0c0a34303..62d81319a 100644
--- a/training/iluvatar/mobilenetv2-pytorch/README.md
+++ b/training/iluvatar/mobilenetv2-pytorch/README.md
@@ -40,7 +40,8 @@
 
 | 配置                  | precision | fix_hp         | e2e_time | p_whole | p_train | p_core | acc    | mem         |
 | --------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ------ | ----------- |
-| BI-V100单机8卡（1x8） | fp32      | bs=256,lr=0.72 | 103759   | 3520    | 3604    | 3651   | 68.61% | 21.6 / 32.0 |
+| BI-V100单机8卡（1x8）  | fp32      | /              | 174534    | 1857    | 1876    | 1885   | 68.52% | 3.6/32.0  |
+| BI-V100单机8卡（1x8） | fp32      | bs=256,lr=0.72 | 87559   | 4390    | 4543    | 4625   | 61.92% | 21.6 / 32.0 |
 | BI-V100单机8卡（1x1） | fp32      | bs=256,lr=0.72 | /        | 624     | 632     | 633    | /      | 21.4 / 32.0 |
 | BI-V100单机8卡（2x8） | fp32      | bs=256,lr=0.72 | /        | 6835    | 7058    | 7219   | /      | 22.2 / 32.0 |
 
diff --git a/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py b/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py
index 714586e2e..2c42ee22e 100644
--- a/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py
+++ b/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py
@@ -1,5 +1,5 @@
 from config_common import *
 
-train_batch_size = 256
-eval_batch_size = 256
+train_batch_size = 32
+eval_batch_size = 32
 
diff --git a/training/iluvatar/swin_transformer-pytorch/README.md b/training/iluvatar/swin_transformer-pytorch/README.md
index 12c71636c..32318463b 100644
--- a/training/iluvatar/swin_transformer-pytorch/README.md
+++ b/training/iluvatar/swin_transformer-pytorch/README.md
@@ -8,15 +8,37 @@
 
 - ##### 软件环境
    - OS版本：Ubuntu 20.04
-   - OS kernel版本:  4.15.0-156-generic x86_64    
-   - 加速卡驱动版本：3.0.0
-   - Docker 版本：20.10.8
-   - 训练框架版本：torch-1.10.2+corex.3.0.0
+   - OS kernel版本:  5.4.0-148-generic   
+   - 加速卡驱动版本：3.1.0
+   - Docker 版本：24.0.5
+   - 训练框架版本：torch-1.13.1+corex.3.1.0
    - 依赖软件版本：无
 
 
 ### 运行情况
-| 训练资源 | 配置文件        | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能（samples/s) |
-| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- |
-| 单机8卡  | config_A100x1x8  |              |          |          |         |                  |
+* 通用指标
+
+| 指标名称       | 指标值                                        | 特殊说明                                    |
+| -------------- | --------------------------------------------- | ------------------------------------------- |
+| 任务类别       | Image Classification && Semantic Segmentation |                                             |
+| 模型           | swin_transformer                              |                                             |
+| 数据集         | Imagenet2012 1K                               |                                             |
+| 数据精度       | precision,见“性能指标”                        | 可选fp32/amp/fp16/tf32                      |
+| 超参修改       | fix_hp,见“性能指标”                           | 跑满硬件设备评测吞吐量所需特殊超参          |
+| 硬件设备简称   | Iluvatar BI-V100                             |                                             |
+| 硬件存储使用   | mem,见“性能指标”                              | 通常称为“显存”,单位为GiB                    |
+| 端到端时间     | e2e_time,见“性能指标”                         | 总时间+Perf初始化等时间                     |
+| 总吞吐量       | p_whole,见“性能指标”                          | 实际训练样本数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”                          | 不包含每个epoch末尾的评估部分耗时           |
+| **计算吞吐量** | **p_core,见“性能指标”**                       | 不包含数据IO部分的耗时(p3>p2>p1)            |
+| 训练结果       | final_acc1,见“性能指标”                         | 验证准确率                                    |
+| 额外修改项     | 无                                            |                                             |
+
+* 性能指标
+
+| 配置                 | precision| fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem       |
+|----------------------| ---------| ------ | ---------| ------- | ------- | ------  | -------- | --------- |
+| BI-V100单机8卡(1x8)  | amp     | bs=352 | /        | /       | /        | /      |    81.23     | 26.8/32.0  |
+| BI-V100单机单卡(1x1)  | amp     | bs=352 | /        | /       | /        | /      | /       | 26.5/32.0  |
+| BI-V100两机8卡(2x8)  | amp     | bs=352 | /        | /       | /        | /      | /       | 26.8/32.0  |
 
diff --git a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x1.py b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x1.py
new file mode 100644
index 000000000..f594228b8
--- /dev/null
+++ b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x1.py
@@ -0,0 +1,4 @@
+from config_common import *
+
+cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
+train_batch_size = 352
diff --git a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py
index 52ef64da3..f594228b8 100644
--- a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py
+++ b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py
@@ -1,4 +1,4 @@
 from config_common import *
 
 cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
-train_batch_size = 256
+train_batch_size = 352
diff --git a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x2x8.py b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x2x8.py
new file mode 100644
index 000000000..f594228b8
--- /dev/null
+++ b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x2x8.py
@@ -0,0 +1,4 @@
+from config_common import *
+
+cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
+train_batch_size = 352
diff --git a/training/kunlunxin/tacotron2-pytorch/README.md b/training/kunlunxin/tacotron2-pytorch/README.md
index db2d8b5e4..55d54fce5 100644
--- a/training/kunlunxin/tacotron2-pytorch/README.md
+++ b/training/kunlunxin/tacotron2-pytorch/README.md
@@ -18,12 +18,31 @@
 
 
 ### 运行情况
-| 训练资源 | 配置文件        | 运行时长(s) | 目标val_loss | 收敛val_loss | epoch数 | 性能(samples/s) |
-| -------- | --------------- | ----------- | ------------ | ------------ | ------- | --------------- |
-| 单机1卡  | config_R300x1x1 | /           |              | /            |         |                 |
-| 单机8卡  | config_R300x1x8 |             | 0.4852(fp32) | 0.4271(fp32) | 1235    |                 |
-| 两机8卡  | config_R300x2x8 | /           |              | /            |         |                 |
+* 通用指标
 
+| 指标名称       | 指标值                  | 特殊说明                                    |
+| -------------- | ----------------------- | ------------------------------------------- |
+| 任务类别       | SpeechSynthesis         |                                             |
+| 模型           | tacotron2               |                                             |
+| 数据集         | LJSpeech                |                                             |
+| 数据精度       | precision,见“性能指标”  | 可选fp32/amp/fp16/tf32                      |
+| 超参修改       | fix_hp,见“性能指标”     | 跑满硬件设备评测吞吐量所需特殊超参          |
+| 硬件设备简称   | Kunlunxin R300        |                                             |
+| 硬件存储使用   | mem,见“性能指标”        | 通常称为“显存”,单位为GiB                    |
+| 端到端时间     | e2e_time,见“性能指标”   | 总时间+Perf初始化等时间                     |
+| 总吞吐量       | p_whole,见“性能指标”    | 实际训练样本数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”    | 不包含每个epoch末尾的评估部分耗时           |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1)            |
+| 训练结果       | val_loss,见“性能指标”   | 验证loss                                    |
+| 额外修改项     | 无                      |                                             |
+
+* 性能指标
+
+| 配置                 | precision | fix_hp          | e2e_time | p_whole | p_train | p_core | val_loss | mem       |
+| -------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | -------- | --------- |
+| R300单机8卡(1x8)  | fp32      | bs=96, lr=0.001 | /        | /       | /       | /      | 0.4801   | 26.3/32.0   |
+| R300双机8卡(2x8)  | fp32      | bs=96, lr=0.001 | /        | /       | /       | /      | /        | 25.0/32.0 |
+| R300单机单卡(1x1) | fp32      | bs=128, lr=0.001 | /        | /       | /       | /      | /        | 30.5/32.0 |
 ### 许可证
 
 Apache 2.0 license。
diff --git a/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x1.py b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x1.py
new file mode 100644
index 000000000..342338fce
--- /dev/null
+++ b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x1.py
@@ -0,0 +1,9 @@
+from config_common import *
+
+train_batch_size = 128
+eval_batch_size = train_batch_size
+
+warmup = 0.2
+learning_rate = 1e-3
+
+seed = 23333
diff --git a/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py
index 85f55987e..c83fd15fe 100644
--- a/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py
+++ b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py
@@ -1,6 +1,6 @@
 from config_common import *
 
-train_batch_size = 48
+train_batch_size = 96
 eval_batch_size = train_batch_size
 
 warmup = 0.2
diff --git a/training/kunlunxin/tacotron2-pytorch/config/config_R300x2x8.py b/training/kunlunxin/tacotron2-pytorch/config/config_R300x2x8.py
new file mode 100644
index 000000000..c83fd15fe
--- /dev/null
+++ b/training/kunlunxin/tacotron2-pytorch/config/config_R300x2x8.py
@@ -0,0 +1,9 @@
+from config_common import *
+
+train_batch_size = 96
+eval_batch_size = train_batch_size
+
+warmup = 0.2
+learning_rate = 1e-3
+
+seed = 23333
\ No newline at end of file
diff --git a/training/kunlunxin/tacotron2-pytorch/config/environment_variables.sh b/training/kunlunxin/tacotron2-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..0383763a0
--- /dev/null
+++ b/training/kunlunxin/tacotron2-pytorch/config/environment_variables.sh
@@ -0,0 +1,5 @@
+export XACC=1
+export BKCL_PCIE_RING=1
+export XACC_LOAD_FLAGS=1
+export BKCL_TIMEOUT=1800
+export XMLIR_D_XPU_L3_SIZE=10485760
diff --git a/training/mthreads/README.md b/training/mthreads/README.md
new file mode 100644
index 000000000..194b9e73f
--- /dev/null
+++ b/training/mthreads/README.md
@@ -0,0 +1,70 @@
+
+# 厂商信息
+
+官网: https://www.mthreads.com/
+
+摩尔线程智能科技（北京）有限责任公司（简称：摩尔线程）是一家以GPU芯片设计为主的集成电路设计企业，专注于研发设计全功能GPU芯片及相关产品，为科技生态合作伙伴提供强大的计算加速能力。公司致力于创新研发面向“元计算”应用的新一代GPU，构建融合视觉计算、3D图形计算、科学计算及人工智能计算的综合计算平台，建立基于云原生GPU计算的生态系统，助力驱动数字经济发展。
+
+摩尔线程MTT S系列全功能GPU支持多样算力，借助覆盖深度学习、图形渲染、视频处理和科学计算的完整MUSA软件栈，可为AI训练、AI推理、大模型、AIGC、云游戏、云渲染、视频云、数字孪生等场景提供通用智能算力支持，旨在为数据中心、智算中心和元计算中心的建设构建坚实算力基础，助力元宇宙中多元应用创新和落地。
+
+MUSA软件栈通过musify CUDA代码迁移工具、计算/通信加速库、mcc编译器、musa运行时和驱动实现对CUDA生态的兼容，帮助用户快速完成代码及应用的迁移。通过torch_musa插件，可以实现MTT S系列GPU对原生PyTorch的对接，用户可以无感地把AI模型运行在摩尔线程全功能GPU上。
+
+# FlagPerf适配验证环境说明
+## 环境配置参考
+  - 硬件
+    - 机器型号： MCCX D800
+    - 加速卡型号: MTT S4000 48GB
+    - CPU型号：Intel(R) Xeon(R) Gold 6430 CPU @ 2.00GHz
+    - 多机网络类型、带宽: InfiniBand，2*200Gbps
+  - 软件
+    - OS版本：Ubuntu 20.04 LTS
+    - OS kernel版本: 5.4.0-42-generic
+    - 加速卡驱动版本：2.2.0
+    - Docker 版本: 20.10.24
+
+## 容器镜像信息
+- 容器构建信息
+  - Dockerfile路径：training/mthreads/docker_image/pytorch_2.0/Dockerfile
+  - 构建后软件安装脚本: training/mthreads/docker_image/pytorch_2.0/pytorch_2.0_install.sh
+
+- 核心软件信息
+
+  - AI框架&版本
+    - PyTorch: v2.0.0
+
+  - 其它软件版本
+    - torch_musa: 2.0.0+git8614ba1
+    - musa toolkits: 1.5.0+git3d8791d
+    - mcc: 1.5.2+git3730bdd
+    - mublas: 1.2.0+gitd9867b5
+
+
+## 加速卡监控采集
+- 加速卡使用信息采集命令
+
+  ```bash
+  mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | \
+  awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ \
+  { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'
+  ```
+- 监控项示例：
+    ```bash
+    45C 109.51W 1MiB 32768MiB 0%
+    44C 108.95W 1MiB 32768MiB 0%
+    46C 110.87W 1MiB 32768MiB 0%
+    43C 104.33W 1MiB 32768MiB 0%
+    44C 107.55W 8MiB 32768MiB 0%
+    46C 110.51W 8MiB 32768MiB 0%
+    44C 106.59W 8MiB 32768MiB 0%
+    44C 104.58W 8MiB 32768MiB 0%
+    ```
+- 加速卡使用信息采集项说明
+
+|监控项| 日志文件 | 格式 |
+|---|---|---|
+|温度| mthreads_monitor.log | xxx C |
+|功耗 |mthreads_monitor.log | xxx W |
+|显存占用大小 |mthreads_monitor.log |xxx MiB |
+|总显存大小 |mthreads_monitor.log |xxx MiB |
+|显存使用率 |mthreads_monitor.log |xxx % |
+
diff --git a/training/mthreads/bert_hf-pytorch/README.md b/training/mthreads/bert_hf-pytorch/README.md
new file mode 100644
index 000000000..17cfdb961
--- /dev/null
+++ b/training/mthreads/bert_hf-pytorch/README.md
@@ -0,0 +1,48 @@
+### 摩尔线程 MTT S系列 GPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+    - 机器型号: MCCX D800
+    - 加速卡型号: MTT S4000 48GB
+    - CPU型号: Intel(R) Xeon(R) Gold 6430 CPU @ 2.00GHz
+    - 多机网络类型、带宽: InfiniBand，2*200Gbps
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04 LTS
+   - OS kernel版本: 5.4.0-42-generic
+   - 加速卡驱动版本：
+   - Docker 版本：20.10.24
+   - 训练框架版本：pytorch-2.0.0+torch_musa-git8614ba1
+   - 依赖软件版本：
+     - musa toolkits: 1.5.0+git3d8791d
+     - mcc: 1.5.2+git3730bdd
+     - mublas: 1.2.0+gitd9867b5
+
+### 运行情况
+
+* 通用指标
+
+| 指标名称       | 指标值                  | 特殊说明                              |
+| -------------- | ----------------------- | ------------------------------------- |
+| 任务类别       | 自然语言编码          |                                       |
+| 模型           | bert-large-uncased |                                       |
+| 数据集         | Wikipedia   |                                       |
+| 数据精度       | precision,见“性能指标”  | 可选fp32/amp/fp16                     |
+| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 |
+| 硬件设备简称   | MTT S4000             |                                       |
+| 硬件存储使用   | mem,见“性能指标”        | 通常称为“显存”,单位为GiB              |
+| 端到端时间     | e2e_time,见“性能指标”   | 总时间+Perf初始化等时间               |
+| 总吞吐量       | p_whole,见“性能指标”    | 实际训练序列数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”    | 不包含每个epoch末尾的评估部分耗时     |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时      |
+| 训练结果       | acc,见“性能指标”        | masked_lm任务准确率(实际/目标) |
+| 额外修改项     | fp16实现方式 | mthreads使用1+8+7格式(bf16)来实现16位浮点数 |
+
+* 性能指标
+
+| 配置               | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc  | mem |
+| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- |  ---- | ---- |
+| S4000单机单卡（1x1） | fp32 | \ |  |  |  |  | \ | \ |
+| S4000单机单卡（1x1） | amp | \ |  |  |  |  | \ | \ |
+| S4000单机8卡（1x8） | amp | bs=20, lr=2.5e-05 |  |  |     |  | 0.658/0.655| 34.2/48.0 |
+| S4000两机8卡（2x8） | amp | \ |  |  |     |  | \ | \ |
+
+
diff --git a/training/mthreads/bert_hf-pytorch/config/config_S4000x1x1.py b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x1.py
new file mode 100644
index 000000000..f0816d4ad
--- /dev/null
+++ b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x1.py
@@ -0,0 +1,11 @@
+vendor = "mthreads"
+
+train_batch_size = 20
+eval_batch_size = train_batch_size
+lr = 0.000005  # fp32/amp
+#lr = 0.00005   # bf16
+
+dist_backend = "mccl"
+
+amp = True
+fp16 = False
diff --git a/training/mthreads/bert_hf-pytorch/config/config_S4000x1x8.py b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x8.py
new file mode 100644
index 000000000..3cb18282b
--- /dev/null
+++ b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x8.py
@@ -0,0 +1,11 @@
+vendor = "mthreads"
+
+train_batch_size = 20
+eval_batch_size = train_batch_size
+lr = 5e-06 / 0.2 # fp32/amp
+#lr = 5e-05 / 0.6  # bf16
+
+dist_backend = "mccl"
+
+amp = True
+fp16 = False
diff --git a/training/mthreads/bert_hf-pytorch/config/environment_variables.sh b/training/mthreads/bert_hf-pytorch/config/environment_variables.sh
new file mode 100644
index 000000000..5cfe0142c
--- /dev/null
+++ b/training/mthreads/bert_hf-pytorch/config/environment_variables.sh
@@ -0,0 +1,8 @@
+# =================================================
+# Export variables
+# =================================================
+
+
+export OMP_NUM_THREADS=1
+export MUSA_KERNEL_TIMEOUT=3600000
+
diff --git a/training/mthreads/bert_hf-pytorch/config/requirements.txt b/training/mthreads/bert_hf-pytorch/config/requirements.txt
new file mode 100644
index 000000000..a81d3b585
--- /dev/null
+++ b/training/mthreads/bert_hf-pytorch/config/requirements.txt
@@ -0,0 +1 @@
+transformers==4.35.0
diff --git a/training/mthreads/bert_hf-pytorch/extern/trainer_adapter.py b/training/mthreads/bert_hf-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..72b0e68b4
--- /dev/null
+++ b/training/mthreads/bert_hf-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,32 @@
+import torch
+import torch_musa
+
+import config
+
+
+def create_grad_scaler():
+    """create_grad_scaler for mixed precision training"""
+    scaler = torch_musa.amp.GradScaler() if config.amp else None
+    return scaler
+
+
+def train_one_step(model, batch_data, optimizer, cur_step, scaler=None):
+    input_ids, labels = batch_data
+    if scaler:
+        with torch_musa.amp.autocast(enabled=True):
+            output = model(input_ids=input_ids, labels=labels)
+            loss = output.loss
+
+        scaler.scale(loss).backward()
+        if cur_step % config.gradient_accumulation_steps == 0:
+            scaler.step(optimizer)
+            scaler.update()
+    else:
+        output = model(input_ids=input_ids, labels=labels)
+        loss = output.loss
+        loss.backward()
+        if cur_step % config.gradient_accumulation_steps == 0:
+            optimizer.step()
+
+    return loss
+    
\ No newline at end of file
diff --git a/training/mthreads/docker_image/deepspeed/Dockerfile b/training/mthreads/docker_image/deepspeed/Dockerfile
new file mode 100644
index 000000000..e427ddd80
--- /dev/null
+++ b/training/mthreads/docker_image/deepspeed/Dockerfile
@@ -0,0 +1,4 @@
+FROM sh-harbor.mthreads.com/mt-ai/lm-qy2:FlagPerf-v1
+ENV PATH=/opt/conda/envs/py38/bin/:/opt/conda/condabin/conda:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/musa/lib:$LD_LIBRARY_PATH
+RUN ln -sf /usr/bin/bash /usr/bin/sh
diff --git a/training/mthreads/docker_image/deepspeed/deepspeed_install.sh b/training/mthreads/docker_image/deepspeed/deepspeed_install.sh
new file mode 100644
index 000000000..a9bf588e2
--- /dev/null
+++ b/training/mthreads/docker_image/deepspeed/deepspeed_install.sh
@@ -0,0 +1 @@
+#!/bin/bash
diff --git a/training/mthreads/docker_image/pytorch_2.0/Dockerfile b/training/mthreads/docker_image/pytorch_2.0/Dockerfile
new file mode 100644
index 000000000..2982c1af5
--- /dev/null
+++ b/training/mthreads/docker_image/pytorch_2.0/Dockerfile
@@ -0,0 +1,3 @@
+FROM moore-threads/pytorch:flagperf-py38
+ENV PATH /opt/conda/envs/py38/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/musa/lib/:$LD_LIBRARY_PATH
diff --git a/training/mthreads/docker_image/pytorch_2.0/pytorch_install.sh b/training/mthreads/docker_image/pytorch_2.0/pytorch_install.sh
new file mode 100644
index 000000000..cc1f786e8
--- /dev/null
+++ b/training/mthreads/docker_image/pytorch_2.0/pytorch_install.sh
@@ -0,0 +1 @@
+#!/bin/bash
\ No newline at end of file
diff --git a/training/mthreads/llama2_7b-deepspeed/README.md b/training/mthreads/llama2_7b-deepspeed/README.md
new file mode 100644
index 000000000..ed2fc06af
--- /dev/null
+++ b/training/mthreads/llama2_7b-deepspeed/README.md
@@ -0,0 +1,50 @@
+### Moore Threads S4000 GPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+    - 机器型号: MCCX D800 
+    - 加速卡型号: S4000
+    - CPU型号: Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz
+    - 多机网络类型、带宽: InfiniBand，2x200Gb/s
+    
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04 LTS
+   - OS kernel版本: 5.4.0-42-generic
+   - 加速卡驱动版本：2.2.0
+   - Docker镜像和版本: PyTorch2.0_musa1.4_ec6a747fd342 
+   - 训练框架版本：pytorch-2.0.0+torch_musa-git8ea3501
+   - 依赖软件版本:
+     - musa toolkits: 1.4.0+git4e25703
+     - mublas: 1.1.0+gite484aa2
+
+- ##### 优化策略
+
+   - scaled dot product attention
+   - checkpointing
+
+### 运行情况
+
+* 输入批尺寸
+  1. local_batchsize(micro_batchsize)，简写为LBS，即实际进入模型的张量批尺寸，为config_S4000x1x8.py中所写，在本case中默认为6
+  2. seqlength(max_position_embedding)，简写为MPE，即实际进入模型的序列长度，为config_S4000x1x8.py中所写，在本case中默认为4096
+  3. gradient_accumulate_steps，简写为GAS，即梯度累加步数，为ds_config.json中所写，在本case中默认为1
+  4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size，简写为GBS。在本case中，只存在数据并行，因此data_parallel_size=world_size。
+
+* 通用指标
+
+| 指标名称     | 指标值                     | 特殊说明                           |
+| ------------ | -------------------------- | ---------------------------------- |
+| 任务类别     | 自然语言理解               |                                    |
+| 模型         | llama2_7b                  |                                    |
+| 数据集       | openwebtext                | 如无特殊说明，训练前1亿个token |
+| 数据精度     | amp                        |                                    |
+| 超参修改     | fix_hp,见“性能指标”        | 运行必要特殊超参，例如需要改小seqlength避免OOM |
+| 硬件设备简称 | S4000                      |                                    |
+| 硬件存储使用 | mem,见“性能指标”           | 通常称为“显存”,单位为GiB           |
+| 计算使用率 | MFU,见“性能指标”           | 参见PaLM论文定义 |
+| **吞吐量**   | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数          |
+
+* 性能指标
+
+| 配置                |  fix_hp           | token/p/s | loss | mem       | MFU       |
+| ------------------- | ---------------- | ------ | ------- | --------- | --------- |
+| S4000单机8卡（1x8）  |       / |  |3.20|44.2/48.0|  |
diff --git a/training/mthreads/llama2_7b-deepspeed/config/config_S4000x1x8.py b/training/mthreads/llama2_7b-deepspeed/config/config_S4000x1x8.py
new file mode 100644
index 000000000..2011c5b18
--- /dev/null
+++ b/training/mthreads/llama2_7b-deepspeed/config/config_S4000x1x8.py
@@ -0,0 +1,7 @@
+seqlength = 4096
+batchsize = 6
+datafilename = "openwebtext_llama2_100M.npy"
+epochs = 1
+theoryflops = 98000000000000.0
+flashattn = True # using sdp attention
+gradient_checkpointing_enable = True
diff --git a/training/mthreads/llama2_7b-deepspeed/config/ds_config.json b/training/mthreads/llama2_7b-deepspeed/config/ds_config.json
new file mode 100644
index 000000000..01e8c085f
--- /dev/null
+++ b/training/mthreads/llama2_7b-deepspeed/config/ds_config.json
@@ -0,0 +1,40 @@
+{
+  "gradient_accumulation_steps": 1,
+  "train_micro_batch_size_per_gpu": 1,
+  "prescale_gradients": false,
+  "zero_allow_untested_optimizer": true,
+  "optimizer": {
+    "type": "AdamW",
+    "params": {
+      "lr": 1e-5,
+      "weight_decay": 0.1,
+      "betas": [
+        0.9,
+        0.95
+      ],
+      "eps": 1e-5
+    }
+  },
+  "zero_optimization": {
+    "stage": 3,
+    "stage3_max_live_parameters": 1e9,
+    "stage3_max_reuse_distance": 1e9,
+    "stage3_prefetch_bucket_size": 1e7,
+    "sub_group_size": 1e9,
+    "contiguous_gradients": true,
+    "allgather_bucket_size": 1e8,
+    "reduce_bucket_size": 1e7,
+    "overlap_comm": false,
+    "reduce_scatter": true
+  },
+  "steps_per_print": 50,
+  "gradient_clipping": 1.0,
+  "wall_clock_breakdown": false,
+  "fp16": {
+    "enabled": true,
+    "loss_scale_window": 100
+  },
+  "bf16": {
+    "enabled": false
+  }
+}
diff --git a/training/mthreads/llama2_7b-deepspeed/config/environment_variables.sh b/training/mthreads/llama2_7b-deepspeed/config/environment_variables.sh
new file mode 100644
index 000000000..4ddb3cc29
--- /dev/null
+++ b/training/mthreads/llama2_7b-deepspeed/config/environment_variables.sh
@@ -0,0 +1,4 @@
+export MUSA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7'
+export DS_ACCELERATOR=musa
+export MUSA_KERNEL_TIMEOUT=1800000
+export NCCL_PROTOS=2
diff --git a/training/mthreads/llama2_7b-deepspeed/config/requirements.txt b/training/mthreads/llama2_7b-deepspeed/config/requirements.txt
new file mode 100644
index 000000000..e69de29bb
diff --git a/training/mthreads/mthreads_monitor.py b/training/mthreads/mthreads_monitor.py
new file mode 100644
index 000000000..092b832df
--- /dev/null
+++ b/training/mthreads/mthreads_monitor.py
@@ -0,0 +1,290 @@
+#!/usr/bin/env python3
+# encoding: utf-8
+'''
+Usage:  python3 mthreads_monitor.py -o operation -l [log_path]
+            -o, --operation     start|stop|restart|status
+            -l, --log           log path , ./logs/ default
+'''
+
+import os
+import sys
+import time
+import signal
+import atexit
+import argparse
+import datetime
+from multiprocessing import Process
+import subprocess
+import schedule
+
+
+class Daemon:
+    '''
+    daemon subprocess class.
+    usage: subclass this daemon and override the run() method.
+    sys-monitor.pid: in the /tmp/, auto del when unexpected exit.
+    verbose: debug mode, disabled default.
+    '''
+
+    def __init__(self,
+                 pid_file,
+                 log_file,
+                 err_file,
+                 gpu_log,
+                 log_path,
+                 rate=5,
+                 stdin=os.devnull,
+                 stdout=os.devnull,
+                 stderr=os.devnull,
+                 home_dir='.',
+                 umask=0o22,
+                 verbose=0):
+        self.stdin = stdin
+        self.stdout = stdout
+        self.stderr = stderr
+        self.home_dir = home_dir
+        self.verbose = verbose
+        self.pidfile = pid_file
+        self.logfile = log_file
+        self.errfile = err_file
+        self.gpufile = gpu_log
+        self.logpath = log_path
+        self.rate = rate
+        self.umask = umask
+        self.verbose = verbose
+        self.daemon_alive = True
+
+    def get_pid(self):
+        try:
+            with open(self.pidfile, 'r') as pf:
+                pid = int(pf.read().strip())
+        except IOError:
+            pid = None
+        except SystemExit:
+            pid = None
+        return pid
+
+    def del_pid(self):
+        if os.path.exists(self.pidfile):
+            os.remove(self.pidfile)
+
+    def run(self):
+        '''
+        NOTE: override the method in subclass
+        '''
+
+        def gpu_mon(file):
+            TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S')
+            # TODO more elegant way?
+            cmd = "mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | "
+            cmd += "awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'"
+            process = subprocess.Popen(cmd,
+                                       shell=True,
+                                       stdout=subprocess.PIPE,
+                                       stderr=subprocess.STDOUT,
+                                       encoding='utf-8')
+            try:
+                out = process.communicate(timeout=10)
+            except subprocess.TimeoutExpired:
+                process.kill()
+                out = process.communicate()
+
+            if process.returncode != 0:
+                result = "error"
+            result = TIMESTAMP + "\n" + out[0] + "\n"
+            with open(file, 'a') as f:
+                f.write(result)
+
+        def timer_gpu_mon():
+            gpu_process = Process(target=gpu_mon, args=(self.gpufile, ))
+            gpu_process.start()
+
+        schedule.every(self.rate).seconds.do(timer_gpu_mon)
+        while True:
+            schedule.run_pending()
+            time.sleep(5)
+
+    def daemonize(self):
+        if self.verbose >= 1:
+            print('daemon process starting ...')
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #1 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        os.chdir(self.home_dir)
+        os.setsid()
+        os.umask(self.umask)
+        try:
+            pid = os.fork()
+            if pid > 0:
+                sys.exit(0)
+        except OSError as e:
+            sys.stderr.write('fork #2 failed: %d (%s)\n' %
+                             (e.errno, e.strerror))
+            sys.exit(1)
+        sys.stdout.flush()
+        sys.stderr.flush()
+        si = open(self.stdin, 'r')
+        so = open(self.stdout, 'a+')
+        if self.stderr:
+            se = open(self.stderr, 'a+')
+        else:
+            se = so
+        os.dup2(si.fileno(), sys.stdin.fileno())
+        os.dup2(so.fileno(), sys.stdout.fileno())
+        os.dup2(se.fileno(), sys.stderr.fileno())
+        atexit.register(self.del_pid)
+        pid = str(os.getpid())
+        with open(self.pidfile, 'w+') as f:
+            f.write('%s\n' % pid)
+
+    def start(self):
+        if not os.path.exists(self.logpath):
+            os.makedirs(self.logpath)
+        elif os.path.exists(self.gpufile):
+            os.remove(self.gpufile)
+        if self.verbose >= 1:
+            print('ready to start ......')
+        # check for a pid file to see if the daemon already runs
+        pid = self.get_pid()
+        if pid:
+            msg = 'pid file %s already exists, is it already running?\n'
+            sys.stderr.write(msg % self.pidfile)
+            sys.exit(1)
+        # start the daemon
+        self.daemonize()
+        self.run()
+
+    def stop(self):
+        if self.verbose >= 1:
+            print('stopping ...')
+        pid = self.get_pid()
+        if not pid:
+            msg = 'pid file [%s] does not exist. Not running?\n' % self.pidfile
+            sys.stderr.write(msg)
+            if os.path.exists(self.pidfile):
+                os.remove(self.pidfile)
+            return
+        # try to kill the daemon process
+        try:
+            i = 0
+            while 1:
+                os.kill(pid, signal.SIGTERM)
+                time.sleep(1)
+                i = i + 1
+                if i % 10 == 0:
+                    os.kill(pid, signal.SIGHUP)
+        except OSError as err:
+            err = str(err)
+            if err.find('No such process') > 0:
+                if os.path.exists(self.pidfile):
+                    os.remove(self.pidfile)
+            else:
+                print(str(err))
+                sys.exit(1)
+            if self.verbose >= 1:
+                print('Stopped!')
+
+    def restart(self):
+        self.stop()
+        self.start()
+
+    def status(self):
+        pid = self.get_pid()
+        if pid:
+            if os.path.exists('/proc/%d' % pid):
+                return pid
+        return False
+
+
+def parse_args():
+    ''' Check script input parameter. '''
+    parse = argparse.ArgumentParser(description='Sys monitor script')
+    parse.add_argument('-o',
+                       type=str,
+                       metavar='[operation]',
+                       required=True,
+                       help='start|stop|restart|status')
+    parse.add_argument('-l',
+                       type=str,
+                       metavar='[log_path]',
+                       required=False,
+                       default='./logs/',
+                       help='log path')
+    args = parse.parse_args()
+    return args
+
+
+def get_system_info():
+    cmd = r"echo OS version:;"
+    cmd = cmd + r"cat /etc/issue | head -n1 | awk '{print $1, $2, $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo OS Kernel version:;"
+    cmd = cmd + r"uname -r;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Hardware Model:;"
+    cmd = cmd + r"sudo dmidecode | grep -A9 'System Information' | tail -n +2 | sed 's/^[ \t]*//';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Model:;"
+    cmd = cmd + r"mthreads-gmi -L;"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Accelerator Driver version:;"
+    cmd = cmd + r"mthreads-gmi | grep 'Driver Version' | awk '{print $3}';"
+    cmd = cmd + r"echo ;"
+
+    cmd = cmd + r"echo Docker version:;"
+    cmd = cmd + r"docker -v"
+
+    return cmd
+
+
+def main():
+    sample_rate1 = 5
+    args = parse_args()
+    operation = args.o
+    log_path = args.l
+    pid_fn = str('/tmp/gpu_monitor.pid')
+    log_fn = str(log_path + '/mthreads_monitor.log')
+    err_fn = str(log_path + '/mthreads_monitor.err')
+    # result for gpu
+    gpu_fn = str(log_path + '/mthreads_monitor.log')
+    sys_fn = str(log_path + '/sys_info.log')
+    cmd = get_system_info()
+    with open(sys_fn, "w") as f:
+        p = subprocess.Popen(cmd, shell=True, stdout=f, stderr=subprocess.STDOUT)
+        p.wait()
+
+    subdaemon = Daemon(pid_fn,
+                       log_fn,
+                       err_fn,
+                       gpu_fn,
+                       log_path,
+                       verbose=1,
+                       rate=sample_rate1)
+    if operation == 'start':
+        subdaemon.start()
+    elif operation == 'stop':
+        subdaemon.stop()
+    elif operation == 'restart':
+        subdaemon.restart()
+    elif operation == 'status':
+        pid = subdaemon.status()
+        if pid:
+            print('process [%s] is running ......' % pid)
+        else:
+            print('daemon process [%s] stopped' % pid)
+    else:
+        print("invalid argument!")
+        sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/training/mthreads/resnet50-pytorch/README.md b/training/mthreads/resnet50-pytorch/README.md
new file mode 100644
index 000000000..3b5048860
--- /dev/null
+++ b/training/mthreads/resnet50-pytorch/README.md
@@ -0,0 +1,52 @@
+### 1. 数据集准备
+[下载ImageNet2012](../../benchmarks/resnet50) 
+
+### 2. 摩尔线程 MTT S系列 GPU配置与运行信息参考
+#### 环境配置
+- ##### 硬件环境
+    - 硬件
+      - 机器型号： MCCX D800
+      - 加速卡型号: MTT S4000 48GB
+      - CPU型号：Intel(R) Xeon(R) Gold 6430 CPU @ 2.00GHz
+      - 多机网络类型、带宽: InfiniBand，2*200Gbps
+    
+- ##### 软件环境
+   - OS版本：Ubuntu 20.04 LTS
+   - OS kernel版本: 5.4.0-42-generic
+   - 加速卡驱动版本：2.2.0
+   - Docker版本: 20.10.24
+   - 训练框架版本：pytorch-2.0.0+torch_musa-git8614ba1
+   - 依赖软件版本:
+     - musa toolkits: 1.5.0+git3d8791d
+     - mcc: 1.5.2+git3730bdd
+     - mublas: 1.2.0+gitd9867b5
+
+### 运行情况
+
+* 通用指标
+
+| 指标名称       | 指标值                  | 特殊说明                              |
+| -------------- | ----------------------- | ------------------------------------- |
+| 任务类别       | 图像分类                |                                       |
+| 模型           | resnet50                |                                       |
+| 数据集         | ImageNet2012            |                                       |
+| 数据精度       | precision,见“性能指标”  | 可选fp32/amp/bf16           |
+| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 |
+| 硬件设备简称   | MTT S4000         |                                       |
+| 硬件存储使用   | mem,见“性能指标”        | 通常称为“显存”,单位为GiB              |
+| 端到端时间     | e2e_time,见“性能指标”   | 总时间+Perf初始化等时间               |
+| 总吞吐量       | p_whole,见“性能指标”    | 实际训练图片数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”    | 不包含每个epoch末尾的评估部分耗时     |
+| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1)      |
+| 训练结果       | acc,见“性能指标”        | 单位为top1分类准确率(acc1)            |
+| 额外修改项     | 无                      |                                       |
+
+* 性能指标
+
+| 配置               | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc  | mem |
+| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- |  ---- | ---- |
+| 单机1卡（1x1） | fp32 | / |  |  |     |  | / | / |
+| 单机8卡（1x8） | fp32 |bs=256,lr=0.8 |     |     |   |    | /| 25.0/48.0 |
+| 单机8卡（1x8） | amp |bs=512,lr=0.2 |     |     |   |    | 73.08| 26.2/48.0 |
+| 单机8卡（1x8） | bf16 |bs=512,lr=0.2 |     |     |   |    | /| 25.7/48.0 |
+| 两机8卡（2x8） | fp32 | / |  |  |     |  | /| /|
diff --git a/training/mthreads/resnet50-pytorch/config/config_S4000x1x1.py b/training/mthreads/resnet50-pytorch/config/config_S4000x1x1.py
new file mode 100644
index 000000000..e3437bec1
--- /dev/null
+++ b/training/mthreads/resnet50-pytorch/config/config_S4000x1x1.py
@@ -0,0 +1,8 @@
+lr = 0.1
+train_batch_size = 256
+eval_batch_size = train_batch_size
+
+dist_backend = "mccl"
+amp = False
+fp16 = False
+
diff --git a/training/mthreads/resnet50-pytorch/config/config_S4000x1x8.py b/training/mthreads/resnet50-pytorch/config/config_S4000x1x8.py
new file mode 100644
index 000000000..7b9b4be72
--- /dev/null
+++ b/training/mthreads/resnet50-pytorch/config/config_S4000x1x8.py
@@ -0,0 +1,8 @@
+lr = 0.8
+train_batch_size = 256
+eval_batch_size = train_batch_size
+
+dist_backend = "mccl"
+amp = False
+fp16 = False
+
diff --git a/training/mthreads/resnet50-pytorch/extern/trainer_adapter.py b/training/mthreads/resnet50-pytorch/extern/trainer_adapter.py
new file mode 100644
index 000000000..cc955f536
--- /dev/null
+++ b/training/mthreads/resnet50-pytorch/extern/trainer_adapter.py
@@ -0,0 +1,35 @@
+import torch
+import torch_musa
+import config
+from driver import dist_pytorch
+
+
+def convert_model(model):
+    model.to(memory_format=torch.channels_last)
+    return model
+
+
+def create_grad_scaler():
+    """create_grad_scaler for mixed precision training"""
+    scaler = torch_musa.amp.GradScaler() if config.amp else None
+    return scaler
+
+
+def train_step(model, batch, optimizer, scaler=None):
+    """train one step"""
+    images, target = batch
+    criterion = torch.nn.CrossEntropyLoss()
+    if scaler:
+        with torch.musa.amp.autocast(enabled=True):
+            output = model(images)
+            loss = criterion(output, target)
+        scaler.scale(loss).backward()
+        scaler.step(optimizer)
+        scaler.update()
+    else:
+        output = model(images)
+        loss = criterion(output, target)
+        loss.backward()
+        optimizer.step()
+
+    return loss
diff --git a/training/nvidia/swin_transformer-pytorch/README.md b/training/nvidia/swin_transformer-pytorch/README.md
index 3bdbfb9b8..a1b9a169d 100644
--- a/training/nvidia/swin_transformer-pytorch/README.md
+++ b/training/nvidia/swin_transformer-pytorch/README.md
@@ -33,7 +33,7 @@
 | 总吞吐量       | p_whole,见“性能指标”                          | 实际训练样本数除以总时间(performance_whole) |
 | 训练吞吐量     | p_train,见“性能指标”                          | 不包含每个epoch末尾的评估部分耗时           |
 | **计算吞吐量** | **p_core,见“性能指标”**                       | 不包含数据IO部分的耗时(p3>p2>p1)            |
-| 训练结果       | val_loss,见“性能指标”                         | 验证loss                                    |
+| 训练结果       | final_acc1,见“性能指标”                         | 验证准确率                                    |
 | 额外修改项     | 无                                            |                                             |
 
 * 性能指标
diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py
index 5b36dced2..0ad664a92 100644
--- a/training/run_benchmarks/config/test_conf.py
+++ b/training/run_benchmarks/config/test_conf.py
@@ -1,7 +1,7 @@
 '''Test Configs, including'''
 # -*-coding:utf-8 -*-
 
-# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin and ascend.
+# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend and mthreads.
 # We will run benchmarks in training/<vendor>
 VENDOR = "nvidia"
 
@@ -19,6 +19,8 @@
 #       "--device=/dev/davinciX --device=/dev/davinci_manager + \
 #        --device=/dev/devmm_svm --device=/dev/hisi_hdc + \
 #        -v /usr/local/Ascend/driver -v /usr/local/dcmi -v /usr/local/bin/npu-smi"
+#   mthreads:
+#       " --env MTHREADS_VISIBLE_DEVICES=all"
 ACCE_CONTAINER_OPT = " --gpus all"
 # XXX_VISIBLE_DEVICE item name in env
 # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are:
@@ -26,6 +28,7 @@
 #   MLU_VISIBLE_DEVICES for cambricon
 #   XPU_VISIBLE_DEVICES for kunlunxin
 #   ASCEND_VISIBLE_DEVICES for ascend
+#   MUSA_VISIBLE_DEVICES for mthreads
 ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES"
 
 # Set pip source, which will be used in preparing envs in container
@@ -84,6 +87,9 @@
     # "longformer:pytorch_1.12:A100:1:8:1": "/raid/dataset/longformer_train/",
     # "detr:pytorch_1.13:A100:1:8:1": "/raid/dataset/detr/coco2017/",
     
+    # "llama2_7b:deepspeed:A100:1:8:1": "/raid/dataset/llama2_7b_pretrain",
+    # "aquila2_7b:flagscale:A100:1:8:1": "/raid/dataset/aquila2_7b_pretrain",
+    
     # "llama1_7B:paddle_2.5.1:TP1PP1SH2SP8A10040G:1:8:1":"/raid/dataset/llama/"
     # "llama1_7B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/llama/"
     # "llama1_7B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/llama/"
@@ -115,6 +121,13 @@
     # "bert:pytorch:R300:1:8:1": "/raid/dataset/bert_large/train",
     # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train",
     # "distilbert:pytorch:R300:1:8:1": "/raid/dataset/distilbert/",
-    # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/"
+    # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/",
+    # "tacotron2:pytorch:R300:1:8:1": "/raid/dataset/tacotron2/LJSpeech/",
+
+    # mthreads cases
+    # "resnet50:pytorch_2.0:S4000:1:8:1": "/data/flagperf/ImageNet",
+    # "retinanet:pytorch_2.0:S4000:1:8:1": "/data/flagperf/coco2017",
+    # "bert_hf:pytorch_2.0:S4000:1:8:1": "/data/flagperf/bert_hf",
+    # "llama2_7b:deepspeed:S4000:1:8:1": "/data/flagperf/llama/openwebtext",
 }
 
diff --git a/training/run_benchmarks/deepspeed/start_deepspeed_task.py b/training/run_benchmarks/deepspeed/start_deepspeed_task.py
index f98c8ed37..154b3cd2f 100644
--- a/training/run_benchmarks/deepspeed/start_deepspeed_task.py
+++ b/training/run_benchmarks/deepspeed/start_deepspeed_task.py
@@ -114,11 +114,16 @@ def main():
     train_script_path = helper.get_train_script_path(task_args)
     config_dir, config_file = helper.get_config_dir_file(task_args)
     config_file = os.path.join(config_dir, config_file)
+    ds_config_file = os.path.join(config_dir, "ds_config.json")
 
     exec_cmd = "cd " + os.path.dirname(train_script_path) + ";"
     exec_cmd = exec_cmd + "deepspeed --num_gpus=" + str(
         task_args.nproc) + " run_pretraining.py"
-    exec_cmd = exec_cmd + " --deepspeed --deepspeed_config ds_config.json --data_dir " + task_args.data_dir
+
+    exec_cmd = exec_cmd + " --deepspeed --deepspeed_config "
+    exec_cmd = exec_cmd + ds_config_file
+    exec_cmd = exec_cmd + " --data_dir " + task_args.data_dir
+
     exec_cmd = exec_cmd + " --flagperf_config " + config_file
     exec_cmd = exec_cmd + " --nproc " + str(
         task_args.nproc) + " --nnodes " + str(task_args.nnodes)