diff --git a/inference/benchmarks/bertLarge/README.md b/inference/benchmarks/bertLarge/README.md index e7fc57df9..96c5cce94 100644 --- a/inference/benchmarks/bertLarge/README.md +++ b/inference/benchmarks/bertLarge/README.md @@ -58,6 +58,25 @@ bert_reference_results_text_md5.txt - XTCL 2.1 +#### 2.3 天数智芯 MR-100 + +- ##### 硬件环境 + - 机器、加速卡型号: MR-100 + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 + - OS kernel版本: 5.15.0-89-generic + - 加速卡驱动版本:3.2.0 + - Docker 版本:24.0.4 + - 依赖软件版本: + - torch-1.13.1+corex.3.2.1 + - onnxsim + +- 推理工具包 + + - IXRT: ixrt-0.8.0+corex.3.2.1 + + ### 4. 运行情况(BERT-Large) * 指标列表 @@ -83,3 +102,5 @@ bert_reference_results_text_md5.txt | tensorrt | fp16 | 32 | 1283.9 | 257.3 | 260.4 | 408.3 | 418.1 | 45.3% | 0.600/0.638 | 17.4/40.0 | | tensorrt | fp32 | 32 | 1868.8 | 150.4 | 152.2 | 190.4 | 194.1 | 42.0% | 0.638/0.638 | 16.9/40.0 | | kunlunxin_xtcl| W32A16 | 32 |/ | / | / | / | / | / | 0.638/0.638| /| +| iluvatar_ixrt| fp16 | 32 |/ | / | / | / | / | / | 0.599/0.638| /| + diff --git a/inference/benchmarks/bertLarge/pytorch/iluvatar_requirements.txt b/inference/benchmarks/bertLarge/pytorch/iluvatar_requirements.txt new file mode 100644 index 000000000..ea980c9cd --- /dev/null +++ b/inference/benchmarks/bertLarge/pytorch/iluvatar_requirements.txt @@ -0,0 +1,2 @@ +transformers +onnxsim \ No newline at end of file diff --git a/inference/configs/bertLarge/vendor_config/iluvatar_configurations.yaml b/inference/configs/bertLarge/vendor_config/iluvatar_configurations.yaml new file mode 100644 index 000000000..3f1c5fec7 --- /dev/null +++ b/inference/configs/bertLarge/vendor_config/iluvatar_configurations.yaml @@ -0,0 +1,5 @@ +ixrt_tmp_path: iluvatar_tmp/bertLarge.trt +compiler: ixrt +# no_validation: true +has_dynamic_axis: false +torchtrt_full_compile: true \ No newline at end of file diff --git a/inference/docker_images/iluvatar/pytorch/packages/README.md b/inference/docker_images/iluvatar/pytorch/packages/README.md index 224dbe2af..9314892bc 100644 --- 
a/inference/docker_images/iluvatar/pytorch/packages/README.md +++ b/inference/docker_images/iluvatar/pytorch/packages/README.md @@ -2,7 +2,7 @@ >联系邮箱: contact-us@iluvatar.com -ixrt-0.7.0+corex.latest.version-cp310-cp310-linux_x86_64.whl +ixrt-0.8.0+corex.latest.version-cp310-cp310-linux_x86_64.whl torchvision-0.14.1+corex.3.2.1.20231006.892-cp310-cp310-linux_x86_64.whl diff --git a/inference/inference_engine/iluvatar/ixrt.py b/inference/inference_engine/iluvatar/ixrt.py index 44fc85c4b..abb5cd17e 100644 --- a/inference/inference_engine/iluvatar/ixrt.py +++ b/inference/inference_engine/iluvatar/ixrt.py @@ -9,7 +9,6 @@ import time import subprocess - class InferModel: class HostDeviceMem(object): @@ -66,27 +65,32 @@ def __init__(self, config, onnx_path, model): def build_engine(self, config, onnx_path): if config.exist_compiler_path is None: - trt_path = config.log_dir + "/" + config.ixrt_tmp_path + ixrt_path = config.log_dir + "/" + config.ixrt_tmp_path - dir_trt_path = os.path.dirname(trt_path) + dir_trt_path = os.path.dirname(ixrt_path) os.makedirs(dir_trt_path, exist_ok=True) time.sleep(10) - trtexec_cmd = "ixrtexec --onnx=" + onnx_path + " --save_engine=" + trt_path + onnxsim_cmd = f"onnxsim {onnx_path} {onnx_path}" + + onnxsim_cmd = subprocess.Popen(onnxsim_cmd, shell=True) + onnxsim_cmd.wait() + + ixrtexec_cmd = "ixrtexec --onnx=" + onnx_path + " --save_engine=" + ixrt_path if config.fp16: - trtexec_cmd += " --precision fp16" + ixrtexec_cmd += " --precision fp16" if config.has_dynamic_axis: - trtexec_cmd += " --minShapes=" + config.minShapes - trtexec_cmd += " --optShapes=" + config.optShapes - trtexec_cmd += " --maxShapes=" + config.maxShapes + ixrtexec_cmd += " --minShapes=" + config.minShapes + ixrtexec_cmd += " --optShapes=" + config.optShapes + ixrtexec_cmd += " --maxShapes=" + config.maxShapes - p = subprocess.Popen(trtexec_cmd, shell=True) + p = subprocess.Popen(ixrtexec_cmd, shell=True) p.wait() else: - trt_path = config.exist_compiler_path + ixrt_path 
= config.exist_compiler_path - with open(trt_path, "rb") as f: + with open(ixrt_path, "rb") as f: return self.runtime.deserialize_cuda_engine(f.read()) def allocate_buffers(self, engine): diff --git a/training/benchmarks/aquila2_7b/flagscale/README.md b/training/benchmarks/aquila2_7b/flagscale/README.md index 27d5eb1b0..b75cdad0c 100644 --- a/training/benchmarks/aquila2_7b/flagscale/README.md +++ b/training/benchmarks/aquila2_7b/flagscale/README.md @@ -4,7 +4,9 @@ aquila2是北京人工智能研究院开源的语言模型,包含基础语言 ## 模型配置及tokenizer准备 -本测试样例为预训练case,需要下载tokenizer,下载链接为https://github.com/FlagOpen/FlagScale/tree/main/examples/aquila/tokenizer。需要在data_dir下创建tokenizer目录,将上述链接中的三个文件下载到此目录中 +本测试样例为预训练case,需要下载tokenizer,下载链接为https://github.com/FlagOpen/FlagScale/tree/main/examples/aquila/tokenizer + +此tokenizer需要下载FlagScale仓库ed55532这一commit版本,需要在data_dir下创建tokenizer目录,将上述链接中的三个文件下载到此目录中 ## 数据准备 @@ -14,4 +16,4 @@ https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.bin https://model.ks3-cn-beijing.ksyuncs.com/nlpdata/pile_wikipedia_demo.idx -将上述两个文件放置于data_dir下。 \ No newline at end of file +将上述两个文件放置于data_dir下。 diff --git a/training/benchmarks/bert_hf/pytorch/train/trainer.py b/training/benchmarks/bert_hf/pytorch/train/trainer.py index 78f40a686..f63e4cd8e 100755 --- a/training/benchmarks/bert_hf/pytorch/train/trainer.py +++ b/training/benchmarks/bert_hf/pytorch/train/trainer.py @@ -82,21 +82,7 @@ def train_one_epoch(self, train_dataloader, eval_dataloader): dist_pytorch.barrier(self.config.vendor) pure_start_time = time.time() - if scaler is not None: - with torch.cuda.amp.autocast(enabled=True): - output = model(input_ids=input_ids, labels=labels) - loss = output.loss - - scaler.scale(loss).backward() - if step % self.config.gradient_accumulation_steps == 0: - scaler.step(optimizer) - scaler.update() - else: - output = model(input_ids=input_ids, labels=labels) - loss = output.loss - loss.backward() - if step % self.config.gradient_accumulation_steps == 0: - optimizer.step() 
+ loss = self.adapter.train_one_step(model, (input_ids, labels), optimizer, step, scaler) if step % self.config.log_freq == 0: print("Train Step " + str(step) + "/" + str(len(data_loader)) + diff --git a/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py b/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py index ba8eaa585..19f8427cf 100755 --- a/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/bert_hf/pytorch/train/trainer_adapter.py @@ -41,3 +41,24 @@ def create_grad_scaler(): """create_grad_scaler for mixed precision training""" scaler = torch.cuda.amp.GradScaler() if config.amp else None return scaler + + +def train_one_step(model, batch_data, optimizer, cur_step, scaler=None): + input_ids, labels = batch_data + if scaler: + with torch.cuda.amp.autocast(enabled=True): + output = model(input_ids=input_ids, labels=labels) + loss = output.loss + + scaler.scale(loss).backward() + if cur_step % config.gradient_accumulation_steps == 0: + scaler.step(optimizer) + scaler.update() + else: + output = model(input_ids=input_ids, labels=labels) + loss = output.loss + loss.backward() + if cur_step % config.gradient_accumulation_steps == 0: + optimizer.step() + + return loss diff --git a/training/benchmarks/driver/dist_pytorch.py b/training/benchmarks/driver/dist_pytorch.py index 6c824c422..2704dcfd5 100755 --- a/training/benchmarks/driver/dist_pytorch.py +++ b/training/benchmarks/driver/dist_pytorch.py @@ -149,6 +149,8 @@ def barrier(vendor="nvidia"): if torch.distributed.is_available() and torch.distributed.is_initialized(): if vendor == "kunlunxin": torch.distributed.barrier() + elif vendor == "mthreads": + torch.distributed.barrier() else: torch.distributed.all_reduce(torch.cuda.FloatTensor(1)) torch.cuda.synchronize() @@ -172,6 +174,23 @@ def init_dist_training_env(config): rank=rank, world_size=world_size) config.n_device = torch.distributed.get_world_size() + elif config.vendor == "mthreads": + import torch_musa 
+ if int(os.environ.get("WORLD_SIZE", 1)) <= 1: + config.device = torch.device("musa") + config.n_device = 1 + else: + torch.musa.set_device(config.local_rank) + host_addr_full = 'tcp://' + os.environ[ + "MASTER_ADDR"] + ':' + os.environ["MASTER_PORT"] + rank = int(os.environ["RANK"]) + world_size = int(os.environ["WORLD_SIZE"]) + torch.distributed.init_process_group(backend=config.dist_backend, + init_method=host_addr_full, + rank=rank, + world_size=world_size) + config.device = torch.device("musa", config.local_rank) + config.n_device = torch.distributed.get_world_size() else: # nvidia if int(os.environ.get("WORLD_SIZE", 1)) <= 1: config.device = torch.device("cuda") diff --git a/training/benchmarks/driver/helper.py b/training/benchmarks/driver/helper.py index c8f406615..de513901e 100644 --- a/training/benchmarks/driver/helper.py +++ b/training/benchmarks/driver/helper.py @@ -74,6 +74,12 @@ def set_seed(self, seed: int, vendor: str = None): elif lower_vendor == "ascend": import mindspore mindspore.set_seed(seed) + elif lower_vendor == "mthreads": + import torch + import torch_musa + torch.manual_seed(seed) + torch.musa.manual_seed(seed) + torch.musa.manual_seed_all(seed) else: # TODO 其他厂商设置seed,在此扩展 pass diff --git a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py index 10ae55779..fc730ac44 100644 --- a/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py +++ b/training/benchmarks/llama2_7b/deepspeed/run_pretraining.py @@ -10,6 +10,11 @@ from importlib import import_module import torch +try: + import torch_musa + DEVICE = 'musa' +except: + DEVICE = 'cuda' from torch.utils.data import DataLoader from torch.utils.data.distributed import DistributedSampler @@ -54,29 +59,32 @@ def get_argument_parser(): def train(model_engine, dataloader): model_engine.train() + device = torch.device(f"{DEVICE}:{args.local_rank}") ave_loss = 0.0 for step, data in enumerate(dataloader): fake_data = 
torch.tensor(data).long() - input_ids = fake_data.to(args.local_rank) - labels = fake_data.to(args.local_rank) + input_ids = fake_data.to(device) + labels = fake_data.to(device) loss = model_engine(input_ids=input_ids, labels=labels).loss model_engine.backward(loss) model_engine.step() ave_loss += loss - if step % 10 == 0 and args.local_rank == 0: + if step > 0 and step % 10 == 0 and args.local_rank == 0: print('Step {}/{}, Loss: {}'.format(step, len(dataloader), ave_loss / 10)) ave_loss = 0.0 -def get_deepspeed_engine(args, model_config_dir, flashattn): +def get_deepspeed_engine(args, model_config_dir): with deepspeed.zero.Init(config_dict_or_path=args.deepspeed_config, enabled=True, mem_efficient_linear=False, mpu=None): - model = get_llama_model(model_config_dir, flashattn) + model = get_llama_model(model_config_dir, args.flashattn) + if args.gradient_checkpointing_enable: + model.gradient_checkpointing_enable() model_engine, _, _, _ = deepspeed.initialize( args=args, model=model, model_parameters=model.parameters()) @@ -107,10 +115,12 @@ def get_metric(texts): theoryflops = getattr(module, 'theoryflops') epochs = getattr(module, 'epochs') flashattn = getattr(module, 'flashattn') + gradient_checkpointing_enable = getattr(module, 'gradient_checkpointing_enable', False) + args.flashattn = flashattn + args.gradient_checkpointing_enable = gradient_checkpointing_enable deepspeed.init_distributed() - model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf"), - flashattn) + model_engine = get_deepspeed_engine(args, os.path.join("llama2_7b_hf")) dataset = get_llama_dataset(args, seqlength, datafilename) logger = logging.getLogger("DeepSpeed") @@ -138,4 +148,8 @@ def get_metric(texts): chip_tps = whole_tps / args.nproc * args.nnodes print("System tokens per second: ", whole_tps) print("Tokens/p/s: ", chip_tps) + + TFLOPS = int(theoryflops/1000000000000) + print("Theory TFLOPS: ", TFLOPS) + print("Tokens/TFLOPS: ", chip_tps / TFLOPS) print("MFU: ", chip_tps * 
7000000000.0 * 6 / theoryflops) diff --git a/training/benchmarks/resnet50/pytorch/train/trainer.py b/training/benchmarks/resnet50/pytorch/train/trainer.py index 52e7d6ae7..b07d90c68 100755 --- a/training/benchmarks/resnet50/pytorch/train/trainer.py +++ b/training/benchmarks/resnet50/pytorch/train/trainer.py @@ -82,22 +82,7 @@ def train_one_epoch(self, train_dataloader, eval_dataloader): pure_start_time = time.time() optimizer.zero_grad() - images, target = batch - if scaler is not None: - with torch.cuda.amp.autocast(enabled=True): - output = model(images) - loss = criterion(output, target) - - scaler.scale(loss).backward() - scaler.step(optimizer) - scaler.update() - else: - output = model(images) - - criterion = torch.nn.CrossEntropyLoss() - loss = criterion(output, target) - loss.backward() - optimizer.step() + loss = self.adapter.train_step(model, batch, optimizer, scaler) if step % self.config.log_freq == 0: print("Train Step " + str(step) + "/" + str(len(data_loader)) + diff --git a/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py b/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py index ba8eaa585..d4b7b4708 100755 --- a/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py +++ b/training/benchmarks/resnet50/pytorch/train/trainer_adapter.py @@ -41,3 +41,23 @@ def create_grad_scaler(): """create_grad_scaler for mixed precision training""" scaler = torch.cuda.amp.GradScaler() if config.amp else None return scaler + + +def train_step(model, batch, optimizer, scaler=None): + """train one step""" + images, target = batch + criterion = torch.nn.CrossEntropyLoss() + if scaler: + with torch.cuda.amp.autocast(enabled=True): + output = model(images) + loss = criterion(output, target) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + output = model(images) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + return loss diff --git a/training/iluvatar/iluvatar_monitor.py 
b/training/iluvatar/iluvatar_monitor.py index 7ba03907d..cc8de6c2d 100644 --- a/training/iluvatar/iluvatar_monitor.py +++ b/training/iluvatar/iluvatar_monitor.py @@ -231,7 +231,7 @@ def get_system_info(): cmd = cmd + r"echo ;" cmd = cmd + r"echo Accelerator Model:;" - cmd = cmd + r"ixsmi -L;" + cmd = cmd + r"export PATH=/usr/local/corex/bin:$PATH; export LD_LIBRARY_PATH=/usr/local/corex/lib; ixsmi -L;" cmd = cmd + r"echo ;" cmd = cmd + r"echo Accelerator Driver version:;" diff --git a/training/iluvatar/mobilenetv2-pytorch/README.md b/training/iluvatar/mobilenetv2-pytorch/README.md index 0c0a34303..62d81319a 100644 --- a/training/iluvatar/mobilenetv2-pytorch/README.md +++ b/training/iluvatar/mobilenetv2-pytorch/README.md @@ -40,7 +40,8 @@ | 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | | --------------------- | --------- | -------------- | -------- | ------- | ------- | ------ | ------ | ----------- | -| BI-V100单机8卡(1x8) | fp32 | bs=256,lr=0.72 | 103759 | 3520 | 3604 | 3651 | 68.61% | 21.6 / 32.0 | +| BI-V100单机8卡(1x8) | fp32 | / | 174534 | 1857 | 1876 | 1885 | 68.52% | 3.6/32.0 | +| BI-V100单机8卡(1x8) | fp32 | bs=256,lr=0.72 | 87559 | 4390 | 4543 | 4625 | 61.92% | 21.6 / 32.0 | | BI-V100单机8卡(1x1) | fp32 | bs=256,lr=0.72 | / | 624 | 632 | 633 | / | 21.4 / 32.0 | | BI-V100单机8卡(2x8) | fp32 | bs=256,lr=0.72 | / | 6835 | 7058 | 7219 | / | 22.2 / 32.0 | diff --git a/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py b/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py index 714586e2e..2c42ee22e 100644 --- a/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py +++ b/training/iluvatar/mobilenetv2-pytorch/config/config_BI-V100x1x8.py @@ -1,5 +1,5 @@ from config_common import * -train_batch_size = 256 -eval_batch_size = 256 +train_batch_size = 32 +eval_batch_size = 32 diff --git a/training/iluvatar/swin_transformer-pytorch/README.md b/training/iluvatar/swin_transformer-pytorch/README.md index 
12c71636c..32318463b 100644 --- a/training/iluvatar/swin_transformer-pytorch/README.md +++ b/training/iluvatar/swin_transformer-pytorch/README.md @@ -8,15 +8,37 @@ - ##### 软件环境 - OS版本:Ubuntu 20.04 - - OS kernel版本: 4.15.0-156-generic x86_64 - - 加速卡驱动版本:3.0.0 - - Docker 版本:20.10.8 - - 训练框架版本:torch-1.10.2+corex.3.0.0 + - OS kernel版本: 5.4.0-148-generic + - 加速卡驱动版本:3.1.0 + - Docker 版本:24.0.5 + - 训练框架版本:torch-1.13.1+corex.3.1.0 - 依赖软件版本:无 ### 运行情况 -| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度 | Steps数 | 性能(samples/s) | -| -------- | --------------- | ----------- | -------- | -------- | ------- | ---------------- | -| 单机8卡 | config_A100x1x8 | | | | | | +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | --------------------------------------------- | ------------------------------------------- | +| 任务类别 | Image Classification && Semantic Segmentation | | +| 模型 | swin_transformer | | +| 数据集 | Imagenet2012 1K | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | Iluvatar BI-V100 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | final_acc1,见“性能指标” | 验证准确率 | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision| fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem | +|----------------------| ---------| ------ | ---------| ------- | ------- | ------ | -------- | --------- | +| BI-V100单机8卡(1x8) | amp | bs=352 | / | / | / | / | 81.23 | 26.8/32.0 | +| BI-V100单机单卡(1x1) | amp | bs=352 | / | / | / | / | / | 26.5/32.0 | +| BI-V100两机8卡(2x8) | amp | bs=352 | / | / | / | / | / | 26.8/32.0 | diff --git a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x1.py b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x1.py new file mode 100644 index 000000000..f594228b8 --- 
/dev/null +++ b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x1.py @@ -0,0 +1,4 @@ +from config_common import * + +cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" +train_batch_size = 352 diff --git a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py index 52ef64da3..f594228b8 100644 --- a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py +++ b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x1x8.py @@ -1,4 +1,4 @@ from config_common import * cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" -train_batch_size = 256 +train_batch_size = 352 diff --git a/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x2x8.py b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x2x8.py new file mode 100644 index 000000000..f594228b8 --- /dev/null +++ b/training/iluvatar/swin_transformer-pytorch/config/config_BI-V100x2x8.py @@ -0,0 +1,4 @@ +from config_common import * + +cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml" +train_batch_size = 352 diff --git a/training/kunlunxin/tacotron2-pytorch/README.md b/training/kunlunxin/tacotron2-pytorch/README.md index db2d8b5e4..55d54fce5 100644 --- a/training/kunlunxin/tacotron2-pytorch/README.md +++ b/training/kunlunxin/tacotron2-pytorch/README.md @@ -18,12 +18,31 @@ ### 运行情况 -| 训练资源 | 配置文件 | 运行时长(s) | 目标val_loss | 收敛val_loss | epoch数 | 性能(samples/s) | -| -------- | --------------- | ----------- | ------------ | ------------ | ------- | --------------- | -| 单机1卡 | config_R300x1x1 | / | | / | | | -| 单机8卡 | config_R300x1x8 | | 0.4852(fp32) | 0.4271(fp32) | 1235 | | -| 两机8卡 | config_R300x2x8 | / | | / | | | +* 通用指标 +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------------- | +| 任务类别 | SpeechSynthesis | | +| 模型 | tacotron2 | | +| 数据集 | LJSpeech | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 | 
+| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | Kunlunxin R300 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | val_loss,见“性能指标” | 验证loss | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | val_loss | mem | +| -------------------- | --------- | --------------- | -------- | ------- | ------- | ------ | -------- | --------- | +| R300单机8卡(1x8) | fp32 | bs=96, lr=0.001 | / | / | / | / | 0.4801 | 26.3/32.0 | +| R300双机8卡(2x8) | fp32 | bs=96, lr=0.001 | / | / | / | / | / | 25.0/32.0 | +| R300单机单卡(1x1) | fp32 | bs=128, lr=0.001 | / | / | / | / | / | 30.5/32.0 | ### 许可证 Apache 2.0 license。 diff --git a/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x1.py b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x1.py new file mode 100644 index 000000000..342338fce --- /dev/null +++ b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x1.py @@ -0,0 +1,9 @@ +from config_common import * + +train_batch_size = 128 +eval_batch_size = train_batch_size + +warmup = 0.2 +learning_rate = 1e-3 + +seed = 23333 diff --git a/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py index 85f55987e..c83fd15fe 100644 --- a/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py +++ b/training/kunlunxin/tacotron2-pytorch/config/config_R300x1x8.py @@ -1,6 +1,6 @@ from config_common import * -train_batch_size = 48 +train_batch_size = 96 eval_batch_size = train_batch_size warmup = 0.2 diff --git a/training/kunlunxin/tacotron2-pytorch/config/config_R300x2x8.py b/training/kunlunxin/tacotron2-pytorch/config/config_R300x2x8.py new file mode 100644 index 000000000..c83fd15fe --- /dev/null +++ 
b/training/kunlunxin/tacotron2-pytorch/config/config_R300x2x8.py @@ -0,0 +1,9 @@ +from config_common import * + +train_batch_size = 96 +eval_batch_size = train_batch_size + +warmup = 0.2 +learning_rate = 1e-3 + +seed = 23333 \ No newline at end of file diff --git a/training/kunlunxin/tacotron2-pytorch/config/environment_variables.sh b/training/kunlunxin/tacotron2-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..0383763a0 --- /dev/null +++ b/training/kunlunxin/tacotron2-pytorch/config/environment_variables.sh @@ -0,0 +1,5 @@ +export XACC=1 +export BKCL_PCIE_RING=1 +export XACC_LOAD_FLAGS=1 +export BKCL_TIMEOUT=1800 +export XMLIR_D_XPU_L3_SIZE=10485760 diff --git a/training/mthreads/README.md b/training/mthreads/README.md new file mode 100644 index 000000000..194b9e73f --- /dev/null +++ b/training/mthreads/README.md @@ -0,0 +1,70 @@ + +# 厂商信息 + +官网: https://www.mthreads.com/ + +摩尔线程智能科技(北京)有限责任公司(简称:摩尔线程)是一家以GPU芯片设计为主的集成电路设计企业,专注于研发设计全功能GPU芯片及相关产品,为科技生态合作伙伴提供强大的计算加速能力。公司致力于创新研发面向“元计算”应用的新一代GPU,构建融合视觉计算、3D图形计算、科学计算及人工智能计算的综合计算平台,建立基于云原生GPU计算的生态系统,助力驱动数字经济发展。 + +摩尔线程MTT S系列全功能GPU支持多样算力,借助覆盖深度学习、图形渲染、视频处理和科学计算的完整MUSA软件栈,可为AI训练、AI推理、大模型、AIGC、云游戏、云渲染、视频云、数字孪生等场景提供通用智能算力支持,旨在为数据中心、智算中心和元计算中心的建设构建坚实算力基础,助力元宇宙中多元应用创新和落地。 + +MUSA软件栈通过musify CUDA代码迁移工具、计算/通信加速库、mcc编译器、musa运行时和驱动实现对CUDA生态的兼容,帮助用户快速完成代码及应用的迁移。通过torch_musa插件,可以实现MTT S系列GPU对原生PyTorch的对接,用户可以无感的把AI模型运行在摩尔线程全功能GPU上。 + +# FlagPerf适配验证环境说明 +## 环境配置参考 + - 硬件 + - 机器型号: MCCX D800 + - 加速卡型号: MTT S4000 48GB + - CPU型号:Intel(R) Xeon(R) Gold 6430 CPU @ 2.00GHz + - 多机网络类型、带宽: InfiniBand,2*200Gbps + - 软件 + - OS版本:Ubuntu 20.04 LTS + - OS kernel版本: 5.4.0-42-generic + - 加速卡驱动版本:2.2.0 + - Docker 版本: 20.10.24 + +## 容器镜像信息 +- 容器构建信息 + - Dockerfile路径:training/mthreads/docker_image/pytorch_2.0/Dockerfile + - 构建后软件安装脚本: training/mthreads/docker_image/pytorch_2.0/pytorch_2.0_install.sh + +- 核心软件信息 + + - AI框架&版本 + - PyTorch: v2.0.0 + + - 其它软件版本 + - torch_musa: 2.0.0+git8614ba1 + - musa toolkits: 
1.5.0+git3d8791d + - mcc: 1.5.2+git3730bdd + - mublas: 1.2.0+gitd9867b5 + + +## 加速卡监控采集 +- 加速卡使用信息采集命令 + + ```bash + mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | \ + awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ \ + { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }' + ``` +- 监控项示例: + ```bash + 45C 109.51W 1MiB 32768MiB 0% + 44C 108.95W 1MiB 32768MiB 0% + 46C 110.87W 1MiB 32768MiB 0% + 43C 104.33W 1MiB 32768MiB 0% + 44C 107.55W 8MiB 32768MiB 0% + 46C 110.51W 8MiB 32768MiB 0% + 44C 106.59W 8MiB 32768MiB 0% + 44C 104.58W 8MiB 32768MiB 0% + ``` +- 加速卡使用信息采集项说明 + +|监控项| 日志文件 | 格式 | +|---|---|---| +|温度| mthreads_monitor.log | xxx C | +|功耗 |mthreads_monitor.log | xxx W | +|显存占用大小 |mthreads_monitor.log |xxx MiB | +|总显存大小 |mthreads_monitor.log |xxx MiB | +|显存使用率 |mthreads_monitor.log |xxx % | + diff --git a/training/mthreads/bert_hf-pytorch/README.md b/training/mthreads/bert_hf-pytorch/README.md new file mode 100644 index 000000000..17cfdb961 --- /dev/null +++ b/training/mthreads/bert_hf-pytorch/README.md @@ -0,0 +1,48 @@ +### 摩尔线程 MTT S系列 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: MCCX D800 + - 加速卡型号: MTT S4000 48GB + - CPU型号: Intel(R) Xeon(R) Gold 6430 CPU @ 2.00GHz + - 多机网络类型、带宽: InfiniBand,2*200Gbps +- ##### 软件环境 + - OS版本:Ubuntu 20.04 LTS + - OS kernel版本: 5.4.0-42-generic + - 加速卡驱动版本: + - Docker 版本:20.10.24 + - 训练框架版本:pytorch-2.0.0+torch_musa-git8614ba1 + - 依赖软件版本: + - musa toolkits: 1.5.0+git3d8791d + - mcc: 1.5.2+git3730bdd + - mublas: 1.2.0+gitd9867b5 + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------- | +| 任务类别 | 自然语言编码 | | +| 模型 | bert-large-uncased | | +| 数据集 | Wikipedia | | +| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MTT S4000 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 
总吞吐量 | p_whole,见“性能指标” | 实际训练序列数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时 | +| 训练结果 | acc,见“性能指标” | masked_lm任务准确率(实际/目标) | +| 额外修改项 | fp16实现方式 | mthreads使用1+8+7格式(bf16)来实现16位浮点数 | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| S4000单机单卡(1x1) | fp32 | \ | | | | | \ | \ | +| S4000单机单卡(1x1) | amp | \ | | | | | \ | \ | +| S4000单机8卡(1x8) | amp | bs=20, lr=2.5e-05 | | | | | 0.658/0.655| 34.2/48.0 | +| S4000两机8卡(2x8) | amp | \ | | | | | \ | \ | + + diff --git a/training/mthreads/bert_hf-pytorch/config/config_S4000x1x1.py b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x1.py new file mode 100644 index 000000000..f0816d4ad --- /dev/null +++ b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x1.py @@ -0,0 +1,11 @@ +vendor = "mthreads" + +train_batch_size = 20 +eval_batch_size = train_batch_size +lr = 0.000005 # fp32/amp +#lr = 0.00005 # bf16 + +dist_backend = "mccl" + +amp = True +fp16 = False diff --git a/training/mthreads/bert_hf-pytorch/config/config_S4000x1x8.py b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x8.py new file mode 100644 index 000000000..3cb18282b --- /dev/null +++ b/training/mthreads/bert_hf-pytorch/config/config_S4000x1x8.py @@ -0,0 +1,11 @@ +vendor = "mthreads" + +train_batch_size = 20 +eval_batch_size = train_batch_size +lr = 5e-06 / 0.2 # fp32/amp +#lr = 5e-05 / 0.6 # bf16 + +dist_backend = "mccl" + +amp = True +fp16 = False diff --git a/training/mthreads/bert_hf-pytorch/config/environment_variables.sh b/training/mthreads/bert_hf-pytorch/config/environment_variables.sh new file mode 100644 index 000000000..5cfe0142c --- /dev/null +++ b/training/mthreads/bert_hf-pytorch/config/environment_variables.sh @@ -0,0 +1,8 @@ +# ================================================= +# Export variables +# 
================================================= + + +export OMP_NUM_THREADS=1 +export MUSA_KERNEL_TIMEOUT=3600000 + diff --git a/training/mthreads/bert_hf-pytorch/config/requirements.txt b/training/mthreads/bert_hf-pytorch/config/requirements.txt new file mode 100644 index 000000000..a81d3b585 --- /dev/null +++ b/training/mthreads/bert_hf-pytorch/config/requirements.txt @@ -0,0 +1 @@ +transformers==4.35.0 diff --git a/training/mthreads/bert_hf-pytorch/extern/trainer_adapter.py b/training/mthreads/bert_hf-pytorch/extern/trainer_adapter.py new file mode 100644 index 000000000..72b0e68b4 --- /dev/null +++ b/training/mthreads/bert_hf-pytorch/extern/trainer_adapter.py @@ -0,0 +1,32 @@ +import torch +import torch_musa + +import config + + +def create_grad_scaler(): + """create_grad_scaler for mixed precision training""" + scaler = torch_musa.amp.GradScaler() if config.amp else None + return scaler + + +def train_one_step(model, batch_data, optimizer, cur_step, scaler=None): + input_ids, labels = batch_data + if scaler: + with torch_musa.amp.autocast(enabled=True): + output = model(input_ids=input_ids, labels=labels) + loss = output.loss + + scaler.scale(loss).backward() + if cur_step % config.gradient_accumulation_steps == 0: + scaler.step(optimizer) + scaler.update() + else: + output = model(input_ids=input_ids, labels=labels) + loss = output.loss + loss.backward() + if cur_step % config.gradient_accumulation_steps == 0: + optimizer.step() + + return loss + \ No newline at end of file diff --git a/training/mthreads/docker_image/deepspeed/Dockerfile b/training/mthreads/docker_image/deepspeed/Dockerfile new file mode 100644 index 000000000..e427ddd80 --- /dev/null +++ b/training/mthreads/docker_image/deepspeed/Dockerfile @@ -0,0 +1,4 @@ +FROM sh-harbor.mthreads.com/mt-ai/lm-qy2:FlagPerf-v1 +ENV PATH=/opt/conda/envs/py38/bin/:/opt/conda/condabin/conda:$PATH +ENV LD_LIBRARY_PATH=/usr/local/musa/lib:$LD_LIBRARY_PATH +RUN ln -sf /usr/bin/bash /usr/bin/sh diff --git 
a/training/mthreads/docker_image/deepspeed/deepspeed_install.sh b/training/mthreads/docker_image/deepspeed/deepspeed_install.sh new file mode 100644 index 000000000..a9bf588e2 --- /dev/null +++ b/training/mthreads/docker_image/deepspeed/deepspeed_install.sh @@ -0,0 +1 @@ +#!/bin/bash diff --git a/training/mthreads/docker_image/pytorch_2.0/Dockerfile b/training/mthreads/docker_image/pytorch_2.0/Dockerfile new file mode 100644 index 000000000..2982c1af5 --- /dev/null +++ b/training/mthreads/docker_image/pytorch_2.0/Dockerfile @@ -0,0 +1,3 @@ +FROM moore-threads/pytorch:flagperf-py38 +ENV PATH /opt/conda/envs/py38/bin:$PATH +ENV LD_LIBRARY_PATH=/usr/local/musa/lib/:$LD_LIBRARY_PATH diff --git a/training/mthreads/docker_image/pytorch_2.0/pytorch_install.sh b/training/mthreads/docker_image/pytorch_2.0/pytorch_install.sh new file mode 100644 index 000000000..cc1f786e8 --- /dev/null +++ b/training/mthreads/docker_image/pytorch_2.0/pytorch_install.sh @@ -0,0 +1 @@ +#!/bin/bash \ No newline at end of file diff --git a/training/mthreads/llama2_7b-deepspeed/README.md b/training/mthreads/llama2_7b-deepspeed/README.md new file mode 100644 index 000000000..ed2fc06af --- /dev/null +++ b/training/mthreads/llama2_7b-deepspeed/README.md @@ -0,0 +1,50 @@ +### Moore Threads S4000 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 机器型号: MCCX D800 + - 加速卡型号: S4000 + - CPU型号: Intel(R) Xeon(R) Gold 6330 CPU @ 2.00GHz + - 多机网络类型、带宽: InfiniBand,2x200Gb/s + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 LTS + - OS kernel版本: 5.4.0-42-generic + - 加速卡驱动版本:2.2.0 + - Docker镜像和版本: PyTorch2.0_musa1.4_ec6a747fd342 + - 训练框架版本:pytorch-2.0.0+torch_musa-git8ea3501 + - 依赖软件版本: + - musa toolkits: 1.4.0+git4e25703 + - mublas: 1.1.0+gite484aa2 + +- ##### 优化策略 + + - scaled dot product attention + - checkpointing + +### 运行情况 + +* 输入批尺寸 + 1. local_batchsize(micro_batchsize),简写为LBS,即实际进入模型的张量批尺寸,为config_S4000x1x8.py中所写,在本case中默认为3 + 2. 
seqlength(max_position_embedding),简写为MPE,即实际进入模型的序列长度,为config_S4000x1x8.py中所写,在本case中默认为4096 + 3. gradient_accumulate_steps,简写为GAS,即梯度累加步数,为ds_config.json中所写,在本case中默认为1 + 4. global_batchsize恒等于local_batchsize\*gradient_accumulate_steps\*data_parallel_size,简写为GBS。在本case中,只存在数据并行,因此data_parallel_size=world_size。 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| ------------ | -------------------------- | ---------------------------------- | +| 任务类别 | 自然语言理解 | | +| 模型 | llama2_7b | | +| 数据集 | openwebtext | 如无特殊说明,训练前1亿个token | +| 数据精度 | amp | | +| 超参修改 | fix_hp,见“性能指标” | 运行必要特殊超参,例如需要改小seqlength避免OOM | +| 硬件设备简称 | S4000 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 计算使用率 | MFU,见“性能指标” | 参见PaLM论文定义 | +| **吞吐量** | **token/p/s,见“性能指标”** | 平均单卡每秒处理的token数 | + +* 性能指标 + +| 配置 | fix_hp | token/p/s | loss | mem | MFU | +| ------------------- | ---------------- | ------ | ------- | --------- | --------- | +| S4000单机8卡(1x8) | / | |44.2/48.0|3.20| | diff --git a/training/mthreads/llama2_7b-deepspeed/config/config_S4000x1x8.py b/training/mthreads/llama2_7b-deepspeed/config/config_S4000x1x8.py new file mode 100644 index 000000000..2011c5b18 --- /dev/null +++ b/training/mthreads/llama2_7b-deepspeed/config/config_S4000x1x8.py @@ -0,0 +1,7 @@ +seqlength = 4096 +batchsize = 6 +datafilename = "openwebtext_llama2_100M.npy" +epochs = 1 +theoryflops = 98000000000000.0 +flashattn = True # using sdp attention +gradient_checkpointing_enable = True diff --git a/training/mthreads/llama2_7b-deepspeed/config/ds_config.json b/training/mthreads/llama2_7b-deepspeed/config/ds_config.json new file mode 100644 index 000000000..01e8c085f --- /dev/null +++ b/training/mthreads/llama2_7b-deepspeed/config/ds_config.json @@ -0,0 +1,40 @@ +{ + "gradient_accumulation_steps": 1, + "train_micro_batch_size_per_gpu": 1, + "prescale_gradients": false, + "zero_allow_untested_optimizer": true, + "optimizer": { + "type": "AdamW", + "params": { + "lr": 1e-5, + "weight_decay": 0.1, + "betas": [ + 0.9, + 0.95 + ], + "eps": 
1e-5 + } + }, + "zero_optimization": { + "stage": 3, + "stage3_max_live_parameters": 1e9, + "stage3_max_reuse_distance": 1e9, + "stage3_prefetch_bucket_size": 1e7, + "sub_group_size": 1e9, + "contiguous_gradients": true, + "allgather_bucket_size": 1e8, + "reduce_bucket_size": 1e7, + "overlap_comm": false, + "reduce_scatter": true + }, + "steps_per_print": 50, + "gradient_clipping": 1.0, + "wall_clock_breakdown": false, + "fp16": { + "enabled": true, + "loss_scale_window": 100 + }, + "bf16": { + "enabled": false + } +} diff --git a/training/mthreads/llama2_7b-deepspeed/config/environment_variables.sh b/training/mthreads/llama2_7b-deepspeed/config/environment_variables.sh new file mode 100644 index 000000000..4ddb3cc29 --- /dev/null +++ b/training/mthreads/llama2_7b-deepspeed/config/environment_variables.sh @@ -0,0 +1,4 @@ +export MUSA_VISIBLE_DEVICES='0,1,2,3,4,5,6,7' +export DS_ACCELERATOR=musa +export MUSA_KERNEL_TIMEOUT=1800000 +export NCCL_PROTOS=2 diff --git a/training/mthreads/llama2_7b-deepspeed/config/requirements.txt b/training/mthreads/llama2_7b-deepspeed/config/requirements.txt new file mode 100644 index 000000000..e69de29bb diff --git a/training/mthreads/mthreads_monitor.py b/training/mthreads/mthreads_monitor.py new file mode 100644 index 000000000..092b832df --- /dev/null +++ b/training/mthreads/mthreads_monitor.py @@ -0,0 +1,290 @@ +# !/usr/bin/env python3 +# encoding: utf-8 +''' +Usage: python3 sys-monitor.py -o operation -l [log_path] + -o, --operation start|stop|restart|status + -l, --log log path , ./logs/ default +''' + +import os +import sys +import time +import signal +import atexit +import argparse +import datetime +from multiprocessing import Process +import subprocess +import schedule + + +class Daemon: + ''' + daemon subprocess class. + usage: subclass this daemon and override the run() method. + sys-monitor.pid: in the /tmp/, auto del when unexpected exit. + verbose: debug mode, disabled default. 
+ ''' + + def __init__(self, + pid_file, + log_file, + err_file, + gpu_log, + log_path, + rate=5, + stdin=os.devnull, + stdout=os.devnull, + stderr=os.devnull, + home_dir='.', + umask=0o22, + verbose=0): + self.stdin = stdin + self.stdout = stdout + self.stderr = stderr + self.home_dir = home_dir + self.verbose = verbose + self.pidfile = pid_file + self.logfile = log_file + self.errfile = err_file + self.gpufile = gpu_log + self.logpath = log_path + self.rate = rate + self.umask = umask + self.verbose = verbose + self.daemon_alive = True + + def get_pid(self): + try: + with open(self.pidfile, 'r') as pf: + pid = int(pf.read().strip()) + except IOError: + pid = None + except SystemExit: + pid = None + return pid + + def del_pid(self): + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + + def run(self): + ''' + NOTE: override the method in subclass + ''' + + def gpu_mon(file): + TIMESTAMP = datetime.datetime.now().strftime('%Y-%m-%d-%H:%M:%S') + # TODO more elegant way? + cmd = "mthreads-gmi -q | grep -E 'GPU Current Temp|Power Draw|Used|Total|Gpu' | " + cmd += "awk -F ': *' '/GPU Current Temp|Power Draw|Used|Total|Gpu/ { values[(NR-1)%5+1] = $2; } NR % 5 == 0 { print values[4], values[5], values[2], values[1], values[3]; }'" + process = subprocess.Popen(cmd, + shell=True, + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT, + encoding='utf-8') + try: + out = process.communicate(timeout=10) + except subprocess.TimeoutExpired: + process.kill() + out = process.communicate() + + if process.returncode != 0: + result = "error" + result = TIMESTAMP + "\n" + out[0] + "\n" + with open(file, 'a') as f: + f.write(result) + + def timer_gpu_mon(): + gpu_process = Process(target=gpu_mon, args=(self.gpufile, )) + gpu_process.start() + + schedule.every(self.rate).seconds.do(timer_gpu_mon) + while True: + schedule.run_pending() + time.sleep(5) + + def daemonize(self): + if self.verbose >= 1: + print('daemon process starting ...') + try: + pid = os.fork() + if pid > 0: + 
sys.exit(0) + except OSError as e: + sys.stderr.write('fork #1 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + os.chdir(self.home_dir) + os.setsid() + os.umask(self.umask) + try: + pid = os.fork() + if pid > 0: + sys.exit(0) + except OSError as e: + sys.stderr.write('fork #2 failed: %d (%s)\n' % + (e.errno, e.strerror)) + sys.exit(1) + sys.stdout.flush() + sys.stderr.flush() + si = open(self.stdin, 'r') + so = open(self.stdout, 'a+') + if self.stderr: + se = open(self.stderr, 'a+') + else: + se = so + os.dup2(si.fileno(), sys.stdin.fileno()) + os.dup2(so.fileno(), sys.stdout.fileno()) + os.dup2(se.fileno(), sys.stderr.fileno()) + atexit.register(self.del_pid) + pid = str(os.getpid()) + with open(self.pidfile, 'w+') as f: + f.write('%s\n' % pid) + + def start(self): + if not os.path.exists(self.logpath): + os.makedirs(self.logpath) + elif os.path.exists(self.gpufile): + os.remove(self.gpufile) + if self.verbose >= 1: + print('ready to start ......') + # check for a pid file to see if the daemon already runs + pid = self.get_pid() + if pid: + msg = 'pid file %s already exists, is it already running?\n' + sys.stderr.write(msg % self.pidfile) + sys.exit(1) + # start the daemon + self.daemonize() + self.run() + + def stop(self): + if self.verbose >= 1: + print('stopping ...') + pid = self.get_pid() + if not pid: + msg = 'pid file [%s] does not exist. 
Not running?\n' % self.pidfile + sys.stderr.write(msg) + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + return + # try to kill the daemon process + try: + i = 0 + while 1: + os.kill(pid, signal.SIGTERM) + time.sleep(1) + i = i + 1 + if i % 10 == 0: + os.kill(pid, signal.SIGHUP) + except OSError as err: + err = str(err) + if err.find('No such process') > 0: + if os.path.exists(self.pidfile): + os.remove(self.pidfile) + else: + print(str(err)) + sys.exit(1) + if self.verbose >= 1: + print('Stopped!') + + def restart(self): + self.stop() + self.start() + + def status(self): + pid = self.get_pid() + if pid: + if os.path.exists('/proc/%d' % pid): + return pid + return False + + +def parse_args(): + ''' Check script input parameter. ''' + parse = argparse.ArgumentParser(description='Sys monitor script') + parse.add_argument('-o', + type=str, + metavar='[operation]', + required=True, + help='start|stop|restart|status') + parse.add_argument('-l', + type=str, + metavar='[log_path]', + required=False, + default='./logs/', + help='log path') + args = parse.parse_args() + return args + + +def get_system_info(): + cmd = r"echo OS version:;" + cmd = cmd + r"cat /etc/issue | head -n1 | awk '{print $1, $2, $3}';" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo OS Kernel version:;" + cmd = cmd + r"uname -r;" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Hardware Model:;" + cmd = cmd + r"sudo dmidecode | grep -A9 'System Information' | tail -n +2 | sed 's/^[ \t]*//';" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Accelerator Model:;" + cmd = cmd + r"mthreads-gmi -L;" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Accelerator Driver version:;" + cmd = cmd + r"mthreads-gmi | grep 'Driver Version' | awk '{print $3}';" + cmd = cmd + r"echo ;" + + cmd = cmd + r"echo Docker version:;" + cmd = cmd + r"docker -v" + + return cmd + + +def main(): + sample_rate1 = 5 + args = parse_args() + operation = args.o + log_path = args.l + pid_fn = str('/tmp/gpu_monitor.pid') + log_fn = 
str(log_path + '/mthreads_monitor.log') + err_fn = str(log_path + '/mthreads_monitor.err') + # result for gpu + gpu_fn = str(log_path + '/mthreads_monitor.log') + sys_fn = str(log_path + '/sys_info.log') + cmd = get_system_info() + with open(sys_fn, "w") as f: + p = subprocess.Popen(cmd, shell=True, stdout=f, stderr=subprocess.STDOUT) + p.wait() + + subdaemon = Daemon(pid_fn, + log_fn, + err_fn, + gpu_fn, + log_path, + verbose=1, + rate=sample_rate1) + if operation == 'start': + subdaemon.start() + elif operation == 'stop': + subdaemon.stop() + elif operation == 'restart': + subdaemon.restart() + elif operation == 'status': + pid = subdaemon.status() + if pid: + print('process [%s] is running ......' % pid) + else: + print('daemon process [%s] stopped' % pid) + else: + print("invalid argument!") + sys.exit(1) + + +if __name__ == '__main__': + main() diff --git a/training/mthreads/resnet50-pytorch/README.md b/training/mthreads/resnet50-pytorch/README.md new file mode 100644 index 000000000..3b5048860 --- /dev/null +++ b/training/mthreads/resnet50-pytorch/README.md @@ -0,0 +1,52 @@ +### 1. 数据集准备 +[下载ImageNet2012](../../benchmarks/resnet50) + +### 2. 
摩尔线程 MTT S系列 GPU配置与运行信息参考 +#### 环境配置 +- ##### 硬件环境 + - 硬件 + - 机器型号: MCCX D800 + - 加速卡型号: MTT S4000 48GB + - CPU型号:Intel(R) Xeon(R) Gold 6430 CPU @ 2.00GHz + - 多机网络类型、带宽: InfiniBand,2*200Gbps + +- ##### 软件环境 + - OS版本:Ubuntu 20.04 LTS + - OS kernel版本: 5.4.0-42-generic + - 加速卡驱动版本:2.2.0 + - Docker版本: 20.10.24 + - 训练框架版本:pytorch-2.0.0+torch_musa-git8614ba1 + - 依赖软件版本: + - musa toolkits: 1.5.0+git3d8791d + - mcc: 1.5.2+git3730bdd + - mublas: 1.2.0+gitd9867b5 + +### 运行情况 + +* 通用指标 + +| 指标名称 | 指标值 | 特殊说明 | +| -------------- | ----------------------- | ------------------------------------- | +| 任务类别 | 图像分类 | | +| 模型 | resnet50 | | +| 数据集 | ImageNet2012 | | +| 数据精度 | precision,见“性能指标” | 可选fp32 | +| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 | +| 硬件设备简称 | MTT S4000 | | +| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB | +| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 | +| 总吞吐量 | p_whole,见“性能指标” | 实际训练图片数除以总时间(performance_whole) | +| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | +| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | +| 训练结果 | acc,见“性能指标” | 单位为top1分类准确率(acc1) | +| 额外修改项 | 无 | | + +* 性能指标 + +| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | acc | mem | +| ------------------ | --------- | ---- | ---- | ---- | ---- | ---- | ---- | ---- | +| 单机1卡(1x1) | fp32 | / | | | | | / | / | +| 单机8卡(1x8) | fp32 |bs=256,lr=0.8 | | | | | /| 25.0/48.0 | +| 单机8卡(1x8) | amp |bs=512,lr=0.2 | | | | | 73.08| 26.2/48.0 | +| 单机8卡(1x8) | bf16 |bs=512,lr=0.2 | | | | | /| 25.7/48.0 | +| 两机8卡(2x8) | fp32 | / | | | | | /| /| diff --git a/training/mthreads/resnet50-pytorch/config/config_S4000x1x1.py b/training/mthreads/resnet50-pytorch/config/config_S4000x1x1.py new file mode 100644 index 000000000..e3437bec1 --- /dev/null +++ b/training/mthreads/resnet50-pytorch/config/config_S4000x1x1.py @@ -0,0 +1,8 @@ +lr = 0.1 +train_batch_size = 256 +eval_batch_size = train_batch_size + +dist_backend = "mccl" +amp = False +fp16 = False + diff --git
a/training/mthreads/resnet50-pytorch/config/config_S4000x1x8.py b/training/mthreads/resnet50-pytorch/config/config_S4000x1x8.py new file mode 100644 index 000000000..7b9b4be72 --- /dev/null +++ b/training/mthreads/resnet50-pytorch/config/config_S4000x1x8.py @@ -0,0 +1,8 @@ +lr = 0.8 +train_batch_size = 256 +eval_batch_size = train_batch_size + +dist_backend = "mccl" +amp = False +fp16 = False + diff --git a/training/mthreads/resnet50-pytorch/extern/trainer_adapter.py b/training/mthreads/resnet50-pytorch/extern/trainer_adapter.py new file mode 100644 index 000000000..cc955f536 --- /dev/null +++ b/training/mthreads/resnet50-pytorch/extern/trainer_adapter.py @@ -0,0 +1,35 @@ +import torch +import torch_musa +import config +from driver import dist_pytorch + + +def convert_model(model): + model.to(memory_format=torch.channels_last) + return model + + +def create_grad_scaler(): + """create_grad_scaler for mixed precision training""" + scaler = torch_musa.amp.GradScaler() if config.amp else None + return scaler + + +def train_step(model, batch, optimizer, scaler=None): + """train one step""" + images, target = batch + criterion = torch.nn.CrossEntropyLoss() + if scaler: + with torch.musa.amp.autocast(enabled=True): + output = model(images) + loss = criterion(output, target) + scaler.scale(loss).backward() + scaler.step(optimizer) + scaler.update() + else: + output = model(images) + loss = criterion(output, target) + loss.backward() + optimizer.step() + + return loss diff --git a/training/nvidia/swin_transformer-pytorch/README.md b/training/nvidia/swin_transformer-pytorch/README.md index 3bdbfb9b8..a1b9a169d 100644 --- a/training/nvidia/swin_transformer-pytorch/README.md +++ b/training/nvidia/swin_transformer-pytorch/README.md @@ -33,7 +33,7 @@ | 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) | | 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 | | **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p3>p2>p1) | -| 训练结果 | val_loss,见“性能指标” | 验证loss | +| 训练结果 | 
final_acc1,见“性能指标” | 验证准确率 | | 额外修改项 | 无 | | * 性能指标 diff --git a/training/run_benchmarks/config/test_conf.py b/training/run_benchmarks/config/test_conf.py index 5b36dced2..0ad664a92 100644 --- a/training/run_benchmarks/config/test_conf.py +++ b/training/run_benchmarks/config/test_conf.py @@ -1,7 +1,7 @@ '''Test Configs, including''' # -*-coding:utf-8 -*- -# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin and ascend. +# Set accelerator's vendor name, e.g. iluvatar, cambricon, kunlunxin, ascend and mthreads. # We will run benchmarks in training/ VENDOR = "nvidia" @@ -19,6 +19,8 @@ # "--device=/dev/davinciX --device=/dev/davinci_manager + \ # --device=/dev/devmm_svm --device=/dev/hisi_hdc + \ # -v /usr/local/Ascend/driver -v /usr/local/dcmi -v /usr/local/bin/npu-smi" +# mthreads: +# " --env MTHREADS_VISIBLE_DEVICES=all" ACCE_CONTAINER_OPT = " --gpus all" # XXX_VISIBLE_DEVICE item name in env # possible value of ACCE_VISIBLE_DEVICE_ENV_NAME are: @@ -26,6 +28,7 @@ # MLU_VISIBLE_DEVICES for cambricon # XPU_VISIBLE_DEVICES for kunlunxin # ASCEND_VISIBLE_DEVICES for ascend +# MUSA_VISIBLE_DEVICES for mthreads ACCE_VISIBLE_DEVICE_ENV_NAME = "CUDA_VISIBLE_DEVICES" # Set pip source, which will be used in preparing envs in container @@ -84,6 +87,9 @@ # "longformer:pytorch_1.12:A100:1:8:1": "/raid/dataset/longformer_train/", # "detr:pytorch_1.13:A100:1:8:1": "/raid/dataset/detr/coco2017/", + # "llama2_7b:deepspeed:A100:1:8:1": "/raid/dataset/llama2_7b_pretrain", + # "aquila2_7b:flagscale:A100:1:8:1": "/raid/dataset/aquila2_7b_pretrain", + # "llama1_7B:paddle_2.5.1:TP1PP1SH2SP8A10040G:1:8:1":"/raid/dataset/llama/" # "llama1_7B:paddle_2.5.1:TP2PP1SH1SP4A10040G:1:8:1":"/raid/dataset/llama/" # "llama1_7B:paddle_2.5.1:TP2PP1SH2SP4A10040G:1:8:1":"/raid/dataset/llama/" @@ -115,6 +121,13 @@ # "bert:pytorch:R300:1:8:1": "/raid/dataset/bert_large/train", # "longformer:pytorch:R300:1:8:1": "/raid/dataset/longformer_train", # "distilbert:pytorch:R300:1:8:1": 
"/raid/dataset/distilbert/", - # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/" + # "swin_transformer:pytorch:R300:1:8:1": "/raid/dataset/ImageNet_1k_2012/", + # "tacotron2:pytorch:R300:1:8:1": "/raid/dataset/tacotron2/LJSpeech/", + + # mthreads cases + # "resnet50:pytorch_2.0:S4000:1:8:1": "/data/flagperf/ImageNet", + # "retinanet:pytorch_2.0:S4000:1:8:1": "/data/flagperf/coco2017", + # "bert_hf:pytorch_2.0:S4000:1:8:1": "/data/flagperf/bert_hf", + # "llama2_7b:deepspeed:S4000:1:8:1": "/data/flagperf/llama/openwebtext", } diff --git a/training/run_benchmarks/deepspeed/start_deepspeed_task.py b/training/run_benchmarks/deepspeed/start_deepspeed_task.py index f98c8ed37..154b3cd2f 100644 --- a/training/run_benchmarks/deepspeed/start_deepspeed_task.py +++ b/training/run_benchmarks/deepspeed/start_deepspeed_task.py @@ -114,11 +114,16 @@ def main(): train_script_path = helper.get_train_script_path(task_args) config_dir, config_file = helper.get_config_dir_file(task_args) config_file = os.path.join(config_dir, config_file) + ds_config_file = os.path.join(config_dir, "ds_config.json") exec_cmd = "cd " + os.path.dirname(train_script_path) + ";" exec_cmd = exec_cmd + "deepspeed --num_gpus=" + str( task_args.nproc) + " run_pretraining.py" - exec_cmd = exec_cmd + " --deepspeed --deepspeed_config ds_config.json --data_dir " + task_args.data_dir + + exec_cmd = exec_cmd + " --deepspeed --deepspeed_config " + exec_cmd = exec_cmd + ds_config_file + exec_cmd = exec_cmd + " --data_dir " + task_args.data_dir + exec_cmd = exec_cmd + " --flagperf_config " + config_file exec_cmd = exec_cmd + " --nproc " + str( task_args.nproc) + " --nnodes " + str(task_args.nnodes)