refine bigtransfer (#317)

* refine bigtransfer, add configs and update results * update readme --------- Co-authored-by: zhouyu <[email protected]>
FlagOpen · Nov 10, 2023 · ef9c582 · ef9c582
1 parent 38fc5e8
commit ef9c582
Show file tree

Hide file tree

Showing 8 changed files with 65 additions and 17 deletions.
diff --git a/training/benchmarks/bigtransfer/pytorch/config/_base.py b/training/benchmarks/bigtransfer/pytorch/config/_base.py
@@ -5,6 +5,11 @@
 data_dir: str = None
 name: str = "bigtransfer"
 
+# torch.backends.cudnn.benchmark
+cudnn_benchmark: bool = False
+# torch.backends.cudnn.deterministic
+cudnn_deterministic: bool = True
+
 # Optional parameters
 
 # =========================================================
@@ -47,10 +52,7 @@
 # utils
 # =========================================================
 seed: int = 0
-# torch.backends.cudnn.benchmark
-cudnn_benchmark: bool = False
-# torch.backends.cudnn.deterministic
-cudnn_deterministic: bool = True
+
 
 dist_backend: str = 'nccl'
 num_workers: int = 8

diff --git a/training/benchmarks/bigtransfer/pytorch/config/mutable_params.py b/training/benchmarks/bigtransfer/pytorch/config/mutable_params.py
@@ -9,6 +9,7 @@
     'momentum',
     'lr_steps',
     'lr_gamma',
+    'batch_size',
     'train_batch_size',
     'eval_batch_size',
     'fp16',

diff --git a/training/benchmarks/bigtransfer/pytorch/run_pretraining.py b/training/benchmarks/bigtransfer/pytorch/run_pretraining.py
@@ -5,8 +5,6 @@
 from typing import Any, Tuple
 
 # 三方库
-import numpy as np
-
 # benchmarks目录 append到sys.path
 CURR_PATH = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.abspath(os.path.join(CURR_PATH,
@@ -91,7 +89,7 @@ def main() -> Tuple[Any, Any]:
     # TRAIN_START
     dist_pytorch.barrier(config.vendor)
     model_driver.event(Event.TRAIN_START)
-    raw_train_start_time = logger.previous_log_time  # 训练起始时间，单位为ms
+    raw_train_start_time = time.time()  # 训练起始时间，单位为秒
 
     # 训练过程
     epoch = -1
@@ -103,11 +101,9 @@ def main() -> Tuple[Any, Any]:
 
     # TRAIN_END事件
     model_driver.event(Event.TRAIN_END)
-    raw_train_end_time = logger.previous_log_time  # 训练结束时间，单位为ms
 
     # 训练时长，单位为秒
-    training_state.raw_train_time = (raw_train_end_time -
-                                     raw_train_start_time) / 1e+3
+    training_state.raw_train_time = time.time() - raw_train_start_time
 
     return config, training_state
 
@@ -130,5 +126,10 @@ def main() -> Tuple[Any, Any]:
         "final_accuracy": state.eval_mAP,
         "raw_train_time": state.raw_train_time,
         "init_time": state.init_time,
+        "num_trained_samples": state.num_trained_samples,
+        "pure_training_computing_time": state.pure_compute_time,
+        "throughput(ips)_raw": state.num_trained_samples / state.raw_train_time,
+        "throughput(ips)_no_eval": state.num_trained_samples / state.no_eval_time,
+        "throughput(ips)_pure_compute": state.num_trained_samples / state.pure_compute_time,
     }
     logger.log(Event.FINISHED, message=finished_info, stacklevel=0)
diff --git a/training/benchmarks/bigtransfer/pytorch/train/trainer.py b/training/benchmarks/bigtransfer/pytorch/train/trainer.py
@@ -1,8 +1,6 @@
-import math
 import time
 import torch
 import torch.utils.data
-import torchvision
 from torch.types import Device
 import os
 import sys
@@ -15,7 +13,7 @@
 
 CURR_PATH = os.path.abspath(os.path.dirname(__file__))
 sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
-from driver import Driver, Event, dist_pytorch
+from driver import Driver, dist_pytorch
 
 
 class Trainer:
@@ -53,10 +51,10 @@ def train_one_epoch(self, train_dataloader):
         model = self.model
         optimizer = self.optimizer
         device = self.device
-        epoch = self.training_state.epoch
         config = self.config
 
         model.train()
+        no_eval_start_time = time.time()
         mixup = 0.1
         cri = torch.nn.CrossEntropyLoss().to(device)
 
@@ -67,6 +65,7 @@ def train_one_epoch(self, train_dataloader):
             # Schedule sending to GPU(s)
             x = x.to(device)
             y = y.to(device)
+            self.training_state.num_trained_samples += x.size(0) * config.n_device
 
             # Update learning-rate, including stop training if over.
             lr = self.get_lr(step, config)
@@ -77,6 +76,7 @@ def train_one_epoch(self, train_dataloader):
 
             x, y_a, y_b = self.mixup_data(x, y, mixup_l)
 
+            pure_compute_start_time = time.time()
             # compute output
             logits = model(x)
             c = self.mixup_criterion(cri, logits, y_a, y_b, mixup_l)
@@ -97,9 +97,13 @@ def train_one_epoch(self, train_dataloader):
             if need_update:
                 optimizer.step()
                 optimizer.zero_grad()
+
+            self.training_state.pure_compute_time += time.time() - pure_compute_start_time    
             # Sample new mixup ratio for next batch
             mixup_l = np.random.beta(mixup, mixup)
 
+        self.training_state.no_eval_time += time.time() - no_eval_start_time    
+
         all_c, all_top1, all_top5 = self.evaluator.evaluate(model, device)
 
         state = self.training_state

diff --git a/training/benchmarks/bigtransfer/pytorch/train/training_state.py b/training/benchmarks/bigtransfer/pytorch/train/training_state.py
@@ -15,8 +15,12 @@ class TrainingState:
     end_training: bool = False
     converged: bool = False
 
+    num_trained_samples: int = 0
+
     init_time = 0
     raw_train_time = 0
+    no_eval_time = 0.0
+    pure_compute_time = 0.0
 
     def converged_success(self):
         """converged success"""

diff --git a/training/nvidia/bigtransfer-pytorch/README.md b/training/nvidia/bigtransfer-pytorch/README.md
@@ -24,9 +24,36 @@
 
 
 ### 运行情况
-| 训练资源 | 配置文件        | 运行时长(s) | 目标精度 | 收敛精度(top1) | 性能（samples/s） |
-| -------- | --------------- | ----------- | -------- | ------------- | ----------------- |
-| 单机8卡  | config_A100x1x8 | 5771.27 | 0.83  | 0.8411     | 222.02       |
+
+
+
+* 通用指标
+
+| 指标名称       | 指标值                                          | 特殊说明                                    |
+| -------------- | ----------------------------------------------- | ------------------------------------------- |
+| 任务类别       | Image Classification && Representation Learning |                                             |
+| 模型           | Big Transfer                                    |                                             |
+| 数据集         | Imagenet2012 1K                                 |                                             |
+| 数据精度       | precision,见“性能指标”                          | 可选fp32/amp/fp16/tf32                      |
+| 超参修改       | fix_hp,见“性能指标”                             | 跑满硬件设备评测吞吐量所需特殊超参          |
+| 硬件设备简称   | nvidia A100                                     |                                             |
+| 硬件存储使用   | mem,见“性能指标”                                | 通常称为“显存”,单位为GiB                    |
+| 端到端时间     | e2e_time,见“性能指标”                           | 总时间+Perf初始化等时间                     |
+| 总吞吐量       | p_whole,见“性能指标”                            | 实际训练样本数除以总时间(performance_whole) |
+| 训练吞吐量     | p_train,见“性能指标”                            | 不包含每个epoch末尾的评估部分耗时           |
+| **计算吞吐量** | **p_core,见“性能指标”**                         | 不包含数据IO部分的耗时(p_core>p_train>p_whole) |
+| 训练结果       | acc,见“性能指标”                                | 单位为top1分类准确率(acc1)                  |
+| 额外修改项     | 无                                              |                                             |
+
+* 性能指标
+
+| 配置              | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem       |
+| ----------------- | --------- | ------ | -------- | ------- | ------- | ------ | ---------- | --------- |
+| A100单机8卡(1x8)  | fp32      | /      | 5869     | 222     | 225     | 228    | 0.84192    | 31.4/40.0 |
+| A100单机8卡(1x8)  | fp32      | bs=20  | 5505     | 236     | 240     | 243    | 0.84016    | 37.4/40.0 |
+| A100单机单卡(1x1) | fp32      | bs=16  | /     | 23.1    | 29.5    | 29.8   | /          | 38.1/40.0 |
+| A100两机8卡(2x8)  | fp32      | bs=20  | /     | 459     | 465     | 470    | /          | 36.6/40.0 |
+
 
 训练精度来源：https://paperswithcode.com/paper/large-scale-learning-of-general-visual
 

diff --git a/training/nvidia/swin_transformer-pytorch/config/config_A100x1x1.py b/training/nvidia/swin_transformer-pytorch/config/config_A100x1x1.py
@@ -0,0 +1,5 @@
+from config_common import *
+
+cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
+train_batch_size = 384
+
diff --git a/training/nvidia/swin_transformer-pytorch/config/config_A100x2x8.py b/training/nvidia/swin_transformer-pytorch/config/config_A100x2x8.py
@@ -0,0 +1,4 @@
+from config_common import *
+
+cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
+train_batch_size = 384