Skip to content

Commit

Permalink
refine bigtransfer (#317)
Browse files Browse the repository at this point in the history
* refine bigtransfer, add configs and update results

* update readme

---------

Co-authored-by: zhouyu <[email protected]>
  • Loading branch information
yuzhou03 and zhouyu authored Nov 10, 2023
1 parent 38fc5e8 commit ef9c582
Show file tree
Hide file tree
Showing 8 changed files with 65 additions and 17 deletions.
10 changes: 6 additions & 4 deletions training/benchmarks/bigtransfer/pytorch/config/_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,11 @@
data_dir: str = None
name: str = "bigtransfer"

# torch.backends.cudnn.benchmark
cudnn_benchmark: bool = False
# torch.backends.cudnn.deterministic
cudnn_deterministic: bool = True

# Optional parameters

# =========================================================
Expand Down Expand Up @@ -47,10 +52,7 @@
# utils
# =========================================================
seed: int = 0
# torch.backends.cudnn.benchmark
cudnn_benchmark: bool = False
# torch.backends.cudnn.deterministic
cudnn_deterministic: bool = True


dist_backend: str = 'nccl'
num_workers: int = 8
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
'momentum',
'lr_steps',
'lr_gamma',
'batch_size',
'train_batch_size',
'eval_batch_size',
'fp16',
Expand Down
13 changes: 7 additions & 6 deletions training/benchmarks/bigtransfer/pytorch/run_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,6 @@
from typing import Any, Tuple

# 三方库
import numpy as np

# benchmarks目录 append到sys.path
CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH,
Expand Down Expand Up @@ -91,7 +89,7 @@ def main() -> Tuple[Any, Any]:
# TRAIN_START
dist_pytorch.barrier(config.vendor)
model_driver.event(Event.TRAIN_START)
raw_train_start_time = logger.previous_log_time # 训练起始时间,单位为ms
raw_train_start_time = time.time() # 训练起始时间,单位为秒

# 训练过程
epoch = -1
Expand All @@ -103,11 +101,9 @@ def main() -> Tuple[Any, Any]:

# TRAIN_END事件
model_driver.event(Event.TRAIN_END)
raw_train_end_time = logger.previous_log_time # 训练结束时间,单位为ms

# 训练时长,单位为秒
training_state.raw_train_time = (raw_train_end_time -
raw_train_start_time) / 1e+3
training_state.raw_train_time = time.time() - raw_train_start_time

return config, training_state

Expand All @@ -130,5 +126,10 @@ def main() -> Tuple[Any, Any]:
"final_accuracy": state.eval_mAP,
"raw_train_time": state.raw_train_time,
"init_time": state.init_time,
"num_trained_samples": state.num_trained_samples,
"pure_training_computing_time": state.pure_compute_time,
"throughput(ips)_raw": state.num_trained_samples / state.raw_train_time,
"throughput(ips)_no_eval": state.num_trained_samples / state.no_eval_time,
"throughput(ips)_pure_compute": state.num_trained_samples / state.pure_compute_time,
}
logger.log(Event.FINISHED, message=finished_info, stacklevel=0)
12 changes: 8 additions & 4 deletions training/benchmarks/bigtransfer/pytorch/train/trainer.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,6 @@
import math
import time
import torch
import torch.utils.data
import torchvision
from torch.types import Device
import os
import sys
Expand All @@ -15,7 +13,7 @@

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
from driver import Driver, Event, dist_pytorch
from driver import Driver, dist_pytorch


class Trainer:
Expand Down Expand Up @@ -53,10 +51,10 @@ def train_one_epoch(self, train_dataloader):
model = self.model
optimizer = self.optimizer
device = self.device
epoch = self.training_state.epoch
config = self.config

model.train()
no_eval_start_time = time.time()
mixup = 0.1
cri = torch.nn.CrossEntropyLoss().to(device)

Expand All @@ -67,6 +65,7 @@ def train_one_epoch(self, train_dataloader):
# Schedule sending to GPU(s)
x = x.to(device)
y = y.to(device)
self.training_state.num_trained_samples += x.size(0) * config.n_device

# Update learning-rate, including stop training if over.
lr = self.get_lr(step, config)
Expand All @@ -77,6 +76,7 @@ def train_one_epoch(self, train_dataloader):

x, y_a, y_b = self.mixup_data(x, y, mixup_l)

pure_compute_start_time = time.time()
# compute output
logits = model(x)
c = self.mixup_criterion(cri, logits, y_a, y_b, mixup_l)
Expand All @@ -97,9 +97,13 @@ def train_one_epoch(self, train_dataloader):
if need_update:
optimizer.step()
optimizer.zero_grad()

self.training_state.pure_compute_time += time.time() - pure_compute_start_time
# Sample new mixup ratio for next batch
mixup_l = np.random.beta(mixup, mixup)

self.training_state.no_eval_time += time.time() - no_eval_start_time

all_c, all_top1, all_top5 = self.evaluator.evaluate(model, device)

state = self.training_state
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,12 @@ class TrainingState:
end_training: bool = False
converged: bool = False

num_trained_samples: int = 0

init_time = 0
raw_train_time = 0
no_eval_time = 0.0
pure_compute_time = 0.0

def converged_success(self):
"""converged success"""
Expand Down
33 changes: 30 additions & 3 deletions training/nvidia/bigtransfer-pytorch/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -24,9 +24,36 @@


### 运行情况
| 训练资源 | 配置文件 | 运行时长(s) | 目标精度 | 收敛精度(top1) | 性能(samples/s) |
| -------- | --------------- | ----------- | -------- | ------------- | ----------------- |
| 单机8卡 | config_A100x1x8 | 5771.27 | 0.83 | 0.8411 | 222.02 |



* 通用指标

| 指标名称 | 指标值 | 特殊说明 |
| -------------- | ----------------------------------------------- | ------------------------------------------- |
| 任务类别 | Image Classification && Representation Learning | |
| 模型 | Big Transfer | |
| 数据集 | Imagenet2012 1K | |
| 数据精度 | precision,见“性能指标” | 可选fp32/amp/fp16/tf32 |
| 超参修改 | fix_hp,见“性能指标” | 跑满硬件设备评测吞吐量所需特殊超参 |
| 硬件设备简称 | nvidia A100 | |
| 硬件存储使用 | mem,见“性能指标” | 通常称为“显存”,单位为GiB |
| 端到端时间 | e2e_time,见“性能指标” | 总时间+Perf初始化等时间 |
| 总吞吐量 | p_whole,见“性能指标” | 实际训练样本数除以总时间(performance_whole) |
| 训练吞吐量 | p_train,见“性能指标” | 不包含每个epoch末尾的评估部分耗时 |
| **计算吞吐量** | **p_core,见“性能指标”** | 不包含数据IO部分的耗时(p_core>p_train>p_whole) |
| 训练结果 | final_acc1,见“性能指标” | 单位为top1分类准确率(acc1) |
| 额外修改项 | | |

* 性能指标

| 配置 | precision | fix_hp | e2e_time | p_whole | p_train | p_core | final_acc1 | mem |
| ----------------- | --------- | ------ | -------- | ------- | ------- | ------ | ---------- | --------- |
| A100单机8卡(1x8) | fp32 | / | 5869 | 222 | 225 | 228 | 0.84192 | 31.4/40.0 |
| A100单机8卡(1x8) | fp32 | bs=20 | 5505 | 236 | 240 | 243 | 0.84016 | 37.4/40.0 |
| A100单机单卡(1x1) | fp32 | bs=16 | / | 23.1 | 29.5 | 29.8 | / | 38.1/40.0 |
| A100两机8卡(2x8) | fp32 | bs=20 | / | 459 | 465 | 470 | / | 36.6/40.0 |


训练精度来源:https://paperswithcode.com/paper/large-scale-learning-of-general-visual

Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# Star-import everything config_common exports; the assignments below then
# set (or override) module-level values on top of it.
from config_common import *

# NOTE(review): this path names a Swin Transformer YAML, while the surrounding
# commit is about bigtransfer -- confirm the value is intentional and actually
# consumed rather than a copy-paste leftover.
cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
# Training batch size. NOTE(review): presumably per-device -- verify against
# the code that reads train_batch_size.
train_batch_size = 384

Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
# Star-import everything config_common exports; the assignments below then
# set (or override) module-level values on top of it.
from config_common import *

# NOTE(review): this path names a Swin Transformer YAML, while the surrounding
# commit is about bigtransfer -- confirm the value is intentional and actually
# consumed rather than a copy-paste leftover.
cfg = "configs/swin/swin_tiny_patch4_window7_224.yaml"
# Training batch size. NOTE(review): presumably per-device -- verify against
# the code that reads train_batch_size.
train_batch_size = 384

0 comments on commit ef9c582

Please sign in to comment.