refine retina
zhouyu committed Jul 20, 2023
1 parent 65656a8 commit 4e91873
Showing 8 changed files with 72 additions and 41 deletions.
5 changes: 5 additions & 0 deletions training/benchmarks/retinanet/pytorch/config/_base.py
@@ -4,6 +4,8 @@
vendor: str = "nvidia"
data_dir: str = None
name: str = "retinanet"
cudnn_benchmark: bool = False
cudnn_deterministic: bool = True

# Optional parameters

@@ -64,3 +66,6 @@
amp: bool = False
sync_bn: bool = False
gradient_accumulation_steps: int = 1


pretrained_path = "resnet50-0676ba61.pth"
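
The two new cuDNN flags default to a reproducibility-first setting (benchmark off, deterministic on). A minimal sketch, not part of this diff, of how a vendor adapter might apply them before training; `apply_cudnn_flags` is a hypothetical helper and `config` the merged _base.py namespace:

import torch

def apply_cudnn_flags(config):
    # benchmark=True lets cuDNN autotune convolution kernels (faster, nondeterministic);
    # deterministic=True forces reproducible kernels at some speed cost.
    torch.backends.cudnn.benchmark = config.cudnn_benchmark
    torch.backends.cudnn.deterministic = config.cudnn_deterministic
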
2 changes: 1 addition & 1 deletion training/benchmarks/retinanet/pytorch/model/__init__.py
@@ -1,9 +1,9 @@
from torchvision.models.detection import retinanet_resnet50_fpn
import torchvision

import os

def create_model():

torchvision.models.resnet.__dict__['model_urls'][
'resnet50'] = 'https://download.pytorch.org/models/resnet50-0676ba61.pth'
return retinanet_resnet50_fpn()
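
The trainer now calls create_model(self.config), while the file above still shows a zero-argument create_model. A hedged sketch of a config-aware variant that loads the ResNet-50 backbone weights from the local config.pretrained_path instead of downloading them; this shape is an assumption, not the committed implementation:

import torch
from torchvision.models.detection import retinanet_resnet50_fpn

def create_model(config):
    # Build RetinaNet without downloading any weights.
    model = retinanet_resnet50_fpn(pretrained=False, pretrained_backbone=False)
    state_dict = torch.load(config.pretrained_path, map_location="cpu")
    # strict=False tolerates the fc.* classifier keys that the detection
    # backbone (an ImageNet ResNet-50 without its classifier head) does not contain.
    model.backbone.body.load_state_dict(state_dict, strict=False)
    return model
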
24 changes: 12 additions & 12 deletions training/benchmarks/retinanet/pytorch/run_pretraining.py
@@ -19,7 +19,7 @@
from driver.helper import InitHelper
from dataloaders.dataloader import get_coco_api_from_dataset

# TODO Import the relevant modules, methods, and variables. Keep the names consistent here; the implementation may differ.
# Import the relevant modules, methods, and variables. Keep the names consistent here; the implementation may differ.
from train import trainer_adapter
from train.evaluator import Evaluator
from train.trainer import Trainer
@@ -40,6 +40,7 @@ def main() -> Tuple[Any, Any]:
config = model_driver.config
dist_pytorch.init_dist_training_env(config)
dist_pytorch.barrier(config.vendor)
config.distributed = dist_pytorch.get_world_size() > 1
model_driver.event(Event.INIT_START)

# logger
@@ -76,11 +77,8 @@ def main() -> Tuple[Any, Any]:

# evaluation statistics
init_evaluation_start = time.time() # evaluation start time, in seconds

trainer.evaluate(trainer.model, eval_dataloader, device=trainer.device)

init_evaluation_end = time.time() # evaluation end time, in seconds

init_evaluation_info = dict(time=init_evaluation_end -
init_evaluation_start)

@@ -97,7 +95,7 @@
# TRAIN_START
dist_pytorch.barrier(config.vendor)
model_driver.event(Event.TRAIN_START)
raw_train_start_time = logger.previous_log_time # training start time, in ms
train_start_time = time.time()

# training loop
epoch = 0
@@ -107,12 +105,8 @@
epoch += 1

# TRAIN_END event
training_state.train_time = time.time() - train_start_time
model_driver.event(Event.TRAIN_END)
raw_train_end_time = logger.previous_log_time # training end time, in ms

# training duration, in seconds
training_state.raw_train_time = (raw_train_end_time -
raw_train_start_time) / 1e+3

return config, training_state

@@ -127,10 +121,16 @@ def main() -> Tuple[Any, Any]:
e2e_time = time.time() - start
if config_update.do_train:

training_perf = state.num_trained_samples / state.raw_train_time
finished_info = {
"e2e_time": e2e_time,
"training_samples_per_second": training_perf,
"train_time": state.train_time,
"train_no_eval_time": state.no_eval_time,
"pure_training_computing_time": state.pure_compute_time,
"throughput(ips)_raw": state.num_trained_samples / state.traintime,
"throughput(ips)_no_eval":
state.num_trained_samples / state.no_eval_time,
"throughput(ips)_pure_compute":
state.num_trained_samples / state.pure_compute_time,
"converged": state.converged,
"final_mAP": state.eval_mAP,
"raw_train_time": state.raw_train_time,
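
The three throughput figures above come from nested timers, so their values are ordered. A small sketch of the relationship; `report_throughputs` is a hypothetical helper and the zero-division guards are not in the original code:

def report_throughputs(state):
    # The timers nest: pure_compute_time <= no_eval_time <= train_time,
    # so throughput(ips)_pure_compute >= throughput(ips)_no_eval >= throughput(ips)_raw.
    n = state.num_trained_samples
    return {
        "throughput(ips)_raw": n / state.train_time if state.train_time else 0.0,
        "throughput(ips)_no_eval": n / state.no_eval_time if state.no_eval_time else 0.0,
        "throughput(ips)_pure_compute": n / state.pure_compute_time if state.pure_compute_time else 0.0,
    }
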
28 changes: 14 additions & 14 deletions training/benchmarks/retinanet/pytorch/train/trainer.py
@@ -5,24 +5,17 @@
import time
import torch
import torch.utils.data
import torchvision
from torch.types import Device
import os
import sys

from model import create_model
from optimizers import create_optimizer
from schedulers import create_scheduler
from train.evaluator import Evaluator
from train.training_state import TrainingState

from dataloaders.dataloader import get_coco_api_from_dataset

import utils.utils

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../")))
from driver import Driver, Event, dist_pytorch
from driver import Driver, dist_pytorch


class Trainer:
@@ -41,7 +34,7 @@ def init(self):
torch.set_num_threads(1)
device = torch.device(self.config.device)
dist_pytorch.main_proc_print("Init progress:")
self.model = create_model()
self.model = create_model(self.config)
self.model.to(self.device)

self.model = self.adapter.convert_model(self.model)
@@ -56,11 +49,15 @@ def train_one_epoch(self, train_dataloader, eval_dataloader):
optimizer = self.optimizer
data_loader = train_dataloader
device = self.device
epoch = self.training_state.epoch
state = self.training_state
config = self.config
epoch = state.epoch

if self.config.distributed:
train_dataloader.batch_sampler.sampler.set_epoch(epoch)

model.train()
noeval_start_time = time.time()
metric_logger = utils.utils.MetricLogger(delimiter=" ")
metric_logger.add_meter(
'lr', utils.utils.SmoothedValue(window_size=1, fmt='{value:.6f}'))
@@ -81,6 +78,7 @@ def train_one_epoch(self, train_dataloader, eval_dataloader):
targets = [{k: v.to(device)
for k, v in t.items()} for t in targets]

pure_compute_start_time = time.time()
loss_dict = model(images, targets)

losses = sum(loss for loss in loss_dict.values())
@@ -104,13 +102,16 @@ def train_one_epoch(self, train_dataloader, eval_dataloader):
metric_logger.update(loss=losses_reduced, **loss_dict_reduced)
metric_logger.update(lr=optimizer.param_groups[0]["lr"])

self.training_state.pure_compute_time += time.time(
) - pure_compute_start_time

self.lr_scheduler.step()
state.num_trained_samples += len(data_loader.dataset)
self.training_state.no_eval_time += time.time() - noeval_start_time

# evaluate
self.evaluate(self.model, eval_dataloader, device=self.device)

state = self.training_state
config = self.config

state.eval_mAP = self.evaluator.coco_eval['bbox'].stats.tolist()[0]
print(state.eval_mAP)
if state.eval_mAP >= config.target_mAP:
@@ -121,7 +122,6 @@ def train_one_epoch(self, train_dataloader, eval_dataloader):

if epoch >= config.max_epoch:
state.end_training = True
state.num_trained_samples += len(data_loader.dataset)

@torch.no_grad()
def evaluate(self, model, data_loader, device):
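
The timers added above wrap different parts of the epoch. A condensed sketch, not the real trainer and with simplified names, of where each accumulator starts and stops:

import time

def timed_epoch(model, optimizer, data_loader, state, evaluate_fn):
    # no_eval_time covers the whole training part of the epoch, including data
    # loading, but not evaluation; pure_compute_time covers only the span from
    # the forward pass to the optimizer step, excluding data loading.
    no_eval_start = time.time()
    for images, targets in data_loader:
        compute_start = time.time()          # data loading happened above this line
        loss_dict = model(images, targets)   # forward pass (training mode returns losses)
        loss = sum(loss_dict.values())
        optimizer.zero_grad()
        loss.backward()                      # backward and step stay inside the compute timer
        optimizer.step()
        state.pure_compute_time += time.time() - compute_start
    state.num_trained_samples += len(data_loader.dataset)
    state.no_eval_time += time.time() - no_eval_start
    evaluate_fn(model)                       # evaluation falls outside no_eval_time
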
10 changes: 1 addition & 9 deletions training/benchmarks/retinanet/pytorch/train/trainer_adapter.py
@@ -1,28 +1,21 @@
# Copyright (c) 2023 BAAI. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License")
import os
import sys
import torch

import torch.distributed as dist
from torch.optim import Optimizer
from torch import nn, Tensor
from torch.nn.parallel import DistributedDataParallel as DDP
import config

CURR_PATH = os.path.abspath(os.path.dirname(__file__))
sys.path.append(os.path.abspath(os.path.join(CURR_PATH, "../../../")))
import config
from driver.dist_pytorch import main_proc_print


def convert_model(model: nn.Module) -> nn.Module:
"""convert_model"""
return model


def model_to_fp16(model: nn.Module) -> nn.Module:
"""model_to_fp16"""
# To prevent OOM for model sizes that cannot fit in GPU memory in full precision
if config.fp16:
main_proc_print(" > use fp16...")
@@ -31,7 +24,6 @@ def model_to_fp16(model: nn.Module) -> nn.Module:


def model_to_ddp(model: nn.Module) -> nn.Module:
"""model_to_ddp"""
if dist.is_available() and dist.is_initialized():
model = DDP(model, device_ids=[config.local_rank])
return model
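
Only convert_model is visible being called in this diff (in Trainer.init). A hedged sketch of how the three adapter hooks are typically chained during model setup; the call order beyond convert_model is an assumption, not shown in the commit:

def build_model(config, device, adapter):
    from model import create_model           # repo-local package from this benchmark

    model = create_model(config)
    model.to(device)
    model = adapter.convert_model(model)      # vendor-specific rewrites; identity here
    model = adapter.model_to_fp16(model)      # .half() only when config.fp16 is set
    model = adapter.model_to_ddp(model)       # DDP wrap when torch.distributed is initialized
    return model
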
8 changes: 7 additions & 1 deletion training/benchmarks/retinanet/pytorch/train/training_state.py
@@ -8,7 +8,6 @@

@dataclass
class TrainingState:
"""TrainingState dataclass"""
_trainer = None
_status = 'aborted' # later set to 'success' if termination criteria met

@@ -19,12 +18,19 @@ class TrainingState:

epoch: int = 1
num_trained_samples = 0

# state related
end_training: bool = False
converged: bool = False

# time related
init_time = 0
raw_train_time = 0

train_time = 0
no_eval_time = 0
pure_compute_time = 0.0

train_start_timestamp = 0

def status(self):
34 changes: 30 additions & 4 deletions training/nvidia/retinanet-pytorch/README.md
@@ -29,9 +29,35 @@ torchvision.models.resnet.__dict__['model_urls'][
- Dependent software versions: none


### Results
| Training resources | Config file     | Run time (s) | Target accuracy | Converged accuracy (mAP) | Performance (samples/s) |
| ------------------ | --------------- | ------------ | --------------- | ------------------------ | ----------------------- |
| 1 node, 8 GPUs     | config_A100x1x8 | 14978.137    | 0.35            | 0.3528                   | 140.92                  |
* General metrics

| Metric name              | Metric value                          | Notes                                                                                 |
| ------------------------ | ------------------------------------- | ------------------------------------------------------------------------------------- |
| Task category            | Object detection                      |                                                                                       |
| Model                    | retinanet                             |                                                                                       |
| Dataset                  | COCO2017                              |                                                                                       |
| Backbone                 | resnet50                              | pretrained on ImageNet2012                                                            |
| Data precision           | precision, see "Performance metrics"  | fp32/amp/fp16 selectable                                                              |
| Per-GPU batch size       | bs, see "Performance metrics"         | i.e. local batch_size                                                                 |
| Hyperparameter changes   | fix_hp, see "Performance metrics"     | special hyperparameters needed to saturate the hardware during throughput evaluation  |
| Hardware device          | nvidia A100                           |                                                                                       |
| Device memory usage      | mem, see "Performance metrics"        | commonly called "GPU memory", in GiB                                                  |
| End-to-end time          | e2e_time, see "Performance metrics"   | total time, including Perf initialization etc.                                        |
| Overall throughput       | p_whole, see "Performance metrics"    | number of images actually trained divided by total time (performance_whole)           |
| Training throughput      | p_train, see "Performance metrics"    | excludes the evaluation time at the end of each epoch                                 |
| **Compute throughput**   | **p_core, see "Performance metrics"** | excludes data I/O time (p_core > p_train > p_whole)                                   |
| Training result          | acc, see "Performance metrics"        | reported as COCO bbox mAP                                                             |
| Additional modifications |                                       |                                                                                       |

* Performance metrics

| Config                          | precision | fix_hp        | e2e_time | p_whole | p_train | p_core | acc    | mem       |
| ------------------------------- | --------- | ------------- | -------- | ------- | ------- | ------ | ------ | --------- |
| A100 single node, 8 GPUs (1x8)  | fp32      | bs=16,lr=0.08 | 15253    | 138     | 152     | 164    | 0.3529 | 38.8/40.0 |
| A100 single node, 1 GPU (1x1)   | fp32      | bs=16,lr=0.08 |          |         |         |        |        |           |
| A100 2 nodes, 8 GPUs each (2x8) | fp32      | bs=16,lr=0.08 |          |         |         |        |        |           |


Training accuracy reference: [torchvision.models — Torchvision 0.8.1 documentation (pytorch.org)](https://pytorch.org/vision/0.8/models.html?highlight=faster#torchvision.models.detection.retinanet_resnet50_fpn)
2 changes: 2 additions & 0 deletions training/nvidia/retinanet-pytorch/config/config_A100x1x8.py
@@ -1,3 +1,5 @@
vendor: str = "nvidia"

train_batch_size = 16
eval_batch_size = 16
lr = 0.08
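
For reference, with these values on the single-node, 8-GPU layout from the README, the effective global batch size works out as below; a trivial sketch, where world_size = 8 is taken from the 1x8 configuration name rather than from this file:

train_batch_size = 16            # per-GPU batch size from this config
world_size = 8                   # 1 node x 8 A100s (assumed from the 1x8 layout)
global_batch_size = train_batch_size * world_size  # = 128 samples per optimizer step
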
