From 20ecc6ff24c0a8f9e2251752f41cf9f14448bab6 Mon Sep 17 00:00:00 2001
From: barondai
Date: Fri, 27 Sep 2024 10:44:25 +0800
Subject: [PATCH] Update metric.py: set metric to support NPU
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Currently, when two or more Ascend servers are used for training, it
fails with the error:

    FileNotFoundError: [Errno 2] No such file or directory:
    '.dist_test/tmpjajv4wn5/part_8.pkl'

This error occurs because training defaults to collect_device='cpu',
which gathers per-rank evaluation results through temporary
tmpxxxxx.pkl files in a .dist_test directory that every node must be
able to read. This patch sets collect_device to 'gpu' (routed to the
NPU by transfer_to_npu) when is_npu_available() is true, so results
are synced over HCCL instead, and no .dist_test directory or
temporary pkl files are needed.

Commands to launch multi-server training:

    # master server (NODE_RANK=0)
    export NNODES=2 NODE_RANK=0 PORT=29500 MASTER_ADDR="10.1.1.1"; \
        setsid /bin/bash tools/dist_train.sh \
        configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8

    # second server (NODE_RANK=1)
    export NNODES=2 NODE_RANK=1 PORT=29500 MASTER_ADDR="10.1.1.1"; \
        setsid /bin/bash tools/dist_train.sh \
        configs/rtmdet/rtmdet_s_8xb32-300e_coco.py 8

NODE_RANK=0 marks the master server; NNODES=2 means two servers take
part in the training.
---
 mmengine/evaluator/metric.py | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/mmengine/evaluator/metric.py b/mmengine/evaluator/metric.py
index 6e6d40bee3..4d052f269a 100644
--- a/mmengine/evaluator/metric.py
+++ b/mmengine/evaluator/metric.py
@@ -5,12 +5,17 @@
 
 from torch import Tensor
 
+import torch
+import torch_npu
+from torch_npu.contrib import transfer_to_npu
+
 from mmengine.dist import (broadcast_object_list, collect_results,
                            is_main_process)
 from mmengine.fileio import dump
 from mmengine.logging import print_log
 from mmengine.registry import METRICS
 from mmengine.structures import BaseDataElement
+from mmengine.device import is_npu_available
 
 
 class BaseMetric(metaclass=ABCMeta):
@@ -49,7 +54,10 @@ def __init__(self,
                 "`collect_device='cpu'`")
 
         self._dataset_meta: Union[None, dict] = None
-        self.collect_device = collect_device
+        if is_npu_available():
+            self.collect_device = 'gpu'
+        else:
+            self.collect_device = collect_device
         self.results: List[Any] = []
         self.prefix = prefix or self.default_prefix
         self.collect_dir = collect_dir
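
Reviewer note (not part of the commit), with two hedged observations:

1. The module-level `import torch_npu` makes mmengine fail to import on
machines without the Ascend toolchain installed. A minimal sketch of a
guarded import, assuming the goal is to keep CPU/GPU installs working
while still activating the CUDA-to-NPU shim when torch_npu is present:

    # Sketch only, not the committed code: import the Ascend extension
    # if it is installed; otherwise keep the stock CPU/GPU behavior.
    try:
        import torch_npu  # noqa: F401
        from torch_npu.contrib import transfer_to_npu  # noqa: F401
    except ImportError:
        torch_npu = None  # no NPU runtime on this machine

2. On why the patch sets collect_device to 'gpu' rather than 'npu': as
far as I can tell, mmengine's collect_results() dispatches only on
'cpu' (tmpdir plus pickle files) and 'gpu' (in-memory gather over the
process group); with transfer_to_npu loaded, the 'gpu' path runs its
collectives on the NPU via HCCL, which is what removes the need for a
shared .dist_test directory. Be aware, though, that the
`if is_npu_available():` branch overrides any collect_device the
caller passed in, including an explicit 'cpu'; preserving the caller's
choice and changing only the default may be the safer design.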