kubeedge · chou-shun · Aug 23, 2021 · Aug 27, 2021 · Aug 27, 2021 · Jul 6, 2022
diff --git a/examples/lib-samples/backend/mindspore/ResNet50/README.md b/examples/lib-samples/backend/mindspore/ResNet50/README.md
@@ -0,0 +1,70 @@
+# ResNet Example with MindSpore Backend
+This document describes how to use the MindSpore backend to train a ResNet-50 network on the CIFAR-10 dataset.
+
+## Preparatory Stage
+### Prepare Dataset
+In this example, we need to prepare the CIFAR-10 dataset in advance and put it into `/home/sedna/examples/backend/mindspore/resnet/`:
+```bash
+cd /home/sedna/examples/backend/mindspore/resnet
+wget http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
+tar -zxvf cifar-10-binary.tar.gz
+```
+### Parameters
+You can change the parameters of the model in `src/config.py`.
+
+## Modeling Stage
+This example supports CPU and NPU. You can follow these steps for training, testing and inference.
+### Train
+> * CPU
+>```bash
+>  bash scripts/run_standalone_train_cpu.sh [DATASET_PATH] [MODEL_SAVE_PATH]
+>  # [MODEL_SAVE_PATH] must be ABSOLUTE PATH
+>  # The log message would be shown in the terminal
+>  # The ckpt file would be saved in [MODEL_SAVE_PATH]
+>```
+> * NPU
+>```bash
+>  bash scripts/run_standalone_train.sh [DATASET_PATH] [MODEL_SAVE_PATH]
+>  # [MODEL_SAVE_PATH] must be ABSOLUTE PATH
+>  # The log message would be saved to scripts/train/log
+>  # The ckpt file would be saved in [MODEL_SAVE_PATH]
+>```
+### Test
+> * CPU
+>```bash
+>  bash scripts/run_test_cpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+>  # [CHECKPOINT_PATH] must be ABSOLUTE PATH
+>  # The log message would be saved to scripts/test/log
+>```
+> * NPU
+>```bash
+>  bash scripts/run_test.sh [DATASET_PATH] [CHECKPOINT_PATH]
+>  # [CHECKPOINT_PATH] must be ABSOLUTE PATH
+>  # The log message would be saved to scripts/test/log
+>```
+### Infer
+>```bash
+>  bash scripts/run_infer.sh [IMAGE_PATH] [CHECKPOINT_PATH]
+>  # [CHECKPOINT_PATH] must be ABSOLUTE PATH
+>  # The log message would be saved to scripts/infer/log
+>```
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
diff --git a/examples/lib-samples/backend/mindspore/ResNet50/inference.py b/examples/lib-samples/backend/mindspore/ResNet50/inference.py
@@ -0,0 +1,44 @@
+import argparse
+import mindspore as ms
+from mindspore import Tensor
+import mindspore.dataset.vision.c_transforms as C
+import numpy as np
+from lib.sedna.backend import set_backend
+import cv2
+from interface import Estimator
+
+# Command-line options for single-image ResNet-50 inference.
+parser = argparse.ArgumentParser(description="resnet50 infer")
+# Image file to classify.
+parser.add_argument("--image_path", default="", type=str)
+# Execution device; only the two listed targets are accepted.
+parser.add_argument(
+    "--device_target", default="Ascend", type=str,
+    choices=("Ascend", "CPU"),
+    help="Device target, support Ascend, CPU")
+# Checkpoint path (parsed here, but not read by main() in this file).
+parser.add_argument("--checkpoint_path", type=str)
+
+
+def preprocess():
+    """Build the ordered transform list: resize, rescale, normalize, HWC->CHW."""
+    stats = ([0.4914, 0.4822, 0.4465], [0.2023, 0.1994, 0.2010])
+    steps = [C.Resize((224, 224)), C.Rescale(1.0 / 255.0, 0.0)]
+    steps += [C.Normalize(*stats), C.HWC2CHW()]
+    return steps
+
+
+def main():
+    """Classify the --image_path image and return the backend's prediction."""
+    img = cv2.imread(parser.parse_args().image_path)
+    if img is None:  # imread reports an unreadable file by returning None
+        raise ValueError("--image_path does not point to a readable image")
+    img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)  # imread yields BGR, not RGB
+    for method in preprocess():
+        img = method(img)
+    data = Tensor(np.expand_dims(img, 0), ms.float32)
+    return set_backend(estimator=Estimator).predict(data)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/examples/lib-samples/backend/mindspore/ResNet50/interface.py b/examples/lib-samples/backend/mindspore/ResNet50/interface.py
@@ -0,0 +1,268 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""train resnet."""
+import os
+import numpy as np
+from mindspore import context
+from mindspore import Tensor
+from mindspore.nn.optim.momentum import Momentum
+from mindspore.train.model import Model
+from mindspore.context import ParallelMode
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
+from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
+from mindspore.train.loss_scale_manager import FixedLossScaleManager
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
+from mindspore.communication.management import init, get_rank, get_group_size
+from mindspore.parallel import set_algo_parameters
+import mindspore.nn as nn
+import mindspore.common.initializer as weight_init
+from src.lr_generator import get_lr
+
+from src.resnet import resnet50 as resnet
+from src.config import config1 as config
+from src.dataset import create_dataset1 as create_dataset
+
+
+class Estimator:
+    """ResNet estimator offering train / evaluate / predict / load.
+
+    ``load()`` fills ``self.network``, which ``evaluate()`` and ``predict()``
+    then use; ``train()`` builds and trains a separate network of its own.
+    """
+
+    def __init__(self) -> None:
+        self.has_load = False  # flipped to True at the end of load()
+        self.network = None  # network built by load()
+        self.train_network = None  # not assigned anywhere else in this class
+
+    def train(self, train_data, **kwargs):
+        """Train a freshly built network on ``train_data``.
+
+        Args:
+            train_data: dataset path forwarded to ``create_dataset``.
+            **kwargs: must carry ``args_opt``, an object exposing
+                ``device_target``, ``run_distribute``, ``model_save_path``
+                and ``pre_trained`` (plus ``device_num`` or ``net`` when
+                running distributed).
+        """
+        args_opt = kwargs.get("args_opt")
+        target = args_opt.device_target
+        if target == "CPU":
+            args_opt.run_distribute = False  # CPU runs are never distributed
+
+        ckpt_save_dir = args_opt.model_save_path
+
+        # init context
+        if args_opt.run_distribute:
+            if target == "Ascend":
+                device_id = int(os.getenv('DEVICE_ID'))
+                context.set_context(
+                    device_id=device_id, enable_auto_mixed_precision=True)
+                context.set_auto_parallel_context(
+                    device_num=args_opt.device_num,
+                    parallel_mode=ParallelMode.DATA_PARALLEL,
+                    gradients_mean=True)
+                set_algo_parameters(elementwise_op_strategy_follow=True)
+                context.set_auto_parallel_context(
+                    all_reduce_fusion_config=[85, 160])
+                init()
+            # GPU target
+            else:
+                init()
+                context.set_auto_parallel_context(
+                    device_num=get_group_size(),
+                    parallel_mode=ParallelMode.DATA_PARALLEL,
+                    gradients_mean=True)
+                if args_opt.net == "resnet50":
+                    context.set_auto_parallel_context(
+                        all_reduce_fusion_config=[85, 160])
+            # One checkpoint directory per rank.  Every other save location
+            # in this method comes from ``model_save_path``, so fall back to
+            # it when ``args_opt`` has no ``save_checkpoint_path``.
+            ckpt_root = getattr(args_opt, "save_checkpoint_path", ckpt_save_dir)
+            ckpt_save_dir = os.path.join(ckpt_root, "ckpt_" + str(get_rank())) + "/"
+
+        # create dataset
+        dataset = create_dataset(
+            dataset_path=train_data, do_train=True, repeat_num=1,
+            batch_size=config.batch_size, target=target,
+            distribute=args_opt.run_distribute)
+        step_size = dataset.get_dataset_size()
+
+        # define net
+        net = resnet(class_num=config.class_num)
+
+        # init weight
+        if args_opt.pre_trained:
+            param_dict = load_checkpoint(args_opt.pre_trained)
+            load_param_into_net(net, param_dict)
+        else:
+            for _, cell in net.cells_and_names():
+                if isinstance(cell, nn.Conv2d):
+                    cell.weight.set_data(weight_init.initializer(
+                        weight_init.XavierUniform(),
+                        cell.weight.shape, cell.weight.dtype))
+                if isinstance(cell, nn.Dense):
+                    cell.weight.set_data(weight_init.initializer(
+                        weight_init.TruncatedNormal(),
+                        cell.weight.shape, cell.weight.dtype))
+
+        # init lr
+        lr = get_lr(
+            lr_init=config.lr_init, lr_end=config.lr_end, lr_max=config.lr_max,
+            warmup_epochs=config.warmup_epochs, total_epochs=config.epoch_size,
+            steps_per_epoch=step_size, lr_decay_mode=config.lr_decay_mode)
+        lr = Tensor(lr)
+
+        # define loss and optimizer
+        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
+        if target == "Ascend":
+            # Parameters whose name contains 'beta', 'gamma' or 'bias' go
+            # into a group that carries no explicit weight_decay entry.
+            decayed_params = []
+            no_decayed_params = []
+            for param in net.trainable_params():
+                if any(k in param.name for k in ('beta', 'gamma', 'bias')):
+                    no_decayed_params.append(param)
+                else:
+                    decayed_params.append(param)
+            group_params = [
+                {'params': decayed_params, 'weight_decay': config.weight_decay},
+                {'params': no_decayed_params},
+                {'order_params': net.trainable_params()}]
+            opt = Momentum(
+                group_params, lr, config.momentum,
+                loss_scale=config.loss_scale)
+        else:
+            # GPU and CPU target: one weight decay for every parameter that
+            # requires gradients; only the non-CPU case passes a loss scale.
+            trainable = filter(lambda x: x.requires_grad, net.get_parameters())
+            if target != "CPU":
+                opt = Momentum(
+                    trainable, lr, config.momentum,
+                    config.weight_decay, config.loss_scale)
+            else:
+                opt = Momentum(
+                    trainable, lr, config.momentum, config.weight_decay)
+
+        # define model
+        if target == "CPU":
+            # fp32 training
+            model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'})
+        else:
+            # Mixed precision with a fixed loss scale (Ascend and GPU)
+            loss_scale = FixedLossScaleManager(
+                config.loss_scale, drop_overflow_update=False)
+            model = Model(
+                net, loss_fn=loss, optimizer=opt,
+                loss_scale_manager=loss_scale, metrics={'acc'},
+                amp_level="O2", keep_batchnorm_fp32=False)
+
+        # define callbacks
+        time_cb = TimeMonitor(data_size=step_size)
+        loss_cb = LossMonitor()
+        cb = [time_cb, loss_cb]
+        if config.save_checkpoint:
+            config_ck = CheckpointConfig(
+                save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
+                keep_checkpoint_max=config.keep_checkpoint_max)
+            ckpt_cb = ModelCheckpoint(
+                prefix="resnet", directory=ckpt_save_dir, config=config_ck)
+            cb += [ckpt_cb]
+
+        # train model; dataset sinking is used on every target except CPU
+        model.train(
+            config.epoch_size - config.pretrain_epoch_size,
+            dataset,
+            callbacks=cb,
+            sink_size=dataset.get_dataset_size(),
+            dataset_sink_mode=target != "CPU")
+
+    def evaluate(self, valid_data, **kwargs):
+        """Evaluate the loaded network on ``valid_data`` and print the result.
+
+        Args:
+            valid_data: dataset path forwarded to ``create_dataset``.
+            **kwargs: must carry ``args_opt`` exposing ``device_target`` and
+                ``checkpoint_path`` (the latter is only printed).
+
+        Returns:
+            Whatever ``Model.eval`` returns for the two accuracy metrics.
+
+        Raises:
+            RuntimeError: if no network has been loaded yet.
+        """
+        if self.network is None:
+            raise RuntimeError("no network loaded; call load() first")
+        args_opt = kwargs.get("args_opt")
+        target = args_opt.device_target
+        # init context; an unset DEVICE_ID falls back to device 0
+        if target == "Ascend":
+            context.set_context(device_id=int(os.getenv('DEVICE_ID', '0')))
+
+        # create dataset
+        dataset = create_dataset(
+            dataset_path=valid_data, do_train=False,
+            batch_size=config.batch_size, target=target)
+
+        # define loss, model
+        loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
+        model = Model(
+            self.network,
+            loss_fn=loss,
+            metrics={'top_1_accuracy', 'top_5_accuracy'})
+
+        # eval model
+        res = model.eval(dataset)
+        print("result:", res, "ckpt=", args_opt.checkpoint_path)
+        return res
+
+    def predict(self, data, class_name=None):
+        """Predict the class of ``data`` (first output element only).
+
+        Args:
+            data: input forwarded unchanged to ``Model.predict``.
+            class_name: optional index-to-label lookup.
+
+        Returns:
+            ``class_name[index]`` for the top-scoring class, or the bare
+            integer index when ``class_name`` is None.
+
+        Raises:
+            RuntimeError: if no network has been loaded yet.
+        """
+        if self.network is None:
+            raise RuntimeError("no network loaded; call load() first")
+        # define model
+        model = Model(self.network)
+
+        # infer data
+        res = model.predict(data)
+        softmax = nn.Softmax()
+
+        # get label result; asnumpy() hands NumPy a real ndarray
+        index = int(np.argmax(softmax(res[0]).asnumpy()))
+        pred_class = index if class_name is None else class_name[index]
+        print("This image belongs to: ", pred_class)
+        return pred_class
+
+    def load(self, model_url):
+        """Build the network and restore its weights from ``model_url``."""
+        print("load model url: ", model_url)
+        self.network = resnet(class_num=config.class_num)
+        param_dict = load_checkpoint(model_url)
+        load_param_into_net(self.network, param_dict)
+        self.network.set_train(False)
+        self.has_load = True