kubeedge · chou-shun · Aug 23, 2021 · Aug 27, 2021 · Aug 27, 2021 · Jul 6, 2022
diff --git a/examples/lib-samples/backend/mindspore/ResNet50/README.md b/examples/lib-samples/backend/mindspore/ResNet50/README.md
@@ -1,70 +1,100 @@
 # Resnet Example with Mindspore Backend
-This document describes how to use the mindspore backend to train Resnet-50 network with the cifar-10 dataset
+This document describes how to use the mindspore backend to train Resnet-50 network with the cifar-10 dataset.
+
+## Script Description
+
+### Script and Sample Code
+```shell
+└──ResNet50
+  ├── README.md
+  ├── scripts
+    ├── run_eval.sh                        # launch ascend evaluation
+    ├── run_eval_cpu.sh                    # launch cpu evaluation
+    ├── run_infer.sh                       # launch cpu inference
+    ├── run_standalone_train.sh            # launch ascend standalone training
+    ├── run_standalone_train_cpu.sh        # launch cpu training
+  ├── src
+    ├── config.py                          # parameter configuration
+    ├── dataset.py                         # data preprocessing
+    ├── CrossEntropySmooth.py              # loss definition for ImageNet2012 dataset
+    ├── lr_generator.py                    # generate learning rate for each step
+    ├── resnet.py                          # resnet backbone, including resnet50 and resnet101 and se-resnet50
+  ├── inference.py                         # Entrance to inference
+  ├── interface.py                         # Implements class "Estimator"
+  ├── eval.py                              # Entrance to evaluation
+  ├── train.py                             # Entrance to training
+```
+
+## Script Parameters
+
+Parameters for both training and evaluation can be set in `src/config.py`.
+
+
+```python
+"class_num": 10,                  # dataset class num
+"batch_size": 32,                 # batch size of input tensor
+"loss_scale": 1024,               # loss scale
+"momentum": 0.9,                  # momentum
+"weight_decay": 1e-4,             # weight decay
+"epoch_size": 90,                 # only valid for training, which is always 1 for inference
+"pretrain_epoch_size": 0,         # epoch size that model has been trained before loading pretrained checkpoint, actual training epoch size is equal to epoch_size minus pretrain_epoch_size
+"save_checkpoint": True,          # whether to save checkpoints or not
+"save_checkpoint_epochs": 5,      # the epoch interval between two checkpoints. By default, the last checkpoint will be saved after the last step
+"keep_checkpoint_max": 10,        # only keep the last keep_checkpoint_max checkpoints
+"warmup_epochs": 5,               # number of warmup epochs
+"lr_decay_mode": "poly",          # decay mode can be selected from steps, poly and default
+"lr_init": 0.01,                  # initial learning rate
+"lr_end": 0.00001,                # final learning rate
+"lr_max": 0.1,                    # maximum learning rate
+```
 
 ## Preparatory Stage
 ### Prepare Dataset
-In this example, We need to prepare the cifar10 dataset in advance, and put it into `/home/sedna/examples/backend/mindspore/resnet/`
+In this example, we need to prepare the cifar10 dataset in advance, and put it into `/home/sedna/examples/lib-samples/backend/mindspore/ResNet50/`.
 ```bash
-cd /home/sedna/examples/backend/mindspore/resnet
+cd /home/sedna/examples/lib-samples/backend/mindspore/ResNet50
 wget http://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz
 tar -zxvf cifar-10-binary.tar.gz
 ```
 ### Parameters
-you can change the parameters of the model in `src/config.py`
+You can change the parameters of the model in `src/config.py`.
 
 ## Modeling Stage
-This example support CPU and NPU, you can follow these steps for training, testing and inference
-### Train
-> * CPU
->```bash
->  bash scripts/run_standalone_train_cpu.sh [DATASET_PATH] [MODEL_SAVE_PATH]
->  # model_save_path must be ABSOLUTE PATH
->  # The log message would be showed in the terminal
->  # The ckpt file would be saved in [MODEL_SAVE_PATH]
->```
-> * NPU
->```bash
->  bash scripts/run_standalone_train.sh [DATASET_PATH] [MODEL_SAVE_PATH]
->  # [MODEL_SAVE_PATH] must be ABSOLUTE PATH
->  # The log message would be saved to scripts/train/log
->  # The ckpt file would be saved in [MODEL_SAVE_PATH]
->```
-###Test
-> * CPU
->```bash
->  bash scripts/run_test_cpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
->  # [CHECKPOINT_PATH] must be ABSOLUTE PATH
->  # The log message would be saved to scripts/test/log
->```
-> * NPU
->```bash
->  bash scripts/run_test.sh [DATASET_PATH] [CHECKPOINT_PATH]
->  # [CHECKPOINT_PATH] must be ABSOLUTE PATH
->  # The log message would be saved to scripts/test/log
->```
-###Infer
->```bash
->  bash scripts/run_infer.sh [IMAGE_PATH] [CHECKPOINT_PATH]
->  # [CHECKPOINT_PATH] must be ABSOLUTE PATH
->  # The log message would be saved to scripts/infer/log
->```
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+This example supports CPU and NPU. You can follow these steps for training, evaluation and inference.
 
+### Training
+* #### Running on CPU
+```bash
+ bash scripts/run_standalone_train_cpu.sh [DATASET_PATH] [MODEL_SAVE_PATH]
+ # [MODEL_SAVE_PATH] must be ABSOLUTE PATH
+ # The log message will be shown in the terminal
+ # The ckpt file would be saved in [MODEL_SAVE_PATH]
+```
+* #### Running on NPU
+```bash
+ bash scripts/run_standalone_train.sh [DATASET_PATH] [MODEL_SAVE_PATH]
+ # [MODEL_SAVE_PATH] must be ABSOLUTE PATH
+ # The log message would be saved to scripts/train/log
+ # The ckpt file would be saved in [MODEL_SAVE_PATH]
+```
 
+### Evaluation
+* #### Running on CPU
+```bash 
+ bash scripts/run_eval_cpu.sh [DATASET_PATH] [CHECKPOINT_PATH]
+ # [CHECKPOINT_PATH] must be ABSOLUTE PATH
+ # The log message would be saved to scripts/eval/log
+```
+* #### Running on NPU
+```bash
+ bash scripts/run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]
+ # [CHECKPOINT_PATH] must be ABSOLUTE PATH
+ # The log message would be saved to scripts/eval/log
+```
 
+### Inference
+```bash
+ bash scripts/run_infer.sh [IMAGE_PATH] [CHECKPOINT_PATH]
+ # [CHECKPOINT_PATH] must be ABSOLUTE PATH
+ # The log message would be saved to scripts/infer/log
+```
diff --git a/...amples/backend/mindspore/ResNet50/test.py → ...amples/backend/mindspore/ResNet50/eval.py b/...amples/backend/mindspore/ResNet50/test.py → ...amples/backend/mindspore/ResNet50/eval.py
@@ -15,7 +15,7 @@
 """train resnet."""
 import argparse
 from mindspore.common import set_seed
-from lib.sedna.backend import set_backend
+from sedna.backend import set_backend
 from interface import Estimator
 
 parser = argparse.ArgumentParser(description='Image classification')

diff --git a/examples/lib-samples/backend/mindspore/ResNet50/inference.py b/examples/lib-samples/backend/mindspore/ResNet50/inference.py
@@ -3,8 +3,8 @@
 from mindspore import Tensor
 import mindspore.dataset.vision.c_transforms as C
 import numpy as np
-from lib.sedna.backend import set_backend
 import cv2
+from sedna.backend import set_backend
 from interface import Estimator
 
 parser = argparse.ArgumentParser(description="resnet50 infer")
@@ -30,12 +30,15 @@ def preprocess():
 
 def main():
     args = parser.parse_args()
+
+    # read image and preprocess
     img = cv2.imread(args.image_path)
     data_preprocess = preprocess()
     for method in data_preprocess:
         img = method(img)
     img = np.expand_dims(img, 0)
     data = Tensor(img, ms.float32)
+
     model = set_backend(estimator=Estimator)
     return model.predict(data)
 

diff --git a/examples/lib-samples/backend/mindspore/ResNet50/interface.py b/examples/lib-samples/backend/mindspore/ResNet50/interface.py
@@ -39,17 +39,27 @@ class Estimator:
     def __init__(self) -> None:
         self.has_load = False
         self.network = None
-        self.train_network = None
 
     def train(self, train_data, **kwargs):
+        """The whole process of model training
+
+        The training process of the resnet model. At present, it supports single NPU and CPU.
+        Multi-GPU and multi-NPU will be supported in the future.
+
+        Args:
+            train_data: training dataset path
+            kwargs: Including args_opt and other parameters. args_opt is passed by train.py,
+                    includes some key parameters
+
+        """
         args_opt = kwargs.get("args_opt")
         target = args_opt.device_target
         if target == "CPU":
             args_opt.run_distribute = False
 
         ckpt_save_dir = args_opt.model_save_path
 
-        # init context
+        # Multi-GPU/Multi-NPU
         if args_opt.run_distribute:
             if target == "Ascend":
                 device_id = int(os.getenv('DEVICE_ID'))
@@ -137,6 +147,7 @@ def train(self, train_data, **kwargs):
             lr,
             config.momentum,
             loss_scale=config.loss_scale)
+
         # define loss, model
         if target == "Ascend":
             loss = SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
@@ -213,9 +224,20 @@ def train(self, train_data, **kwargs):
             dataset_sink_mode=dataset_sink_mode)
 
     def evaluate(self, valid_data, **kwargs):
+        """The whole process of model evaluation.
+
+        The evaluation process of the resnet model. At present, it supports single NPU and CPU.
+        GPU will be supported in the future.
+
+        Args:
+            valid_data: evaluation dataset path.
+            kwargs: Including args_opt and other parameters. args_opt is passed by eval.py,
+                    includes some key parameters.
+
+        """
+
         args_opt = kwargs.get("args_opt")
         target = args_opt.device_target
-        # init context
         if target == "Ascend":
             device_id = int(os.getenv('DEVICE_ID'))
             context.set_context(device_id=device_id)
@@ -245,21 +267,50 @@ def evaluate(self, valid_data, **kwargs):
         res = model.eval(dataset)
         print("result:", res, "ckpt=", args_opt.checkpoint_path)
 
-    def predict(self, data, class_name):
+    def predict(self, data):
+        """Inference for the image data
+
+        Infer the image data and output its category
+
+        Args:
+            data: image to be inferred
+        """
+
+        class_name = [
+            'airplane',
+            "automobile",
+            "bird",
+            "cat",
+            "deer",
+            "dog",
+            "frog",
+            "horse",
+            "ship",
+            "truck"]
 
         # define model
         model = Model(self.network)
 
         # infer data
         res = model.predict(data)
-        softmax = nn.Softmax()
 
+        # The output of the model is the score of each category, which needs to be softmax.
+        softmax = nn.Softmax()
         # get label result
         pred_class = class_name[np.argmax(softmax(res[0]))]
+
         print("This image belongs to: ", pred_class)
         return pred_class
 
     def load(self, model_url):
+        """load checkpoint into model
+
+        Initialize resnet model, and load the specified model file for evaluation and inference
+
+        Args:
+            model_url: Url of model file
+        """
+
         print("load model url: ", model_url)
         self.network = resnet(class_num=config.class_num)
         param_dict = load_checkpoint(model_url)

diff --git a/...nd/mindspore/ResNet50/scripts/run_test.sh → ...nd/mindspore/ResNet50/scripts/run_eval.sh b/...nd/mindspore/ResNet50/scripts/run_test.sh → ...nd/mindspore/ResNet50/scripts/run_eval.sh
@@ -16,7 +16,7 @@
 
 if [ $# != 2 ]
 then 
-    echo "Usage: sh run_test.sh [DATASET_PATH] [CHECKPOINT_PATH]"
+    echo "Usage: sh run_eval.sh [DATASET_PATH] [CHECKPOINT_PATH]"
 exit 1
 fi
 
@@ -56,16 +56,16 @@ export DEVICE_ID=0
 export RANK_SIZE=$DEVICE_NUM
 export RANK_ID=0
 
-if [ -d "test" ];
+if [ -d "eval" ];
 then
-    rm -rf ./test
+    rm -rf ./eval
 fi
-mkdir ./test
-cp ../*.py ./test
-cp *.sh ./test
-cp -r ../src ./test
-cd ./test || exit
+mkdir ./eval
+cp ../*.py ./eval
+cp *.sh ./eval
+cp -r ../src ./eval
+cd ./eval || exit
 env > env.log
-echo "start test for device $DEVICE_ID"
-python test.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
+echo "start evaluation for device $DEVICE_ID"
+python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
 cd ..
diff --git a/...indspore/ResNet50/scripts/run_test_cpu.sh → ...indspore/ResNet50/scripts/run_eval_cpu.sh b/...indspore/ResNet50/scripts/run_test_cpu.sh → ...indspore/ResNet50/scripts/run_eval_cpu.sh
@@ -16,7 +16,7 @@
 
 if [ $# != 2 ]
 then
-    echo "Usage: sh run_test_cpu.sh [DATASET_PATH] [CHECKPOINT_PATH]"
+    echo "Usage: sh run_eval_cpu.sh [DATASET_PATH] [CHECKPOINT_PATH]"
 exit 1
 fi
 
@@ -30,6 +30,7 @@ get_real_path(){
 
 PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
+
 export BACKEND_TYPE="MINDSPORE"
 export DEVICE_CATEGORY="CPU"
 
@@ -51,17 +52,17 @@ ckpt=`basename $PATH2`
 export MODEL_URL=$dirPath
 export MODEL_NAME=$ckpt
 
-if [ -d "test" ];
+if [ -d "eval" ];
 then
-    rm -rf ./test
+    rm -rf ./eval
 fi
-mkdir ./test
-cp ../*.py ./test
-cp *.sh ./test
-cp -r ../src ./test
-cp -r ../src ./test
-cd ./test || exit
+mkdir ./eval
+cp ../*.py ./eval
+cp *.sh ./eval
+cp -r ../src ./eval
+cp -r ../src ./eval
+cd ./eval || exit
 env > env.log
-echo "start test for CPU"
-python test.py --device_target="CPU" --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
+echo "start evaluation for CPU"
+python eval.py --device_target="CPU" --dataset_path=$PATH1 --checkpoint_path=$PATH2 &> log &
 cd ..
diff --git a/examples/lib-samples/backend/mindspore/ResNet50/src/config.py b/examples/lib-samples/backend/mindspore/ResNet50/src/config.py
@@ -29,7 +29,6 @@
     "save_checkpoint": True,
     "save_checkpoint_epochs": 5,
     "keep_checkpoint_max": 10,
-    "save_checkpoint_path": "./",
     "warmup_epochs": 5,
     "lr_decay_mode": "poly",
     "lr_init": 0.01,