Add a backdoor attack and a label inference attack against VFL #5

Open
wants to merge 1 commit into base: master
93 changes: 93 additions & 0 deletions examples/community/vfl_attacks/README.md
@@ -0,0 +1,93 @@
# Attacks Against Vertical Federated Learning

This project implements a backdoor attack and a label inference attack against vertical federated learning (VFL) models, based on the MindSpore framework.

## Reference Papers

T. Zou, Y. Liu, Y. Kang, W. Liu, Y. He, Z. Yi, Q. Yang, and Y.-Q. Zhang, “Defending batch-level label inference and replacement attacks in vertical federated learning,” IEEE Transactions on Big Data, pp. 1–12, 2022. [PDF](https://www.computer.org/csdl/journal/bd/5555/01/09833321/1F8uKhxrvNe)

C. Fu, X. Zhang, S. Ji, et al., “Label inference attacks against vertical federated learning,” in 31st USENIX Security Symposium (USENIX Security 22), 2022, pp. 1397–1414. [PDF](https://www.usenix.org/conference/usenixsecurity22/presentation/fu-chong)

## Requirements

MindSpore >= 1.9

## Script Description

```text
│  README.md
│  the_example.py                  // application example
├─examples                         // examples
│  ├─common
│  │  │  constants.py              // user-defined constants
│  │
│  ├─datasets
│  │  │  cifar_dataset.py          // user dataset loading
│  │  │  functions.py
│  │
│  └─model
│  │  init_active_model.py         // user top model loading
│  │  init_passive_model.py        // user bottom model loading
│  │  resnet.py                    // user-defined model architectures
│  │  resnet_cifar.py
│  │  top_model_fcn.py
│  │  vgg.py
│  │  vgg_cifar.py
├─utils                            // VFL functionality and the two attack algorithms
│  ├─config
│  │  │  args_process.py           // validate and process user arguments
│  │  │  config.yaml               // default configuration file
│  │
│  ├─datasets                      // VFL dataset loading
│  │  │  common.py
│  │  │  image_dataset.py
│  │
│  ├─methods
│  │  ├─direct_attack
│  │  │  │  direct_attack_passive_party.py  // attacker object for the direct label inference attack
│  │  │  │  direct_attack_vfl.py            // VFL object for the direct label inference attack
│  │  │
│  │  └─g_r
│  │  │  g_r_passive_party.py               // attacker object for the gradient-replacement backdoor attack
│  │
│  ├─model
│  │  │  base_model.py             // base class for models in VFL
│  │
│  ├─party
│  │  │  active_party.py           // active party object
│  │  │  passive_party.py          // passive party object
│  │
│  └─vfl
│  │  init_vfl.py                  // initialize the participating parties
│  │  vfl.py                       // VFL object, including the process functions
```

## Importing the Packages

```Python
from utils.vfl.init_vfl import Init_Vfl
from utils.vfl.vfl import VFL
from utils.methods.direct_attack.direct_attack_vfl import DirectVFL
from utils.config.args_process import argsments_function
```
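
As a rough sketch of how these pieces fit together (the helper `load_user_config`, the constructor arguments, and the method names below are assumptions for illustration, not the verified API; see the_example.py for the actual entry point):

```Python
# Hypothetical end-to-end flow; signatures are assumed, not verified.
raw_args = load_user_config()            # e.g. parsed from utils/config/config.yaml
args = argsments_function(raw_args)      # validate and normalize the arguments

init_vfl = Init_Vfl(args)                # build the active and passive parties
vfl = VFL(init_vfl, args)                # normal VFL training object
vfl.train()                              # train the main task

direct_vfl = DirectVFL(init_vfl, args)   # VFL object that runs the direct attack
direct_vfl.train()                       # train while inferring labels
```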

## Init_Vfl Overview

This module initializes the participants in vertical federated learning (VFL), including their models, parameters, and classes. The active party is instantiated as VFLActiveModel, a normal passive party as VFLPassiveModel, the attacker in the gradient-replacement backdoor attack as GRPassiveModel, and the attacker in the direct label inference attack as DirectAttackPassiveModel, as sketched below.
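
Conceptually, this party construction amounts to a dispatch of the following shape (an illustrative sketch; the constructor signatures and the `label_inference` key are assumptions, not this module's verified code):

```Python
# Illustrative dispatch only; constructors and the 'label_inference' key are assumed.
def build_passive_party(args, bottom_model):
    if args['backdoor'] == 'g_r':                    # gradient-replacement backdoor
        return GRPassiveModel(bottom_model)
    if args['label_inference'] == 'direct':          # direct label inference attack
        return DirectAttackPassiveModel(bottom_model)
    return VFLPassiveModel(bottom_model)             # benign passive party
```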

## VFL Overview

This module defines the process functions of VFL, including training, prediction, and parameter updates.
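
For orientation, a single VFL training step typically has the shape below (a conceptual sketch, not this module's exact API; `forward`, `compute_and_backward`, and `backward` are placeholder method names):

```Python
# Conceptual VFL training step; method names are placeholders.
def train_step(active_party, passive_parties, x_slices, labels):
    # Each party runs its bottom model on its own vertical feature slice.
    embeddings = [p.forward(x) for p, x in zip(passive_parties, x_slices)]
    # The active party aggregates the embeddings, runs the top model, computes
    # the loss on the labels it owns, and returns per-party embedding gradients.
    loss, embedding_grads = active_party.compute_and_backward(embeddings, labels)
    # Each passive party backpropagates through its bottom model using the
    # gradient of the loss w.r.t. its embedding.
    for p, g in zip(passive_parties, embedding_grads):
        p.backward(g)
    return loss
```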

## DirectVFL Overview

This module implements the direct label inference attack; building on the VFL class, it defines the process functions required by the attack.
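
The attack's core observation (Fu et al., USENIX Security 22) is that in VFL without model splitting, the per-sample gradient the adversary receives w.r.t. its output logits under softmax cross-entropy, dL/dz_i = p_i - y_i, is negative only at the true-label position. A minimal sketch of that inference rule (the function name is hypothetical):

```Python
import numpy as np

def infer_labels_from_logit_grads(logit_grads):
    """Recover labels from received logit gradients (sketch).

    logit_grads: (batch, num_classes) gradients the passive party receives
    w.r.t. its logits. With softmax cross-entropy, dL/dz_i = p_i - y_i is
    negative only at the true class, so argmin recovers the label.
    """
    return np.argmin(logit_grads, axis=1)
```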

## argsments_function Overview

This function accepts and validates the user's arguments, then wraps them into the format expected by the classes in utils. Note that the gradient-replacement attack targets the VFL-with-model-splitting scenario, while the direct label inference attack applies only to the VFL-without-model-splitting scenario.
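
For reference, the configuration keys consumed by the CIFAR loader in examples/datasets/cifar_dataset.py have the following shape (an illustrative partial dict; the values are placeholders, not the project defaults, which live in utils/config/config.yaml):

```Python
# Illustrative subset of the processed args; values are placeholders.
args = {
    'dataset': 'cifar10',        # 'cifar10' or 'cifar100'
    'target_train_size': -1,     # -1 loads the full train split
    'target_test_size': -1,      # -1 loads the full test split
    'target_batch_size': 64,
    'half': 16,                  # forwarded to generate_dataloader; assumed to control the vertical feature split
    'backdoor': 'g_r',           # 'no' disables the backdoor loaders
    'backdoor_label': 0,
    'backdoor_train_size': 500,
    'backdoor_test_size': 500,
    'trigger': None,             # trigger pattern forwarded to the backdoor test loader
    'trigger_add': False,        # forwarded alongside the trigger
}
```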

## Extension

This project currently supports the CIFAR-10 and BHI datasets. Loading code for CIFAR-10 is provided in the examples/datasets folder; support for BHI can be added by following that code. To customize the model architecture or the dataset loading, refer to and modify the corresponding files in the examples folder, following the sketch below.
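
A custom loader only needs to reproduce the return contract of get_cifar_dataloader. The skeleton below uses synthetic arrays as stand-ins and omits the backdoor-related return values (the function name, shapes, and transform are hypothetical):

```Python
import numpy as np
import mindspore as ms
from mindspore.dataset import vision
from utils.datasets.common import generate_dataloader

def get_custom_dataloader(args):
    # Replace the synthetic arrays with real loading code (e.g. for BHI).
    x_train = np.random.rand(100, 3, 32, 32).astype(np.float32)
    y_train = np.random.randint(0, 2, size=100)
    x_test = np.random.rand(20, 3, 32, 32).astype(np.float32)
    y_test = np.random.randint(0, 2, size=20)

    transform = ms.dataset.transforms.Compose([vision.ToTensor()])
    batch_size = args['target_batch_size']
    train_dl = generate_dataloader((x_train, y_train), batch_size, transform,
                                   shuffle=True, half=args['half'])
    test_dl = generate_dataloader((x_test, y_test), batch_size, transform,
                                  shuffle=False, half=args['half'])
    # Backdoor loaders and indices are omitted in this minimal sketch.
    return train_dl, test_dl, None, None, None
```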
21 changes: 21 additions & 0 deletions examples/community/vfl_attacks/examples/common/constants.py
@@ -0,0 +1,21 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
This module defines constant variables.
"""

checkpoint_path = './output_logs'
output_path = './output_logs/'
data_path = '../data/'
197 changes: 197 additions & 0 deletions examples/community/vfl_attacks/examples/datasets/cifar_dataset.py
@@ -0,0 +1,197 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Load the dataset and construct the dataloader.

This module provides functions to create data loaders for the CIFAR-10 and CIFAR-100 datasets,
including train data loader, test data loader, and backdoor test data loader.
"""
import os
import pickle
import numpy as np
import mindspore as ms
from mindspore.dataset import vision
from utils.datasets.common import generate_dataloader
from examples.datasets.functions import get_random_indices, get_target_indices
from examples.common.constants import data_path

# Transform for CIFAR train dataset.
train_transform = ms.dataset.transforms.Compose([
vision.ToTensor()
])

# Transform for CIFAR test dataset.
test_transform = ms.dataset.transforms.Compose([
vision.ToTensor()
])

def _get_labeled_data_with_2_party(data_dir, dataset, dtype="train", num_samples=None):
"""
Read data from a local file.

Args:
data_dir (str): Directory path of the local file.
dataset (str): Dataset name, supported values are 'cifar10' and 'cifar100'.
dtype (str): Type of data to read, either 'train' or 'test'.
num_samples (int, optional): Number of samples to randomly select. If None, all samples are used.

Returns:
tuple: A tuple containing the data X and the labels Y.
"""
if dataset == 'cifar10':
data_dir = data_dir + 'cifar-10-batches-py/'
train_list = [
'data_batch_1',
'data_batch_2',
'data_batch_3',
'data_batch_4',
'data_batch_5']
test_list = ['test_batch']
all_data = []
targets = []
downloaded_list = train_list if dtype == 'train' else test_list
for file_name in downloaded_list:
file_path = os.path.join(data_dir, file_name)
with open(file_path, 'rb') as f:
entry = pickle.load(f, encoding='latin1')
all_data.append(entry['data'])
if 'labels' in entry:
targets.extend(entry['labels'])
else:
targets.extend(entry['fine_labels'])
all_data = np.vstack(all_data).reshape(-1, 3, 32, 32)
targets = np.array(targets)
if num_samples is not None:
indices = get_random_indices(num_samples, len(all_data))
datas, labels = all_data[indices], targets[indices]
else:
datas, labels = all_data, targets
else:
filename = data_dir + 'cifar-100-python/' + dtype
with open(filename, 'rb') as f:
datadict = pickle.load(f, encoding='latin1')
x = datadict['data']
all_data = x.reshape(-1, 3, 32, 32)
targets = datadict['fine_labels']
targets = np.array(targets)
if num_samples is not None:
indices = get_random_indices(num_samples, len(all_data))
datas, labels = all_data[indices], targets[indices]
else:
datas, labels = all_data, targets

return datas, labels


def _load_two_party_data(data_dir, args):
"""
Get data from a local dataset, supporting only two parties.

Args:
data_dir (str): Path of the local dataset.
args (dict): Configuration.

Returns:
tuple: A tuple containing the following data:
X_train: Normal train features.
y_train: Normal train labels.
X_test: Normal test features.
y_test: Normal test labels.
backdoor_X_test: Backdoor test features.
backdoor_y_test: Backdoor test labels.
backdoor_indices_train: Indices of backdoor samples in the normal train dataset.
backdoor_target_indices: Indices of samples with the backdoor label in the normal train dataset, used by Gradient-Replacement.
"""
print("# load_two_party_data")
n_train = args['target_train_size']
n_test = args['target_test_size']
if n_train == -1:
n_train = None
if n_test == -1:
n_test = None

x_train, y_train = _get_labeled_data_with_2_party(data_dir=data_dir,
dataset=args['dataset'],
dtype='train',
num_samples=n_train)

x_test, y_test = _get_labeled_data_with_2_party(data_dir=data_dir,
dataset=args['dataset'],
dtype='test',
num_samples=n_test)

# Randomly select samples of other classes from normal train dataset as backdoor samples.
train_indices = np.where(y_train != args['backdoor_label'])[0]
backdoor_indices_train = np.random.choice(train_indices, args['backdoor_train_size'], replace=False)

# Randomly select samples of other classes from normal test dataset to generate backdoor test dataset.
test_indices = np.where(y_test != args['backdoor_label'])[0]
backdoor_indices_test = np.random.choice(test_indices, args['backdoor_test_size'], replace=False)
backdoor_x_test, backdoor_y_test = x_test[backdoor_indices_test], \
y_test[backdoor_indices_test]
backdoor_y_test = np.full_like(backdoor_y_test, args['backdoor_label'])

# Randomly select samples of backdoor label in normal train dataset, for gradient-replacement.
backdoor_target_indices = get_target_indices(y_train, args['backdoor_label'], args['backdoor_train_size'])

print(f"y_train.shape: {y_train.shape}")
print(f"y_test.shape: {y_test.shape}")
print(f"backdoor_y_test.shape: {backdoor_y_test.shape}")

return x_train, y_train, x_test, y_test, backdoor_x_test, backdoor_y_test, \
backdoor_indices_train, backdoor_target_indices


def get_cifar_dataloader(args):
"""
Generate loaders for the CIFAR dataset, supporting CIFAR-10 and CIFAR-100.

Args:
args (dict): Configuration.

Returns:
tuple: A tuple containing the following data loaders:
train_dl: Loader for the normal train dataset.
test_dl: Loader for the normal test dataset.
backdoor_test_dl: Loader for the backdoor test dataset, containing only backdoor samples,
used for ASR evaluation.
backdoor_indices: Indices of backdoor samples in the normal train dataset.
backdoor_target_indices: Indices of backdoor labels in the normal train dataset,
used by Gradient-Replacement.
"""
result = _load_two_party_data(data_path, args)
x_train, y_train, x_test, y_test, backdoor_x_test, backdoor_y_test, \
backdoor_indices, backdoor_target_indices = result

batch_size = args['target_batch_size']
# Get loader of normal train dataset, used by normal training.
train_dl = generate_dataloader((x_train, y_train), batch_size, train_transform, shuffle=True, half=args['half'])
# Get loader of normal test dataset, used to evaluate main task accuracy.
test_dl = generate_dataloader((x_test, y_test), batch_size, test_transform, shuffle=False, half=args['half'])

backdoor_test_dl = None
if args['backdoor'] != 'no':
# Get loader of backdoor test dataset, used to evaluate backdoor task accuracy.
backdoor_test_dl = generate_dataloader((backdoor_x_test, backdoor_y_test), batch_size, test_transform,
shuffle=False,
backdoor_indices=np.arange(args['backdoor_test_size']),
trigger=args['trigger'], trigger_add=args['trigger_add'], half=args['half'])

if args['backdoor'] == 'g_r':
# Get loader of train dataset used by Gradient-Replacement, containing backdoor features and normal labels.
train_dl = generate_dataloader((x_train, y_train), batch_size, train_transform,
shuffle=True,
backdoor_indices=backdoor_indices, half=args['half'])

return train_dl, test_dl, backdoor_test_dl, backdoor_indices, backdoor_target_indices
56 changes: 56 additions & 0 deletions examples/community/vfl_attacks/examples/datasets/functions.py
@@ -0,0 +1,56 @@
# Copyright 2024 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
This module provides functions to select samples from a dataset.

Functions:
get_target_indices chooses the samples of the specified category.
get_random_indices randomly selects samples.
"""
import numpy as np

def get_target_indices(labels, target_label, size, backdoor_indices=None):
"""
Get at most `size` indices of samples with the target label, excluding any indices listed in `backdoor_indices`.

Args:
labels (ndarray): Array of labels in the dataset.
target_label (int): The target label to filter.
size (int): The number of indices to return.
backdoor_indices (ndarray, optional): Indices to exclude from the result, e.g. samples already selected as backdoor samples.

Returns:
ndarray: An array of indices with the specified size of the target label.
"""
indices = np.where(labels == target_label)[0]
indices = np.setdiff1d(indices, backdoor_indices)
np.random.shuffle(indices)
result = indices[:size]
return result


def get_random_indices(target_length, all_length):
"""
Generate random indices.

Args:
target_length (int): The length of the target indices to generate.
all_length (int): The total length of all indices available.

Returns:
ndarray: An array of random indices.
"""
all_indices = np.arange(all_length)
indices = np.random.choice(all_indices, target_length, replace=False)
return indices