Add protect method for feature order in fl-xgb #497

Merged · 24 commits · Jan 31, 2023
Changes from 20 commits
1 change: 0 additions & 1 deletion federatedscope/autotune/baseline/fedhpo_vfl.yaml
@@ -36,7 +36,6 @@ vertical:
key_size: 256
dims: [7, 14]
algo: 'xgb'
xgb_use_bin: False
eval:
freq: 5
best_res_update_round_wise_key: test_loss
11 changes: 8 additions & 3 deletions federatedscope/core/auxiliaries/trainer_builder.py
@@ -28,7 +28,6 @@
"mftrainer": "MFTrainer",
"cltrainer": "CLTrainer",
"lptrainer": "LPTrainer",
"verticaltrainer": "VerticalTrainer",
"atc_trainer": "ATCTrainer",
}

@@ -135,8 +134,6 @@ def get_trainer(model=None,
dict_path = "federatedscope.cv.trainer.trainer"
elif config.trainer.type.lower() in ['nlptrainer']:
dict_path = "federatedscope.nlp.trainer.trainer"
elif config.trainer.type.lower() in ['verticaltrainer']:
dict_path = "federatedscope.vertical_fl.trainer.trainer"
elif config.trainer.type.lower() in ['cltrainer', 'lptrainer']:
dict_path = "federatedscope.cl.trainer.trainer"
elif config.trainer.type.lower() in [
@@ -171,6 +168,14 @@
config=config,
only_for_eval=only_for_eval,
monitor=monitor)
elif config.trainer.type.lower() in ['verticaltrainer']:
from federatedscope.vertical_fl.trainer.utils import \
get_vertical_trainer
trainer = get_vertical_trainer(config=config,
model=model,
data=data,
device=device,
monitor=monitor)
else:
# try to find user registered trainer
trainer = None
4 changes: 3 additions & 1 deletion federatedscope/core/configs/cfg_fl_setting.py
@@ -77,7 +77,9 @@ def extend_fl_setting_cfg(cfg):
cfg.vertical.encryption = 'paillier'
cfg.vertical.key_size = 3072
cfg.vertical.algo = 'lr' # ['lr', 'xgb']
cfg.vertical.xgb_use_bin = False
cfg.vertical.protect_object = ''
cfg.vertical.protect_method = ''
cfg.vertical.protect_args = []

# --------------- register corresponding check function ----------
cfg.register_cfg_check_fun(assert_fl_setting_cfg)
3 changes: 3 additions & 0 deletions federatedscope/vertical_fl/trainer/__init__.py
@@ -0,0 +1,3 @@
from federatedscope.vertical_fl.trainer.trainer import VerticalTrainer
from federatedscope.vertical_fl.trainer.feature_order_protected_trainer \
import FeatureOrderProtectedTrainer
75 changes: 75 additions & 0 deletions federatedscope/vertical_fl/trainer/feature_order_protected_trainer.py
@@ -0,0 +1,75 @@
import numpy as np
from federatedscope.vertical_fl.trainer.trainer import VerticalTrainer


class FeatureOrderProtectedTrainer(VerticalTrainer):
def __init__(self, model, data, device, config, monitor):
super(FeatureOrderProtectedTrainer,
self).__init__(model, data, device, config, monitor)

assert config.vertical.protect_method != '', \
"Please specify the adopted method for protecting feature order"
args = config.vertical.protect_args[0] if len(
config.vertical.protect_args) > 0 else {}

if config.vertical.protect_method == 'use_bins':
self.bin_num = args.get('bin_num', 100)
self.share_bin = args.get('share_bin', False)
self.protect_funcs = self._protect_via_bins
self.split_value = None
else:
raise ValueError(f"The method {args['method']} is not provided")

def get_feature_value(self, feature_idx, value_idx):
assert self.split_value is not None

return self.split_value[feature_idx][value_idx]

def _protect_via_bins(self, raw_feature_order, data):
protected_feature_order = list()
bin_size = int(np.ceil(self.cfg.dataloader.batch_size / self.bin_num))
split_position = [[] for _ in range(len(raw_feature_order))
] if self.share_bin else None
self.split_value = [dict() for _ in range(len(raw_feature_order))]
for i in range(len(raw_feature_order)):
_protected_feature_order = list()
for j in range(self.bin_num):
idx_start = j * bin_size
idx_end = min((j + 1) * bin_size, len(raw_feature_order[i]))
feature_order_frame = raw_feature_order[i][idx_start:idx_end]
np.random.shuffle(feature_order_frame)
_protected_feature_order.append(feature_order_frame)
if self.share_bin:
if j != self.bin_num - 1:
split_position[i].append(idx_end)
min_value = min(data[feature_order_frame][:, i])
max_value = max(data[feature_order_frame][:, i])
if j == 0:
self.split_value[i][idx_end] = max_value
Review comment (Collaborator): The value on the right-hand side should be divided by 2.0.

elif j == self.bin_num - 1:
self.split_value[i][idx_start] += min_value / 2.0
else:
self.split_value[i][idx_start] += min_value / 2.0
self.split_value[i][idx_end] = max_value / 2.0
else:
mean_value = np.mean(data[feature_order_frame][:, i])
for x in range(idx_start, idx_end):
self.split_value[i][x] = mean_value
protected_feature_order.append(
np.concatenate(_protected_feature_order))

extra_info = None
if split_position is not None:
extra_info = {'split_position': split_position}

return {
'feature_order': protected_feature_order,
'extra_info': extra_info
}

def _get_feature_order_info(self, data):
Review comment (Collaborator): For convenience, I also protected the label owner's feature order here. Actually, the label owner does not need to do this.

Reply (Collaborator, Author): Since it needs some more effort to fix this issue, such as modifying the split position accordingly, we can add a TODO item here and fix it later.

num_of_feature = data.shape[1]
feature_order = [0] * num_of_feature
for i in range(num_of_feature):
feature_order[i] = data[:, i].argsort()
return self.protect_funcs(feature_order, data)
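To make the binning scheme concrete, below is a small self-contained sketch (not the trainer code itself) of the share_bin-style protection for a single feature: the sorted order is only revealed up to a shuffle within each bin, and each bin boundary is assigned the midpoint of the two raw values adjacent to it, which is why each side contributes its edge value divided by 2.0 in the code above. All inputs here are made-up examples.

import numpy as np


def protect_feature_order(values, bin_num=4, seed=0):
    # Sort once, then shuffle the order indices inside each bin, so only the
    # bin membership of an instance (not its exact rank) is revealed.
    rng = np.random.default_rng(seed)
    order = np.argsort(values)
    bin_size = int(np.ceil(len(order) / bin_num))
    protected, split_value = [], {}
    for j in range(bin_num):
        start, end = j * bin_size, min((j + 1) * bin_size, len(order))
        frame = order[start:end].copy()
        rng.shuffle(frame)
        protected.append(frame)
        if j > 0:
            # Boundary between bin j-1 and bin j: the split value is the
            # midpoint of the two adjacent raw values, i.e. each bin
            # contributes its edge value / 2.0.
            split_value[start] = (values[order[start - 1]] +
                                  values[order[start]]) / 2.0
    return np.concatenate(protected), split_value


order, split_value = protect_feature_order(
    np.array([5., 1., 9., 3., 7., 2., 8., 4.]), bin_num=2)
# order mixes ranks within each half; split_value == {4: 4.5}

The share_bin=False branch in the class above instead records the bin mean for every position inside the bin, so no per-boundary split positions need to be shared with the label owner.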
67 changes: 53 additions & 14 deletions federatedscope/vertical_fl/trainer/trainer.py
@@ -9,17 +9,18 @@


class VerticalTrainer(object):
def __init__(self, model, data, device, config, monitor, only_for_eval):
def __init__(self, model, data, device, config, monitor):
self.model = model
self.data = data
self.device = device
self.cfg = config
self.monitor = monitor
self.only_for_eval = only_for_eval

self.bin_num = config.train.optimizer.bin_num
self.eta = config.train.optimizer.eta

self.merged_feature_order = None
self.client_feature_order = None
self.extra_info = None
self.batch_x = None
self.batch_y = None
self.batch_y_hat = None
@@ -31,24 +32,32 @@ def prepare_for_train(self, index=None):
shuffled=True)
self.criterion = get_vertical_loss(self.cfg.criterion.type)
batch_index, self.batch_x, self.batch_y = self._fetch_train_data(index)
feature_order = self._get_feature_order(self.batch_x)
feature_order_info = self._get_feature_order_info(self.batch_x)
self.client_feature_order = feature_order_info['feature_order']
if index is None:
self.batch_y_hat = np.random.uniform(low=0.0,
high=1.0,
size=len(self.batch_y))
self.batch_z = 0
return batch_index, feature_order
return batch_index, feature_order_info

def train(self, feature_order=None, tree_num=0, node_num=None):
def train(self, feature_order_info=None, tree_num=0, node_num=None):
# Start to build a tree
if node_num is None:
if tree_num == 0 and feature_order is not None:
self.feature_order = feature_order
if tree_num == 0 and feature_order_info is not None:
self.merged_feature_order, self.extra_info = \
self._parse_feature_order(feature_order_info)
return self._compute_for_root(tree_num=tree_num)
# Continue training
else:
return self._compute_for_node(tree_num, node_num)

def get_feature_value(self, feature_idx, value_idx):
assert self.batch_x is not None

instance_idx = self.client_feature_order[feature_idx][value_idx]
return self.batch_x[instance_idx, feature_idx]

def _predict(self, tree_num):
self._compute_weight(tree_num, node_num=0)

@@ -58,15 +67,35 @@ def _fetch_train_data(self, index=None):
else:
return index, self.data['train']['x'][index], None

def _get_feature_order(self, data):
def _parse_feature_order(self, feature_order_info):
client_ids = list(feature_order_info.keys())
client_ids = sorted(client_ids)
merged_feature_order = np.concatenate(
[feature_order_info[idx]['feature_order'] for idx in client_ids])

# TODO: different extra_info for different clients
extra_info = feature_order_info[client_ids[0]].get('extra_info', None)
if extra_info is not None:
merged_extra_info = dict()
for each_key in extra_info.keys():
merged_extra_info[each_key] = np.concatenate([
feature_order_info[idx]['extra_info'][each_key]
for idx in client_ids
])
else:
merged_extra_info = None

return merged_feature_order, merged_extra_info

def _get_feature_order_info(self, data):
num_of_feature = data.shape[1]
feature_order = [0] * num_of_feature
for i in range(num_of_feature):
feature_order[i] = data[:, i].argsort()
return feature_order
return {'feature_order': feature_order}

def _get_ordered_gh(self, tree_num, node_num, feature_idx):
order = self.feature_order[feature_idx]
order = self.merged_feature_order[feature_idx]
ordered_g = self.model[tree_num][node_num].grad[order]
ordered_h = self.model[tree_num][node_num].hess[order]
return ordered_g, ordered_h
@@ -76,11 +105,20 @@ def _get_best_gain(self, tree_num, node_num):
split_ref = {'feature_idx': None, 'value_idx': None}

instance_num = self.batch_x.shape[0]
feature_num = len(self.feature_order)
feature_num = len(self.merged_feature_order)
if self.extra_info is not None:
split_position = self.extra_info.get(
'split_position',
[range(instance_num) for _ in range(feature_num)])
else:
# The left/right sub-tree cannot be empty
split_position = [
range(1, instance_num) for _ in range(feature_num)
]
for feature_idx in range(feature_num):
ordered_g, ordered_h = self._get_ordered_gh(
tree_num, node_num, feature_idx)
for value_idx in range(instance_num):
for value_idx in split_position[feature_idx]:
gain = self.model[tree_num].cal_gain(ordered_g, ordered_h,
value_idx)

@@ -116,7 +154,8 @@ def _compute_for_node(self, tree_num, node_num):
else:
best_gain, split_ref = self._get_best_gain(tree_num, node_num)
if best_gain > 0:
split_feature = self.feature_order[split_ref['feature_idx']]
split_feature = self.merged_feature_order[
split_ref['feature_idx']]
left_child = np.zeros(self.batch_x.shape[0])
for x in range(split_ref['value_idx']):
left_child[split_feature[x]] = 1
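For a concrete picture of what the label owner does with the reported orders, here is a small sketch of the merging step in _parse_feature_order under an assumed two-client message dict (client ids, feature counts, and the numbers are all made up; the shapes and keys follow the code above):

import numpy as np

# Assumed reports from two clients: each carries the (protected) order of its
# own features, plus optional extra info such as shared bin boundaries.
feature_order_info = {
    1: {'feature_order': [np.array([2, 0, 3, 1]), np.array([1, 3, 2, 0])],
        'extra_info': {'split_position': [[2], [2]]}},
    2: {'feature_order': [np.array([0, 2, 1, 3])],
        'extra_info': {'split_position': [[2]]}},
}

client_ids = sorted(feature_order_info.keys())
merged_feature_order = np.concatenate(
    [feature_order_info[idx]['feature_order'] for idx in client_ids])
merged_split_position = np.concatenate(
    [feature_order_info[idx]['extra_info']['split_position']
     for idx in client_ids])

# merged_feature_order has one row per feature across all clients, and
# _get_best_gain only evaluates the candidate positions in
# merged_split_position instead of every instance index.
print(merged_feature_order.shape, merged_split_position.tolist())
# (3, 4) [[2], [2], [2]]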
21 changes: 21 additions & 0 deletions federatedscope/vertical_fl/trainer/utils.py
@@ -0,0 +1,21 @@
from federatedscope.vertical_fl.trainer import VerticalTrainer, \
FeatureOrderProtectedTrainer


def get_vertical_trainer(config, model, data, device, monitor):

protect_object = config.vertical.protect_object
if not protect_object or protect_object == '':
return VerticalTrainer(model=model,
data=data,
device=device,
config=config,
monitor=monitor)
elif protect_object == 'feature_order':
return FeatureOrderProtectedTrainer(model=model,
data=data,
device=device,
config=config,
monitor=monitor)
else:
raise ValueError
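A minimal dispatch sketch for the factory above. The SimpleNamespace config is only a stand-in covering the fields the trainers read during __init__ (in practice the real config object built by cfg_fl_setting.py is used), and passing None for model/data/monitor is just to show which trainer class gets selected:

from types import SimpleNamespace

from federatedscope.vertical_fl.trainer.utils import get_vertical_trainer

# Stand-in config: only the attributes accessed in VerticalTrainer.__init__
# and FeatureOrderProtectedTrainer.__init__ are provided here.
cfg = SimpleNamespace(
    vertical=SimpleNamespace(
        protect_object='feature_order',  # '' falls back to VerticalTrainer
        protect_method='use_bins',
        protect_args=[{'bin_num': 100, 'share_bin': True}]),
    train=SimpleNamespace(optimizer=SimpleNamespace(bin_num=100, eta=0.5)),
)

trainer = get_vertical_trainer(config=cfg,
                               model=None,
                               data=None,
                               device='cpu',
                               monitor=None)
print(type(trainer).__name__)  # FeatureOrderProtectedTrainer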
@@ -30,7 +30,6 @@ vertical:
use: True
dims: [4, 8]
algo: 'xgb'
xgb_use_bin: True
eval:
freq: 5
best_res_update_round_wise_key: test_loss
@@ -30,7 +30,6 @@ vertical:
use: True
dims: [7, 14]
algo: 'xgb'
xgb_use_bin: True
eval:
freq: 3
best_res_update_round_wise_key: test_loss
@@ -30,7 +30,6 @@ vertical:
use: True
dims: [10, 20]
algo: 'xgb'
xgb_use_bin: True
eval:
freq: 3
best_res_update_round_wise_key: test_loss
@@ -30,7 +30,6 @@ vertical:
use: True
dims: [5, 10]
algo: 'xgb'
xgb_use_bin: True
eval:
freq: 3
best_res_update_round_wise_key: test_loss
@@ -0,0 +1,38 @@
use_gpu: False
device: 0
backend: torch
federate:
mode: standalone
client_num: 2
model:
type: xgb_tree
lambda_: 0.1
gamma: 0
num_of_trees: 10
max_tree_depth: 3
data:
root: data/
type: adult
splits: [1.0, 0.0]
dataloader:
type: raw
batch_size: 2000
criterion:
type: CrossEntropyLoss
trainer:
type: verticaltrainer
train:
optimizer:
bin_num: 100
# learning rate for xgb model
eta: 0.5
vertical:
use: True
dims: [7, 14]
algo: 'xgb'
protect_object: 'feature_order'
protect_method: 'use_bins'
protect_args: [{'bin_num': 100, 'share_bin': True}]
eval:
freq: 3
best_res_update_round_wise_key: test_loss
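Assuming the standard FederatedScope entry point (federatedscope/main.py), an example config like this one would be launched as follows; the yaml path is a placeholder for wherever this file lands in the repo:

python federatedscope/main.py --cfg <path-to-this-example>.yaml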