Add protect method for feature order in fl-xgb #497

Merged · 24 commits · Jan 31, 2023
Changes from 1 commit
1 change: 0 additions & 1 deletion federatedscope/autotune/baseline/fedhpo_vfl.yaml
@@ -14,7 +14,6 @@ model:
train:
optimizer:
lr: 0.5
bin_num: 100
# learning rate for xgb model
eta: 0.5
data:
5 changes: 3 additions & 2 deletions federatedscope/core/configs/cfg_fl_setting.py
@@ -77,9 +77,10 @@ def extend_fl_setting_cfg(cfg):
cfg.vertical.encryption = 'paillier'
cfg.vertical.key_size = 3072
cfg.vertical.algo = 'lr' # ['lr', 'xgb']
cfg.vertical.protect_object = ''
cfg.vertical.protect_method = ''
cfg.vertical.protect_object = '' # feature_order, TODO: add more
cfg.vertical.protect_method = '' # dp
cfg.vertical.protect_args = []
# Default values for 'dp': {'bucket_num':100, 'epsilon':None}

# --------------- register corresponding check function ----------
cfg.register_cfg_check_fun(assert_fl_setting_cfg)
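For reference, a minimal sketch of how the new protection options fit together on the config object (the values below are taken from the test configs later in this PR; this is an illustration, not a new default):

    # Enable DP-based feature-order protection for vertical XGBoost.
    # The keys mirror those registered in extend_fl_setting_cfg above.
    cfg.vertical.use = True
    cfg.vertical.algo = 'xgb'
    cfg.vertical.protect_object = 'feature_order'
    cfg.vertical.protect_method = 'dp'
    # 'bucket_num': how many buckets the feature order is split into;
    # 'epsilon': DP budget (None falls back to pure bucketization).
    cfg.vertical.protect_args = [{'bucket_num': 100, 'epsilon': 10}]
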
120 changes: 82 additions & 38 deletions federatedscope/vertical_fl/trainer/feature_order_protected_trainer.py
@@ -12,10 +12,10 @@ def __init__(self, model, data, device, config, monitor):
args = config.vertical.protect_args[0] if len(
config.vertical.protect_args) > 0 else {}

if config.vertical.protect_method == 'use_bins':
self.bin_num = args.get('bin_num', 100)
self.share_bin = args.get('share_bin', False)
self.protect_funcs = self._protect_via_bins
if config.vertical.protect_method == 'dp':
self.bucket_num = args.get('bucket_num', 100)
self.epsilon = args.get('epsilon', None)
self.protect_funcs = self._protect_via_dp
self.split_value = None
else:
raise ValueError(f"The method {args['method']} is not provided")
@@ -25,44 +25,88 @@ def get_feature_value(self, feature_idx, value_idx):

return self.split_value[feature_idx][value_idx]

def _protect_via_bins(self, raw_feature_order, data):
def _bucketize(self, feature_order, bucket_size, bucket_num):
bucketized_feature_order = list()
for bucket_idx in range(bucket_num):
start = bucket_idx * bucket_size
end = min((bucket_idx + 1) * bucket_size, len(feature_order))
bucketized_feature_order.append(feature_order[start:end])
return bucketized_feature_order

def _protect_via_dp(self, raw_feature_order, data):
protected_feature_order = list()
bin_size = int(np.ceil(self.cfg.dataloader.batch_size / self.bin_num))
split_position = [[] for _ in range(len(raw_feature_order))
] if self.share_bin else None
self.split_value = [dict() for _ in range(len(raw_feature_order))]
for i in range(len(raw_feature_order)):
_protected_feature_order = list()
for j in range(self.bin_num):
idx_start = j * bin_size
idx_end = min((j + 1) * bin_size, len(raw_feature_order[i]))
feature_order_frame = raw_feature_order[i][idx_start:idx_end]
np.random.shuffle(feature_order_frame)
_protected_feature_order.append(feature_order_frame)
if self.share_bin:
if j != self.bin_num - 1:
split_position[i].append(idx_end)
min_value = min(data[feature_order_frame][:, i])
max_value = max(data[feature_order_frame][:, i])
if j == 0:
self.split_value[i][idx_end] = max_value
elif j == self.bin_num - 1:
self.split_value[i][idx_start] += min_value / 2.0
bucket_size = int(
np.ceil(self.cfg.dataloader.batch_size / self.bucket_num))
if self.epsilon is None:
prob_for_preserving = 1.0
else:
_tmp = np.power(np.e, self.epsilon)
prob_for_preserving = _tmp / (_tmp + self.bucket_num - 1)
prob_for_moving = (1.0 - prob_for_preserving) / (self.bucket_num - 1)
split_position = []
self.split_value = []

for feature_idx in range(len(raw_feature_order)):
bucketized_feature_order = self._bucketize(
raw_feature_order[feature_idx], bucket_size, self.bucket_num)
noisy_bucketizd_feature_order = [[]
for _ in range(self.bucket_num)]

# Add noise to bucketized feature order
for bucket_idx in range(self.bucket_num):
probs = np.ones(self.bucket_num) * prob_for_moving
probs[bucket_idx] = prob_for_preserving
for each in bucketized_feature_order[bucket_idx]:
selected_bucket_idx = np.random.choice(list(
range(self.bucket_num)),
p=probs)
noisy_bucketizd_feature_order[selected_bucket_idx].append(
each)

# Save split positions (instance number within buckets)
# We exclude the endpoints to avoid empty sub-trees
_split_position = list()
_split_value = dict()
accumu_num = 0
for bucket_idx, each_bucket in enumerate(
noisy_bucketizd_feature_order):
instance_num = len(each_bucket)
# Skip the empty bucket
if instance_num != 0:
# Skip the endpoints
if bucket_idx != self.bucket_num - 1:
_split_position.append(accumu_num + instance_num)

# Save split values: average of min value of (j-1)-th
# bucket and max value of j-th bucket
max_value = data[bucketized_feature_order[bucket_idx]
[0]][feature_idx]
min_value = data[bucketized_feature_order[bucket_idx]
[-1]][feature_idx]
if accumu_num == 0:
_split_value[accumu_num +
instance_num] = min_value / 2.0
elif bucket_idx == self.bucket_num - 1:
_split_value[accumu_num] += max_value / 2.0
else:
self.split_value[i][idx_start] += min_value / 2.0
self.split_value[i][idx_end] = max_value / 2.0
else:
mean_value = np.mean(data[feature_order_frame][:, i])
for x in range(idx_start, idx_end):
self.split_value[i][x] = mean_value
protected_feature_order.append(
np.concatenate(_protected_feature_order))

extra_info = None
if split_position is not None:
extra_info = {'split_position': split_position}
_split_value[accumu_num] += max_value / 2.0
_split_value[accumu_num +
instance_num] = min_value / 2.0

accumu_num += instance_num

split_position.append(_split_position)
self.split_value.append(_split_value)

[np.random.shuffle(x) for x in noisy_bucketizd_feature_order]
noisy_bucketizd_feature_order = np.concatenate(
noisy_bucketizd_feature_order)
protected_feature_order.append(noisy_bucketizd_feature_order)

extra_info = {'split_position': split_position}

return {
'raw_feature_order': raw_feature_order,
'feature_order': protected_feature_order,
'extra_info': extra_info
}
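At the core of _protect_via_dp is a randomized-response step over bucket indices: an instance keeps its own bucket with probability e^epsilon / (e^epsilon + bucket_num - 1) and is moved to each other bucket with the remaining probability shared evenly. A self-contained sketch of just that step (simplified relative to the trainer code above; the function name and signature are illustrative):

    import numpy as np

    def randomized_response_buckets(bucket_ids, bucket_num, epsilon=None, seed=None):
        # Return a noisy bucket index for each input bucket index.
        # epsilon=None leaves the assignment unchanged (pure bucketization).
        rng = np.random.default_rng(seed)
        if epsilon is None:
            return list(bucket_ids)
        p_keep = np.exp(epsilon) / (np.exp(epsilon) + bucket_num - 1)
        p_move = (1.0 - p_keep) / (bucket_num - 1)
        noisy = []
        for b in bucket_ids:
            probs = np.full(bucket_num, p_move)
            probs[b] = p_keep
            noisy.append(int(rng.choice(bucket_num, p=probs)))
        return noisy

    # Example: with 100 buckets, epsilon=10 keeps ~99.6% of assignments,
    # while epsilon=1 keeps only ~2.7%, i.e. the order becomes almost random.
    print(randomized_response_buckets([0, 1, 2, 3], bucket_num=100, epsilon=10, seed=0))
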
8 changes: 7 additions & 1 deletion federatedscope/vertical_fl/trainer/trainer.py
@@ -33,7 +33,13 @@ def prepare_for_train(self, index=None):
self.criterion = get_vertical_loss(self.cfg.criterion.type)
batch_index, self.batch_x, self.batch_y = self._fetch_train_data(index)
feature_order_info = self._get_feature_order_info(self.batch_x)
self.client_feature_order = feature_order_info['feature_order']
if 'raw_feature_order' in feature_order_info:
# When applying protect method, the raw (real) feature order might
# be different from the shared feature order
self.client_feature_order = feature_order_info['raw_feature_order']
feature_order_info.pop('raw_feature_order')
else:
self.client_feature_order = feature_order_info['feature_order']
if index is None:
self.batch_y_hat = np.random.uniform(low=0.0,
high=1.0,
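For clarity, the dict returned by the protected trainer (see _protect_via_dp above) looks roughly like the toy example below, which is why prepare_for_train keeps 'raw_feature_order' locally and pops it before the remaining info is used further (values here are placeholders, not real data):

    import numpy as np

    feature_order_info = {
        # the true order, kept only on the client as self.client_feature_order
        'raw_feature_order': [np.array([2, 0, 1, 3])],
        # the protected (noisy) order that may be shared with other parties
        'feature_order': [np.array([0, 2, 3, 1])],
        # per-feature cumulative instance counts at the bucket boundaries
        'extra_info': {'split_position': [[2]]},
    }
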
@@ -23,7 +23,6 @@ trainer:
type: verticaltrainer
train:
optimizer:
bin_num: 1000
# learning rate for xgb model
eta: 0.5
vertical:
@@ -23,7 +23,6 @@ trainer:
type: verticaltrainer
train:
optimizer:
bin_num: 100
# learning rate for xgb model
eta: 0.5
vertical:
@@ -23,7 +23,6 @@ trainer:
type: verticaltrainer
train:
optimizer:
bin_num: 1000
# learning rate for xgb model
eta: 1
vertical:
@@ -23,7 +23,6 @@ trainer:
type: verticaltrainer
train:
optimizer:
bin_num: 100
# learning rate for xgb model
eta: 0.5
vertical:
@@ -23,16 +23,15 @@ trainer:
type: verticaltrainer
train:
optimizer:
bin_num: 100
# learning rate for xgb model
eta: 0.5
vertical:
use: True
dims: [7, 14]
algo: 'xgb'
protect_object: 'feature_order'
protect_method: 'use_bins'
protect_args: [{'bin_num': 100, 'share_bin': True}]
protect_method: 'dp'
protect_args: [{'bucket_num': 100, 'epsilon':10}]
eval:
freq: 3
best_res_update_round_wise_key: test_loss
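Assuming the usual FederatedScope entry point, a config like the one above can be exercised with python federatedscope/main.py --cfg <path-to-this-yaml> (the path is a placeholder; substitute the actual file).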
@@ -28,7 +28,6 @@ model:
type: lr
train:
optimizer:
bin_num: 100
lambda_: 0.1
gamma: 0
num_of_trees: 10
@@ -28,7 +28,6 @@ model:
type: lr
train:
optimizer:
bin_num: 100
lambda_: 0.1
gamma: 0
num_of_trees: 10
@@ -26,7 +26,6 @@ model:
type: lr
train:
optimizer:
bin_num: 100
lambda_: 0.1
gamma: 0
num_of_trees: 10
79 changes: 69 additions & 10 deletions tests/test_xgb.py
@@ -49,7 +49,7 @@ def set_config_for_xgb_base(self, cfg):

return backup_cfg

def set_config_for_xgb_use_bin_share_bin(self, cfg):
def set_config_for_xgb_dp(self, cfg):
backup_cfg = cfg.clone()

import torch
@@ -79,16 +79,16 @@ def set_config_for_xgb_use_bin_share_bin(self, cfg):
cfg.vertical.dims = [7, 14]
cfg.vertical.algo = 'xgb'
cfg.vertical.protect_object = 'feature_order'
cfg.vertical.protect_method = 'use_bins'
cfg.vertical.protect_args = [{'bin_num': 100, 'share_bin': True}]
cfg.vertical.protect_method = 'dp'
cfg.vertical.protect_args = [{'bucket_num': 100, 'epsilon': 10}]

cfg.trainer.type = 'verticaltrainer'
cfg.eval.freq = 5
cfg.eval.best_res_update_round_wise_key = "test_loss"

return backup_cfg

def set_config_for_xgb_use_bin_no_share_bin(self, cfg):
def set_config_for_xgb_dp_too_large_noise(self, cfg):
backup_cfg = cfg.clone()

import torch
@@ -118,8 +118,47 @@ def set_config_for_xgb_use_bin_no_share_bin(self, cfg):
cfg.vertical.dims = [7, 14]
cfg.vertical.algo = 'xgb'
cfg.vertical.protect_object = 'feature_order'
cfg.vertical.protect_method = 'use_bins'
cfg.vertical.protect_args = [{'bin_num': 100, 'share_bin': False}]
cfg.vertical.protect_method = 'dp'
cfg.vertical.protect_args = [{'bucket_num': 100, 'epsilon': 1}]

cfg.trainer.type = 'verticaltrainer'
cfg.eval.freq = 5
cfg.eval.best_res_update_round_wise_key = "test_loss"

return backup_cfg

def set_config_for_xgb_bucket(self, cfg):
backup_cfg = cfg.clone()

import torch
cfg.use_gpu = torch.cuda.is_available()

cfg.federate.mode = 'standalone'
cfg.federate.client_num = 2

cfg.model.type = 'xgb_tree'
cfg.model.lambda_ = 0.1
cfg.model.gamma = 0
cfg.model.num_of_trees = 5
cfg.model.max_tree_depth = 3

cfg.train.optimizer.eta = 0.5

cfg.data.root = 'test_data/'
cfg.data.type = 'adult'
cfg.data.size = 2000

cfg.dataloader.type = 'raw'
cfg.dataloader.batch_size = 2000

cfg.criterion.type = 'CrossEntropyLoss'

cfg.vertical.use = True
cfg.vertical.dims = [7, 14]
cfg.vertical.algo = 'xgb'
cfg.vertical.protect_object = 'feature_order'
cfg.vertical.protect_method = 'dp'
cfg.vertical.protect_args = [{'bucket_num': 100}]

cfg.trainer.type = 'verticaltrainer'
cfg.eval.freq = 5
@@ -148,9 +187,9 @@ def test_XGB_Base(self):
self.assertGreater(test_results['server_global_eval']['test_acc'],
0.79)

def test_XGB_use_bin_share_bin(self):
def test_XGB_use_dp(self):
init_cfg = global_cfg.clone()
backup_cfg = self.set_config_for_xgb_use_bin_share_bin(init_cfg)
backup_cfg = self.set_config_for_xgb_dp(init_cfg)
setup_seed(init_cfg.seed)
update_logger(init_cfg, True)

@@ -169,9 +208,29 @@ def test_XGB_use_bin_share_bin(self):
self.assertGreater(test_results['server_global_eval']['test_acc'],
0.79)

def test_XGB_use_bin_no_share_bin(self):
def test_XGB_use_dp_too_large_noise(self):
init_cfg = global_cfg.clone()
backup_cfg = self.set_config_for_xgb_dp_too_large_noise(init_cfg)
setup_seed(init_cfg.seed)
update_logger(init_cfg, True)

data, modified_config = get_data(init_cfg.clone())
init_cfg.merge_from_other_cfg(modified_config)
self.assertIsNotNone(data)

Fed_runner = get_runner(data=data,
server_class=get_server_cls(init_cfg),
client_class=get_client_cls(init_cfg),
config=init_cfg.clone())
self.assertIsNotNone(Fed_runner)
test_results = Fed_runner.run()
init_cfg.merge_from_other_cfg(backup_cfg)
print(test_results)
self.assertLess(test_results['server_global_eval']['test_acc'], 0.6)

def test_XGB_use_bucket(self):
init_cfg = global_cfg.clone()
backup_cfg = self.set_config_for_xgb_use_bin_no_share_bin(init_cfg)
backup_cfg = self.set_config_for_xgb_bucket(init_cfg)
setup_seed(init_cfg.seed)
update_logger(init_cfg, True)

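As a quick sanity check on the two DP tests above, the preserving probability defined in _protect_via_dp, evaluated for bucket_num = 100, explains the expected accuracy gap:

    import numpy as np

    def prob_preserving(epsilon, bucket_num=100):
        # Probability that an instance stays in its own bucket.
        t = np.exp(epsilon)
        return t / (t + bucket_num - 1)

    print(prob_preserving(10))  # ~0.996: order nearly intact, test_acc expected above 0.79
    print(prob_preserving(1))   # ~0.027: order nearly random, test_acc expected below 0.6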