Add protect method for feature order in fl-xgb (#497)
- Bucketize feature orders and shuffle
- DP algorithms
- OpBoost algorithms
xieyxclack authored Jan 31, 2023
1 parent 8457b06 commit 8d042b2
Showing 22 changed files with 729 additions and 59 deletions.
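The new options can be driven from a script as follows — a minimal sketch, assuming the usual FederatedScope `global_cfg` entry point (the import path and `cfg.vertical.use` are assumptions; only the `cfg.vertical.protect_*` keys are defined by this commit):

from federatedscope.core.configs.config import global_cfg  # assumed entry point

cfg = global_cfg.clone()
cfg.vertical.use = True
cfg.vertical.algo = 'xgb'
# Options added in this commit (see cfg_fl_setting.py below)
cfg.vertical.protect_object = 'feature_order'
cfg.vertical.protect_method = 'dp'
cfg.vertical.protect_args = [{'bucket_num': 100, 'epsilon': 2}]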
2 changes: 0 additions & 2 deletions federatedscope/autotune/baseline/fedhpo_vfl.yaml
@@ -14,7 +14,6 @@ model:
 train:
   optimizer:
     lr: 0.5
-    bin_num: 100
     # learning rate for xgb model
     eta: 0.5
 data:
@@ -36,7 +35,6 @@ vertical:
   key_size: 256
   dims: [7, 14]
   algo: 'xgb'
-  xgb_use_bin: False
 eval:
   freq: 5
   best_res_update_round_wise_key: test_loss
11 changes: 8 additions & 3 deletions federatedscope/core/auxiliaries/trainer_builder.py
@@ -28,7 +28,6 @@
     "mftrainer": "MFTrainer",
     "cltrainer": "CLTrainer",
     "lptrainer": "LPTrainer",
-    "verticaltrainer": "VerticalTrainer",
     "atc_trainer": "ATCTrainer",
 }
 
@@ -135,8 +134,6 @@ def get_trainer(model=None,
         dict_path = "federatedscope.cv.trainer.trainer"
     elif config.trainer.type.lower() in ['nlptrainer']:
         dict_path = "federatedscope.nlp.trainer.trainer"
-    elif config.trainer.type.lower() in ['verticaltrainer']:
-        dict_path = "federatedscope.vertical_fl.trainer.trainer"
     elif config.trainer.type.lower() in ['cltrainer', 'lptrainer']:
         dict_path = "federatedscope.cl.trainer.trainer"
     elif config.trainer.type.lower() in [
@@ -171,6 +168,14 @@
                                   config=config,
                                   only_for_eval=only_for_eval,
                                   monitor=monitor)
+    elif config.trainer.type.lower() in ['verticaltrainer']:
+        from federatedscope.vertical_fl.trainer.utils import \
+            get_vertical_trainer
+        trainer = get_vertical_trainer(config=config,
+                                       model=model,
+                                       data=data,
+                                       device=device,
+                                       monitor=monitor)
     else:
         # try to find user registered trainer
         trainer = None
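The dispatch above defers to get_vertical_trainer, defined in federatedscope/vertical_fl/trainer/utils.py, which is not shown in this excerpt. A plausible sketch of it, inferred from the exports in trainer/__init__.py and the cfg.vertical.protect_object option below — the body is an assumption; only the name and signature are confirmed by the import above:

# Hypothetical sketch of federatedscope/vertical_fl/trainer/utils.py
def get_vertical_trainer(config, model, data, device, monitor):
    # Protect the feature order when requested, otherwise fall back
    # to the plain VerticalTrainer
    if config.vertical.protect_object == 'feature_order':
        from federatedscope.vertical_fl.trainer import \
            FeatureOrderProtectedTrainer
        return FeatureOrderProtectedTrainer(model=model, data=data,
                                            device=device, config=config,
                                            monitor=monitor)
    from federatedscope.vertical_fl.trainer import VerticalTrainer
    return VerticalTrainer(model=model, data=data, device=device,
                           config=config, monitor=monitor)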
5 changes: 4 additions & 1 deletion federatedscope/core/configs/cfg_fl_setting.py
@@ -77,7 +77,10 @@ def extend_fl_setting_cfg(cfg):
     cfg.vertical.encryption = 'paillier'
     cfg.vertical.key_size = 3072
     cfg.vertical.algo = 'lr'  # ['lr', 'xgb']
-    cfg.vertical.xgb_use_bin = False
+    cfg.vertical.protect_object = ''  # feature_order, TODO: add more
+    cfg.vertical.protect_method = ''  # dp
+    cfg.vertical.protect_args = []
+    # Default values for 'dp': {'bucket_num': 100, 'epsilon': None}
 
     # --------------- register corresponding check function ----------
     cfg.register_cfg_check_fun(assert_fl_setting_cfg)
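protect_args carries the method-specific options as a single dict; the recognized keys below are taken from FeatureOrderProtectedTrainer.__init__ shown later in this commit:

# DP-based protection (FederBoost-style bucketized randomized response)
cfg.vertical.protect_args = [{'bucket_num': 100, 'epsilon': 2}]

# OpBoost with the 'global' mapping
cfg.vertical.protect_args = [{'algo': 'global', 'epsilon': 2,
                              'lower_bound': 1, 'upper_bound': 100}]

# OpBoost with the 'adjusting' mapping
cfg.vertical.protect_args = [{'algo': 'adjusting', 'epsilon_prt': 2,
                              'epsilon_ner': 2, 'partition_num': 10}]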
1 change: 1 addition & 0 deletions federatedscope/vertical_fl/dataset/blog.py
@@ -103,6 +103,7 @@ def _get_data(self):
 
     def _read_raw(self, file_path):
         data = pd.read_csv(file_path, header=None, usecols=list(range(281)))
+        data = data.fillna(method='ffill')
         data = data.values
         return data
 
1 change: 1 addition & 0 deletions federatedscope/vertical_fl/dataset/credit.py
@@ -97,6 +97,7 @@ def balance_sample(sample_size, y):
 
     def _read_raw(self, file_path):
         data = pd.read_csv(file_path)
+        data = data.fillna(method='ffill')
         data = data.values
         return data
 
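fillna(method='ffill') forward-fills each NaN with the last valid value in the same column, so the downstream .values array contains no NaNs (apart from rows before the first valid value). As an aside, newer pandas releases deprecate the method= argument in favor of DataFrame.ffill; a quick equivalence check, not part of the commit:

import numpy as np
import pandas as pd

df = pd.DataFrame({'a': [1.0, np.nan, 3.0], 'b': [np.nan, 2.0, np.nan]})
# Both calls forward-fill: column 'a' becomes [1.0, 1.0, 3.0]
assert df.fillna(method='ffill').equals(df.ffill())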
3 changes: 3 additions & 0 deletions federatedscope/vertical_fl/trainer/__init__.py
@@ -0,0 +1,3 @@
from federatedscope.vertical_fl.trainer.trainer import VerticalTrainer
from federatedscope.vertical_fl.trainer.feature_order_protected_trainer \
    import FeatureOrderProtectedTrainer
236 changes: 236 additions & 0 deletions federatedscope/vertical_fl/trainer/feature_order_protected_trainer.py
@@ -0,0 +1,236 @@
import numpy as np
from federatedscope.vertical_fl.trainer.trainer import VerticalTrainer


class FeatureOrderProtectedTrainer(VerticalTrainer):
    def __init__(self, model, data, device, config, monitor):
        super(FeatureOrderProtectedTrainer,
              self).__init__(model, data, device, config, monitor)

        assert config.vertical.protect_method != '', \
            "Please specify the adopted method for protecting feature order"
        args = config.vertical.protect_args[0] if len(
            config.vertical.protect_args) > 0 else {}

        if config.vertical.protect_method == 'dp':
            self.bucket_num = args.get('bucket_num', 100)
            self.epsilon = args.get('epsilon', None)
            self.protect_funcs = self._protect_via_dp
            self.split_value = None
        elif config.vertical.protect_method == 'op_boost':
            self.algo = args.get('algo', 'global')
            self.protect_funcs = self._protect_via_op_boost
            self.lower_bound = args.get('lower_bound', 1)
            self.upper_bound = args.get('upper_bound', 100)
            if self.algo == 'global':
                self.epsilon = args.get('epsilon', 2)
            elif self.algo == 'adjusting':
                self.epsilon_prt = args.get('epsilon_prt', 2)
                self.epsilon_ner = args.get('epsilon_ner', 2)
                self.partition_num = args.get('partition_num', 10)
            else:
                raise ValueError(
                    f"The algo {self.algo} is not supported in op_boost")
        else:
            raise ValueError(
                f"The method {config.vertical.protect_method} "
                "is not provided")

    def get_feature_value(self, feature_idx, value_idx):
        if not hasattr(self, 'split_value') or self.split_value is None:
            return super().get_feature_value(feature_idx=feature_idx,
                                             value_idx=value_idx)

        return self.split_value[feature_idx][value_idx]

    def _bucketize(self, feature_order, bucket_size, bucket_num):
        bucketized_feature_order = list()
        for bucket_idx in range(bucket_num):
            start = bucket_idx * bucket_size
            end = min((bucket_idx + 1) * bucket_size, len(feature_order))
            bucketized_feature_order.append(feature_order[start:end])
        return bucketized_feature_order

    def _processed_data(self, data):
        min_value = np.min(data, axis=0)
        max_value = np.max(data, axis=0)
        # To avoid division by zero when max_value[i] == min_value[i]
        for i in range(data.shape[1]):
            if max_value[i] == min_value[i]:
                max_value[i] += 1
        return np.round(self.lower_bound + (data - min_value) /
                        (max_value - min_value) *
                        (self.upper_bound - self.lower_bound))

    def _global_mapping_fun(self, x, epsilon, lower_bound, upper_bound):
        # Exponential mechanism over the integer domain
        # [lower_bound, upper_bound]: P(k) proportional to
        # exp(-|x - k| * epsilon / 2)
        probs = list()
        denominator = np.sum(
            np.exp(-np.abs(x - np.arange(lower_bound, upper_bound + 1)) *
                   epsilon / 2))
        for k in range(lower_bound, upper_bound + 1):
            probs.append(np.exp(-np.abs(x - k) * epsilon / 2) / denominator)
        res = np.random.choice(list(range(lower_bound, upper_bound + 1)),
                               p=probs)

        return res

    def _adjusting_mapping_fun(self, x, partition_edges):
        # Two-stage sampling: first choose a partition via the exponential
        # mechanism, then choose a value within the selected partition
        for part_idx in range(self.partition_num):
            if partition_edges[part_idx] < x <= partition_edges[part_idx + 1]:
                selected_part = self._global_mapping_fun(
                    part_idx,
                    epsilon=self.epsilon_prt,
                    lower_bound=0,
                    upper_bound=self.partition_num - 1)
                res = self._global_mapping_fun(
                    x,
                    epsilon=self.epsilon_ner,
                    lower_bound=partition_edges[selected_part] + 1,
                    upper_bound=partition_edges[selected_part + 1])

                return res

    def _op_boost_global(self, data):
        processed_data = self._processed_data(data=data)
        mapped_data = np.vectorize(self._global_mapping_fun)(
            processed_data,
            epsilon=self.epsilon,
            lower_bound=self.lower_bound,
            upper_bound=self.upper_bound)

        return mapped_data

    def _op_boost_adjusting(self, data):
        processed_data = self._processed_data(data=data)
        quantiles = np.linspace(0, 100, self.partition_num + 1)
        partition_edges = np.round(
            np.asarray(
                np.percentile(
                    list(range(self.lower_bound - 1, self.upper_bound + 1)),
                    quantiles)))
        partition_edges = [int(x) for x in partition_edges]
        mapped_data = np.vectorize(self._adjusting_mapping_fun,
                                   signature='(),(n)->()')(
                                       processed_data,
                                       partition_edges=partition_edges)

        return mapped_data

    def _protect_via_op_boost(self, raw_feature_order, data):
        """
        Add random noise to the feature order for privacy protection.
        For more details, please refer to
        OpBoost: A Vertical Federated Tree Boosting Framework Based on
        Order-Preserving Desensitization
        (https://arxiv.org/pdf/2210.01318.pdf)
        """
        if self.algo == 'global':
            mapped_data = self._op_boost_global(data)
        elif self.algo == 'adjusting':
            mapped_data = self._op_boost_adjusting(data)
        else:
            mapped_data = None
        assert mapped_data is not None

        # Get feature order based on mapped data
        num_of_feature = mapped_data.shape[1]
        protected_feature_order = [0] * num_of_feature
        for i in range(num_of_feature):
            protected_feature_order[i] = mapped_data[:, i].argsort()

        return {
            'raw_feature_order': raw_feature_order,
            'feature_order': protected_feature_order,
        }

    def _protect_via_dp(self, raw_feature_order, data):
        """
        Bucketize and add DP noise to the feature order for privacy
        protection. For more details, please refer to
        FederBoost: Private Federated Learning for GBDT
        (https://arxiv.org/pdf/2011.02796.pdf)
        """
        protected_feature_order = list()
        bucket_size = int(
            np.ceil(self.cfg.dataloader.batch_size / self.bucket_num))
        if self.epsilon is None:
            prob_for_preserving = 1.0
        else:
            _tmp = np.power(np.e, self.epsilon)
            prob_for_preserving = _tmp / (_tmp + self.bucket_num - 1)
        prob_for_moving = (1.0 - prob_for_preserving) / (self.bucket_num - 1)
        split_position = []
        self.split_value = []

        for feature_idx in range(len(raw_feature_order)):
            bucketized_feature_order = self._bucketize(
                raw_feature_order[feature_idx], bucket_size, self.bucket_num)
            noisy_bucketized_feature_order = [
                [] for _ in range(self.bucket_num)]

            # Add noise to the bucketized feature order: each instance stays
            # in its own bucket w.p. prob_for_preserving, or moves to any
            # specific other bucket w.p. prob_for_moving (randomized response)
            for bucket_idx in range(self.bucket_num):
                probs = np.ones(self.bucket_num) * prob_for_moving
                probs[bucket_idx] = prob_for_preserving
                for each in bucketized_feature_order[bucket_idx]:
                    selected_bucket_idx = np.random.choice(
                        list(range(self.bucket_num)), p=probs)
                    noisy_bucketized_feature_order[selected_bucket_idx].append(
                        each)

            # Save split positions (instance number within buckets)
            # We exclude the endpoints to avoid empty sub-trees
            _split_position = list()
            _split_value = dict()
            accumu_num = 0
            for bucket_idx, each_bucket in enumerate(
                    noisy_bucketized_feature_order):
                instance_num = len(each_bucket)
                # Skip the empty bucket
                if instance_num != 0:
                    # Skip the endpoints
                    if bucket_idx != self.bucket_num - 1:
                        _split_position.append(accumu_num + instance_num)

                    # Save split values: the average of the max value of
                    # the current bucket and the min value of the next
                    # non-empty bucket
                    max_value = data[
                        bucketized_feature_order[bucket_idx][-1]][feature_idx]
                    min_value = data[
                        bucketized_feature_order[bucket_idx][0]][feature_idx]
                    if accumu_num == 0:
                        _split_value[accumu_num +
                                     instance_num] = max_value / 2.0
                    elif bucket_idx == self.bucket_num - 1:
                        _split_value[accumu_num] += min_value / 2.0
                    else:
                        _split_value[accumu_num] += min_value / 2.0
                        _split_value[accumu_num +
                                     instance_num] = max_value / 2.0

                accumu_num += instance_num

            split_position.append(_split_position)
            self.split_value.append(_split_value)

            # Shuffle within each bucket, then flatten back to an order
            for each_bucket in noisy_bucketized_feature_order:
                np.random.shuffle(each_bucket)
            noisy_bucketized_feature_order = np.concatenate(
                noisy_bucketized_feature_order)
            protected_feature_order.append(noisy_bucketized_feature_order)

        extra_info = {'split_position': split_position}

        return {
            'raw_feature_order': raw_feature_order,
            'feature_order': protected_feature_order,
            'extra_info': extra_info
        }

    # TODO: more flexible for client to choose whether to protect or not
    def _get_feature_order_info(self, data):
        num_of_feature = data.shape[1]
        feature_order = [0] * num_of_feature
        for i in range(num_of_feature):
            feature_order[i] = data[:, i].argsort()
        return self.protect_funcs(feature_order, data)
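As a quick numeric check of the randomized response used in _protect_via_dp (a standalone sketch, not part of the commit): with bucket_num = 100 and epsilon = 2, each instance stays in its own bucket with probability about 0.07, roughly e^2 ≈ 7.4 times the probability of landing in any specific other bucket; with the default epsilon = None, prob_for_preserving is 1.0 and no noise is added:

import numpy as np

bucket_num, epsilon = 100, 2
_tmp = np.power(np.e, epsilon)
prob_for_preserving = _tmp / (_tmp + bucket_num - 1)
prob_for_moving = (1.0 - prob_for_preserving) / (bucket_num - 1)
print(prob_for_preserving)                    # ~0.0695
print(prob_for_moving)                        # ~0.0094
print(prob_for_preserving / prob_for_moving)  # e**epsilon ~= 7.39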