Skip to content

Commit

Permalink
Add random forest in VFL (#523)
Browse files Browse the repository at this point in the history
- dev for random forest
- enable feature protection in rf
- modify unittest
  • Loading branch information
xieyxclack authored Feb 16, 2023
1 parent b5870f7 commit 320a225
Show file tree
Hide file tree
Showing 34 changed files with 915 additions and 377 deletions.
2 changes: 2 additions & 0 deletions federatedscope/autotune/baseline/fedhpo_vfl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ vertical:
key_size: 256
dims: [7, 14]
algo: 'xgb'
data_size_for_debug: 1500
feature_subsample_ratio: 1.0
eval:
freq: 5
best_res_update_round_wise_key: test_loss
Expand Down
4 changes: 3 additions & 1 deletion federatedscope/core/auxiliaries/model_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,7 +185,9 @@ def get_model(model_config, local_data=None, backend='torch'):
elif model_config.type.lower() in ['vmfnet', 'hmfnet']:
from federatedscope.mf.model.model_builder import get_mfnet
model = get_mfnet(model_config, input_shape)
elif model_config.type.lower() in ['xgb_tree', 'gbdt_tree']:
elif model_config.type.lower() in [
'xgb_tree', 'gbdt_tree', 'random_forest'
]:
from federatedscope.vertical_fl.model.model_builder import \
get_tree_model
model = get_tree_model(model_config)
Expand Down
4 changes: 2 additions & 2 deletions federatedscope/core/auxiliaries/worker_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ def get_client_cls(cfg):
if cfg.vertical.algo == 'lr':
from federatedscope.vertical_fl.worker import vFLClient
return vFLClient
elif cfg.vertical.algo in ['xgb', 'gbdt']:
elif cfg.vertical.algo in ['xgb', 'gbdt', 'rf']:
from federatedscope.vertical_fl.xgb_base.worker import XGBClient
return XGBClient
else:
Expand Down Expand Up @@ -173,7 +173,7 @@ def get_server_cls(cfg):
if cfg.vertical.algo == 'lr':
from federatedscope.vertical_fl.worker import vFLServer
return vFLServer
elif cfg.vertical.algo in ['xgb', 'gbdt']:
elif cfg.vertical.algo in ['xgb', 'gbdt', 'rf']:
from federatedscope.vertical_fl.xgb_base.worker import XGBServer
return XGBServer
else:
Expand Down
15 changes: 13 additions & 2 deletions federatedscope/core/configs/cfg_fl_setting.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,11 +78,16 @@ def extend_fl_setting_cfg(cfg):
cfg.vertical.dims = [5, 10] # TODO: we need to explain dims
cfg.vertical.encryption = 'paillier'
cfg.vertical.key_size = 3072
cfg.vertical.algo = 'lr' # ['lr', 'xgb']
cfg.vertical.algo = 'lr' # ['lr', 'xgb', 'gbdt', 'rf']
cfg.vertical.feature_subsample_ratio = 1.0
cfg.vertical.protect_object = '' # feature_order, TODO: add more
cfg.vertical.protect_method = '' # dp
cfg.vertical.protect_method = '' # dp, op_boost
cfg.vertical.protect_args = []
# Default values for 'dp': {'bucket_num':100, 'epsilon':None}
# Default values for 'op_boost': {'algo':'global', 'lower_bound':1,
# 'upper_bound':100, 'epsilon':2}
cfg.vertical.data_size_for_debug = 0 # use a subset for debug in vfl,
# 0 indicates using the entire dataset (disable debug mode)

# --------------- register corresponding check function ----------
cfg.register_cfg_check_fun(assert_fl_setting_cfg)
Expand Down Expand Up @@ -230,5 +235,11 @@ def assert_fl_setting_cfg(cfg):
f"cfg.model.type is changed to 'gbdt_tree' here")
cfg.model.type = 'gbdt_tree'

if not (cfg.vertical.feature_subsample_ratio > 0
and cfg.vertical.feature_subsample_ratio <= 1.0):
raise ValueError(f'The value of vertical.feature_subsample_ratio '
f'must be in (0, 1.0], but got '
f'{cfg.vertical.feature_subsample_ratio}')


register_config("fl_setting", extend_fl_setting_cfg)
1 change: 1 addition & 0 deletions federatedscope/vertical_fl/dataloader/dataloader.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def load_vertical_data(config=None, generate=False):
feature_partition=config.vertical.dims,
tr_frac=config.data.splits[0],
algo=config.vertical.algo,
debug_size=config.vertical.data_size_for_debug,
download=True,
seed=1234,
args=args)
Expand Down
57 changes: 56 additions & 1 deletion federatedscope/vertical_fl/dataloader/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ def batch_iter(data, batch_size, shuffled=True):
batch_size (int): the batch size
shuffled (bool): whether to shuffle the data at the start of each epoch
:returns: sample index, batch of x, batch_of y
:rtype: int, ndarray, ndarry
:rtype: int, ndarray, ndarray
"""

assert 'x' in data and 'y' in data
Expand All @@ -28,3 +28,58 @@ def batch_iter(data, batch_size, shuffled=True):
end_index = min(data_size, (batch + 1) * batch_size)
sample_index = shuffled_index[start_index:end_index]
yield sample_index, data_x[sample_index], data_y[sample_index]


class VerticalDataSampler(object):
    """
    VerticalDataSampler is used to sample a subset of instances and/or
    features from data

    Arguments:
        data (dict): the data to sample from; it must contain the key 'x'
            (an array with at least two dimensions, indexed as
            [instance, feature]) and may contain the key 'y'
        replace (bool): Whether the sample is with or without replacement
        use_full_trainset (bool): if True, ``sample_data`` ignores its
            arguments and always returns the entire dataset
        feature_frac (float): fraction of the features kept by
            ``sample_feature``; at least one feature is always kept
    """
    def __init__(self,
                 data,
                 replace=False,
                 use_full_trainset=True,
                 feature_frac=1.0):
        assert 'x' in data
        self.data_x = data['x']
        self.data_y = data['y'] if 'y' in data else None
        self.data_size = self.data_x.shape[0]
        self.feature_size = self.data_x.shape[1]
        self.replace = replace
        self.use_full_trainset = use_full_trainset
        # The product may land just below an integer because of
        # floating-point error (e.g., 100 * 0.29 = 28.999999999999996),
        # so add a tiny epsilon before truncating to avoid dropping one
        # feature.
        self.selected_feature_num = max(
            1, int(self.feature_size * feature_frac + 1e-9))
        # Feature indices drawn by the latest subsampling call of
        # ``sample_feature``; stays None while all features are used.
        self.selected_feature_index = None

    def sample_data(self, sample_size, index=None):
        """
        Sample instances from the data

        Arguments:
            sample_size (int): the number of instances to draw, capped at
                the size of the dataset; unused when ``index`` is given
            index: if provided, these instances are returned instead of
                drawing random ones
        :returns: instance index, sampled x, sampled y (None if the data
            has no 'y')
        """
        # use the entire dataset
        if self.use_full_trainset:
            return range(len(self.data_x)), self.data_x, self.data_y

        if index is None:
            sample_size = min(sample_size, self.data_size)
            index = np.random.choice(a=self.data_size,
                                     size=sample_size,
                                     replace=self.replace)

        sampled_x = self.data_x[index]
        sampled_y = self.data_y[index] if self.data_y is not None else None

        return index, sampled_x, sampled_y

    def sample_feature(self, x):
        """
        Sample a subset of feature columns from ``x``

        Arguments:
            x: the array whose columns (second axis) are subsampled
        :returns: the selected feature index, and ``x`` restricted to the
            selected features
        """
        # all features are kept, nothing to sample
        if self.selected_feature_num == self.feature_size:
            return range(x.shape[-1]), x

        feature_index = np.random.choice(a=self.feature_size,
                                         size=self.selected_feature_num,
                                         replace=False)
        self.selected_feature_index = feature_index

        return feature_index, x[:, feature_index]
11 changes: 10 additions & 1 deletion federatedscope/vertical_fl/dataset/abalone.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@
import os
import os.path as osp

import numpy as np
import pandas as pd
from torchvision.datasets.utils import download_and_extract_archive

Expand Down Expand Up @@ -43,7 +44,9 @@ class Abalone(object):
args (dict): set True or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr' or 'xgb'
algo(str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
debug_size(int): use a subset for debug,
0 for using entire dataset
download (bool): indicator to download dataset
seed: a random seed
"""
Expand All @@ -58,6 +61,7 @@ def __init__(self,
args,
algo=None,
tr_frac=0.8,
debug_size=0,
download=True,
seed=123):
self.root = root
Expand All @@ -67,6 +71,7 @@ def __init__(self,
self.seed = seed
self.args = args
self.algo = algo
self.data_size_for_debug = debug_size
self.data_dict = {}
self.data = {}

Expand All @@ -84,6 +89,10 @@ def _get_data(self):
file = osp.join(fpath, self.raw_file)
data = self._read_raw(file)
data = self._process(data)
if self.data_size_for_debug != 0:
subset_size = min(len(data), self.data_size_for_debug)
np.random.shuffle(data)
data = data[:subset_size]
train_num = int(self.tr_frac * len(data))
self.data_dict['train'] = data[:train_num]
self.data_dict['test'] = data[train_num:]
Expand Down
14 changes: 11 additions & 3 deletions federatedscope/vertical_fl/dataset/adult.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,9 @@ class Adult(object):
args (dict): set True or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr' or 'xgb'
algo(str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
debug_size(int): use a subset for debug,
0 for using entire dataset
download (bool): indicator to download dataset
seed: a random seed
"""
Expand All @@ -46,6 +48,7 @@ def __init__(self,
args,
algo=None,
tr_frac=0.8,
debug_size=0,
download=True,
seed=123):
super(Adult, self).__init__()
Expand All @@ -56,6 +59,7 @@ def __init__(self,
self.seed = seed
self.args = args
self.algo = algo
self.data_size_for_debug = debug_size
self.data_dict = {}
self.data = {}

Expand All @@ -70,6 +74,10 @@ def _get_data(self):
train_data = self._read_raw(train_file)
test_data = self._read_raw(test_file)
train_data, test_data = self._process(train_data, test_data)
if self.data_size_for_debug != 0:
subset_size = min(len(train_data), self.data_size_for_debug)
np.random.shuffle(train_data)
train_data = train_data[:subset_size]
self._partition_data(train_data, test_data)

def _read_raw(self, file_path):
Expand Down Expand Up @@ -102,6 +110,8 @@ def _process(self, train_set, test_set):

train_set = combined_set[:train_set.shape[0]]
test_set = combined_set[train_set.shape[0]:]
train_set = train_set.values
test_set = test_set.values
return train_set, test_set

# normalization
Expand All @@ -116,8 +126,6 @@ def standardization(self, data):
return (data - mu) / sigma

def _partition_data(self, train_set, test_set):
train_set = train_set.values
test_set = test_set.values
x, y = train_set[:, :-1], train_set[:, -1]
test_x, test_y = test_set[:, :-1], test_set[:, -1]

Expand Down
11 changes: 10 additions & 1 deletion federatedscope/vertical_fl/dataset/blog.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,9 @@ class Blog(object):
args (dict): set True or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr' or 'xgb'
algo(str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
debug_size(int): use a subset for debug,
0 for using entire dataset
download (bool): indicator to download dataset
seed: a random seed
"""
Expand All @@ -60,6 +62,7 @@ def __init__(self,
args,
algo=None,
tr_frac=0.8,
debug_size=0,
download=True,
seed=123):
super(Blog, self).__init__()
Expand All @@ -70,6 +73,7 @@ def __init__(self,
self.seed = seed
self.args = args
self.algo = algo
self.data_size_for_debug = debug_size
self.data_dict = {}
self.data = {}

Expand Down Expand Up @@ -98,6 +102,11 @@ def _get_data(self):
else:
test_data = np.concatenate((test_data, f_data), axis=0)

if self.data_size_for_debug != 0:
subset_size = min(len(train_data), self.data_size_for_debug)
np.random.shuffle(train_data)
train_data = train_data[:subset_size]

self.data_dict['train'] = train_data
self.data_dict['test'] = test_data

Expand Down
11 changes: 10 additions & 1 deletion federatedscope/vertical_fl/dataset/credit.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,9 @@ class Credit(object):
args (dict): set True or False to decide whether
to normalize or standardize the data or not,
e.g., {'normalization': False, 'standardization': False}
algo(str): the running model, 'lr' or 'xgb'
algo(str): the running model, 'lr'/'xgb'/'gbdt'/'rf'
debug_size(int): use a subset for debug,
0 for using entire dataset
download (bool): indicator to download dataset
seed: a random seed
"""
Expand All @@ -41,6 +43,7 @@ def __init__(self,
args,
algo=None,
tr_frac=0.8,
debug_size=0,
download=True,
seed=123):
super(Credit, self).__init__()
Expand All @@ -51,6 +54,7 @@ def __init__(self,
self.seed = seed
self.args = args
self.algo = algo
self.data_size_for_debug = debug_size
self.data_dict = {}
self.data = {}

Expand Down Expand Up @@ -90,6 +94,11 @@ def balance_sample(sample_size, y):
data = data[sample_idx]
# '''

if self.data_size_for_debug != 0:
subset_size = min(len(data), self.data_size_for_debug)
np.random.shuffle(data)
data = data[:subset_size]

train_num = int(self.tr_frac * len(data))

self.data_dict['train'] = data[:train_num]
Expand Down
40 changes: 27 additions & 13 deletions federatedscope/vertical_fl/loss/binary_cls.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,25 +6,39 @@ class BinaryClsLoss(object):
y = {1, 0}
L = -yln(p)-(1-y)ln(1-p)
"""
def __init__(self, cal_hess=True):
self.cal_hess = cal_hess
def __init__(self, model_type):
self.cal_hess = model_type in ['xgb_tree']
self.cal_sigmoid = model_type in ['xgb_tree', 'gbdt_tree']
self.merged_mode = 'mean' if model_type in ['random_forest'] else 'sum'

def _sigmoid(self, y_pred):
return 1.0 / (1.0 + np.exp(-y_pred))

def _process_y_pred(self, y_pred):
if self.merged_mode == 'mean':
y_pred = np.mean(y_pred, axis=0)
else:
y_pred = np.sum(y_pred, axis=0)

if self.cal_sigmoid:
y_pred = self._sigmoid(y_pred)

return y_pred

def get_metric(self, y, y_pred):
pred_prob = 1.0 / (1.0 + np.exp(-y_pred))
pred_prob[pred_prob >= 0.5] = 1.
pred_prob[pred_prob < 0.5] = 0
acc = np.sum(pred_prob == y) / len(y)
y_pred = self._process_y_pred(y_pred)
y_pred = (y_pred >= 0.5).astype(np.float32)
acc = np.sum(y_pred == y) / len(y)
return {'acc': acc}

def get_loss(self, y, y_pred):
    """
    Compute the mean binary cross-entropy loss
    L = -y * ln(p) - (1 - y) * ln(1 - p)

    Arguments:
        y: the ground-truth labels, y = {1, 0}
        y_pred: the raw predictions; they are merged by
            ``self._process_y_pred`` before the loss is computed
    :returns: the loss averaged over all the instances
    """
    eps = 1e-7  # keeps both logarithms away from log(0)
    y_pred = self._process_y_pred(y_pred)
    # Include the term for negative samples as well; with only
    # -y * ln(p), wrong predictions on instances with y = 0 are never
    # penalized.
    res = np.mean(-y * np.log(y_pred + eps) -
                  (1 - y) * np.log(1 - y_pred + eps))
    return res

def get_grad_and_hess(self, y, pred):
pred = np.asarray(pred)
def get_grad_and_hess(self, y, y_pred):
y_pred = self._process_y_pred(y_pred)
y = np.array(y)
prob = 1.0 / (1.0 + np.exp(-pred))
grad = prob - y
hess = prob * (1.0 - prob) if self.cal_hess else None
grad = y_pred - y
hess = y_pred * (1.0 - y_pred) if self.cal_hess else None
return grad, hess
Loading

0 comments on commit 320a225

Please sign in to comment.