[Feature] Update optimizer, gbm suitable for rank (#15)
* add group in `RektDataset`
* change `set_additional_params`
* update rank
RektPunk authored Aug 10, 2024
1 parent 3c861e0 commit 88f6398
Showing 12 changed files with 170 additions and 49 deletions.
2 changes: 2 additions & 0 deletions examples/README.md
@@ -8,6 +8,8 @@ examples
├── classification
│   ├── binary_classfication.py
│   └── multiclass_classification.py
├── rank
│   ├── basic_rank.py
├── regression
│   ├── basic_regression.py
│   ├── gamma_regression.py
83 changes: 83 additions & 0 deletions examples/rank/basic_rank.py
@@ -0,0 +1,83 @@
# Import necessary libraries
import numpy as np
import pandas as pd

from rektgbm import RektDataset, RektGBM, RektOptimizer

# Generate a synthetic dataset
# 'query_id' simulates groups of queries, and 'relevance' indicates the relevance of the item to the query.
df = pd.DataFrame(
{
"query_id": [
i for i in range(1_000) for j in range(10)
], # 1000 unique queries, each with 10 items
"var1": np.random.random(size=(10_000,)), # Random feature 1
"var2": np.random.random(size=(10_000,)), # Random feature 2
"var3": np.random.random(size=(10_000,)), # Random feature 3
"relevance": list(np.random.permutation([0, 0, 0, 0, 0, 0, 0, 0, 1, 1]))
        * 1_000,  # Two relevant items per query (one shuffled 0/1 pattern, repeated for every query)
}
)

# Generate a test dataset for later evaluation
X_test = pd.DataFrame(
{
"var1": np.random.random(size=(1_000,)), # Random feature 1
"var2": np.random.random(size=(1_000,)), # Random feature 2
"var3": np.random.random(size=(1_000,)), # Random feature 3
}
)

# Split the dataset into training (80%) and validation (20%) sets
train_df = df[:8000] # First 80% of the data
validation_df = df[8000:] # Remaining 20% of the data

# Group sizes (number of items per query) for the ranking task (required for the rank objective)
query_ids_train = train_df.groupby("query_id")["query_id"].count().to_numpy()
X_train = train_df.drop(["query_id", "relevance"], axis=1) # Training features
y_train = train_df["relevance"] # Training labels (relevance scores)

query_ids_validation = validation_df.groupby("query_id")["query_id"].count().to_numpy()
X_validation = validation_df.drop(
["query_id", "relevance"], axis=1
) # Validation features
y_validation = validation_df["relevance"] # Validation labels (relevance scores)

# Create RektDataset objects for training and validation
dtrain = RektDataset(data=X_train, label=y_train, group=query_ids_train)
dvalid = RektDataset(data=X_validation, label=y_validation, group=query_ids_validation)
dtest = RektDataset(data=X_test) # Test dataset does not require group information


# Initialize RektOptimizer for automatic task type, objective, and metric detection
rekt_optimizer = RektOptimizer()

# Alternatively, manually select optimizer settings (commented out)
# rekt_optimizer = RektOptimizer(
# method="both", # Method: options are both (default), lightgbm, xgboost
# task_type="rank", # Type of task: rank
# objective="ndcg", # Objective function: options are lambdarank, ndcg
# metric="map", # Metric: options are ndcg, map
# additional_params={
# "eval_at": 3 # Evaluate model performance at the top 3 ranks, default 5
# }
# )

# Optimize model hyperparameters using the training and validation datasets
rekt_optimizer.optimize_params(
dataset=dtrain,
valid_set=dvalid, # Validation set is necessary for ranking tasks
n_trials=10, # Number of optimization trials (for demonstration; usually, more trials are preferred)
)

# Print the best hyperparameters found during optimization
print(rekt_optimizer.best_params)

# Initialize RektGBM model with the best hyperparameters
rekt_gbm = RektGBM(**rekt_optimizer.best_params)

# Train the model on the training dataset and validate using the validation set
rekt_gbm.fit(dataset=dtrain, valid_set=dvalid)

# Predict on the test dataset using the trained model
preds = rekt_gbm.predict(dataset=dtest)
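
The group arrays built above are per-query item counts, not query identifiers. A small, hedged sanity check that could be appended to the script: each group array must sum to the number of rows in its feature matrix, or the underlying boosters will reject the data.

# Optional sanity check: group sizes must cover every row exactly once.
assert query_ids_train.sum() == len(X_train)  # 800 queries * 10 items = 8_000 rows
assert query_ids_validation.sum() == len(X_validation)  # 200 queries * 10 items = 2_000 rows
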
2 changes: 1 addition & 1 deletion examples/regression/basic_regression.py
@@ -25,7 +25,7 @@
# method="both", # Method: options are both (default), lightgbm, xgboost
# task_type="regression", # Type of task: regression
# objective="rmse", # Objective function: options are rmse, mae
# metric="rmse" # rmse, mae, mape
# metric="rmse" # Metric: options are rmse, mae, mape
# )

# Optimize hyperparameters using the training dataset over a specified number of trials
2 changes: 1 addition & 1 deletion rektgbm/__init__.py
@@ -1,6 +1,6 @@
# flake8: noqa
from rektgbm.dataset import RektDataset
from rektgbm.gbm import RektGBM
from rektgbm.optimizer import RektOptimizer
from rektgbm.rektgbm import RektGBM

__version__ = "0.0.0"
3 changes: 2 additions & 1 deletion rektgbm/dataset.py
@@ -50,6 +50,7 @@ def _train_valid_split(
class RektDataset:
data: XdataLike
label: Optional[YdataLike] = None
group: Optional[YdataLike] = None
reference: Optional["RektDataset"] = None
skip_post_init: bool = False

@@ -89,7 +90,7 @@ def dtrain(self, method: MethodName) -> DataLike:
method=method,
dtype=_TypeName.train_dtype,
)
return train_dtype(data=self.data, label=self.label)
return train_dtype(data=self.data, label=self.label, group=self.group)

def dpredict(self, method: MethodName) -> Union[DataLike, XdataLike]:
predict_dtype = _get_dtype(
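
For context, the new group field is simply forwarded to the backend dataset constructor. A minimal hedged sketch of what that pass-through amounts to, assuming train_dtype resolves to lightgbm.Dataset or xgboost.DMatrix (both accept per-query group sizes):

import lightgbm as lgb
import numpy as np
import xgboost as xgb

X = np.random.random((20, 3))
y = np.random.randint(0, 2, size=20)
group = np.array([10, 10])  # two queries, ten items each; sizes must sum to len(X)

lgb_train = lgb.Dataset(data=X, label=y, group=group)  # LightGBM ranking dataset
xgb_train = xgb.DMatrix(data=X, label=y, group=group)  # XGBoost ranking DMatrix
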
4 changes: 3 additions & 1 deletion rektgbm/engine.py
@@ -6,7 +6,7 @@

from rektgbm.base import BaseGBM, MethodName, StateException
from rektgbm.dataset import RektDataset
from rektgbm.metric import METRIC_DICT_KEY_MAPPER
from rektgbm.metric import METRIC_DICT_KEY_MAPPER, LgbMetricName

_VALID_STR: str = "valid"

@@ -64,6 +64,8 @@ def eval_loss(self) -> float:
metric_str = METRIC_DICT_KEY_MAPPER.get(self.method)
if self.__is_lgb:
_metric_name = self.params.get(metric_str)
if _metric_name in {LgbMetricName.ndcg.value, LgbMetricName.map.value}:
_metric_name = f"{_metric_name}@{self.params['eval_at']}"
return float(self.model.best_score[_VALID_STR][_metric_name])
elif self.__is_xgb:
_metric_name = self.params.get(metric_str)
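
The extra branch is needed because LightGBM records rank metrics per cutoff, so the score is keyed by "ndcg@k" or "map@k" rather than the bare metric name. A hedged, self-contained sketch using the standard LightGBM API (synthetic data; the exact key is an expectation, not taken from the diff):

import lightgbm as lgb
import numpy as np

rng = np.random.default_rng(0)
X, y = rng.random((200, 3)), rng.integers(0, 2, size=200)
group = np.full(20, 10)  # 20 queries of 10 items each

params = {"objective": "lambdarank", "metric": "ndcg", "eval_at": 5, "verbosity": -1}
dtrain = lgb.Dataset(X[:150], label=y[:150], group=group[:15])
dvalid = lgb.Dataset(X[150:], label=y[150:], group=group[15:], reference=dtrain)

booster = lgb.train(params, dtrain, num_boost_round=10,
                    valid_sets=[dvalid], valid_names=["valid"])
print(dict(booster.best_score["valid"]))  # expected key: "ndcg@5", not plain "ndcg"
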
22 changes: 16 additions & 6 deletions rektgbm/rektgbm.py → rektgbm/gbm.py
@@ -32,8 +32,14 @@ def fit(
):
self._task_type = check_task_type(
target=dataset.label,
group=dataset.group,
task_type=self.task_type,
)
if self._task_type == TaskType.rank and valid_set is None:
raise ValueError(
"A validation set must be provided when using the 'rank' task."
)

self.rekt_objective = RektObjective(
task_type=self._task_type,
objective=self.objective,
@@ -50,8 +56,8 @@
if valid_set is not None and self.__is_label_encoder_used:
valid_set.transform_label(label_encoder=self.label_encoder)

_objective = self.rekt_objective.get_objective(method=self.method)
_metric = self.rekt_metric.get_metric(method=self.method)
_objective = self.rekt_objective.get_objective_dict(method=self.method)
_metric = self.rekt_metric.get_metric_dict(method=self.method)
self.params.update({**_objective, **_metric})
self.engine = RektEngine(
method=self.method,
Expand All @@ -62,14 +68,18 @@ def fit(
def predict(self, dataset: RektDataset):
preds = self.engine.predict(dataset=dataset)

if self._task_type == TaskType.multiclass:
if self.method == MethodName.lightgbm:
preds = np.argmax(preds, axis=1).astype(int)
preds = np.around(preds).astype(int)
if self._task_type in {TaskType.regression, TaskType.rank}:
return preds

if self._task_type == TaskType.binary:
preds = np.around(preds).astype(int)

if self._task_type == TaskType.multiclass:
if self.method == MethodName.lightgbm:
preds = np.argmax(preds, axis=1).astype(int)
else:
preds = np.around(preds).astype(int)

if self.__is_label_encoder_used:
preds = self.label_encoder.inverse_transform(series=preds)

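
Rank-task predictions are now returned as raw scores, so ordering items within each query is left to the caller. A hedged usage sketch with hypothetical scores and group sizes standing in for preds and the test-time group array:

import numpy as np

scores = np.array([0.2, 1.3, -0.4, 0.9, 0.1, 0.7])  # hypothetical model output
group_sizes = np.array([3, 3])  # two queries, three candidate items each

offsets = np.concatenate(([0], np.cumsum(group_sizes)))
for start, stop in zip(offsets[:-1], offsets[1:]):
    order = np.argsort(-scores[start:stop])  # highest score ranks first
    print(order + start)                     # row indices in ranked order
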
2 changes: 1 addition & 1 deletion rektgbm/metric.py
@@ -185,7 +185,7 @@ def __post_init__(self) -> None:
def get_metric_str(self, method: MethodName) -> str:
return self._metric_engine_mapper.get(method)

def get_metric(self, method: MethodName) -> Dict[str, str]:
def get_metric_dict(self, method: MethodName) -> Dict[str, str]:
return {METRIC_DICT_KEY_MAPPER.get(method): self.get_metric_str(method=method)}

def __validate_metric(self) -> None:
2 changes: 1 addition & 1 deletion rektgbm/objective.py
@@ -138,7 +138,7 @@ def __post_init__(self) -> None:
def get_objective_str(self, method: MethodName) -> str:
return self._objective_engine_mapper.get(method)

def get_objective(self, method: MethodName) -> Dict[str, str]:
def get_objective_dict(self, method: MethodName) -> Dict[str, str]:
return {OBJECTIVE_DICT_KEY: self.get_objective_str(method=method)}

def __validate_objective(self) -> None:
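
Both renames make the return type explicit: the helpers yield one-entry dicts keyed by the engine-specific parameter name, which the caller merges straight into the booster params (as RektGBM.fit does above). A hedged sketch of that merge; the literal key strings are assumptions for the LightGBM case, not taken from the diff:

objective_dict = {"objective": "lambdarank"}  # shape of get_objective_dict(method=lightgbm)
metric_dict = {"metric": "ndcg"}              # shape of get_metric_dict(method=lightgbm)

params = {"learning_rate": 0.05, "num_leaves": 31}
params.update({**objective_dict, **metric_dict})  # mirrors params.update({**_objective, **_metric})
print(params)
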
39 changes: 24 additions & 15 deletions rektgbm/optimizer.py
@@ -52,8 +52,9 @@ def optimize_params(
n_trials: int,
valid_set: Optional[RektDataset] = None,
) -> Dict[str, Any]:
self._task_type = check_task_type(
self._task_type: TaskType = check_task_type(
target=dataset.label,
group=dataset.group,
task_type=self.task_type,
)
self.rekt_objective = RektObjective(
@@ -74,35 +75,42 @@
if self.rekt_objective.objective == ObjectiveName.multiclass
else None
)
if valid_set is not None and self.__is_label_encoder_used:
valid_set.transform_label(label_encoder=_label_encoder)
elif valid_set is None:
if valid_set is None:
if self._task_type == TaskType.rank:
raise ValueError(
"A validation set must be provided when using the 'rank' task."
)
dataset, valid_set = dataset.split()
else:
if self.__is_label_encoder_used:
valid_set.transform_label(label_encoder=_label_encoder)

self.studies: Dict[MethodName, optuna.Study] = {}
for method, param in zip(self.method, self.params):
_addtional_params = set_additional_params(
objective=self.rekt_objective.objective,
metric=self.rekt_metric.get_metric_str(method=method),
method=method,
params=self.additional_params,
num_class=self.num_class,
)
_objective = self.rekt_objective.get_objective_dict(method=method)
_metric = self.rekt_metric.get_metric_dict(method=method)

def _study_func(trial: optuna.Trial) -> float:
_param = param(trial=trial)
_objective = self.rekt_objective.get_objective(method=method)
_metric = self.rekt_metric.get_metric(method=method)
_addtional_params = set_additional_params(
objective=self.rekt_objective.objective,
method=method,
params=self.additional_params,
num_class=self.num_class,
)
_param.update({**_objective, **_metric, **_addtional_params})

_engine = RektEngine(
params=_param,
method=method,
)
_engine.fit(dataset=dataset, valid_set=valid_set)
return _engine.eval_loss

_direction = "maximize" if self._task_type == TaskType.rank else "minimize"
study = optuna.create_study(
study_name=f"Rekt_{method.value}",
direction="minimize",
direction=_direction,
load_if_exists=True,
)
study.optimize(_study_func, n_trials=n_trials)
@@ -116,9 +124,10 @@ def best_params(self) -> Dict[str, Any]:
best_study = self.studies.get(best_method)
_best_params = best_study.best_params
_addtional_params = set_additional_params(
params=self.additional_params,
objective=self.rekt_objective.objective,
method=best_method,
params=self.additional_params,
metric=self.rekt_metric.get_metric_str(method=best_method),
num_class=self.num_class,
)
_best_params.update({**_addtional_params})
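
The study direction now depends on the task: ndcg and map are higher-is-better, so rank studies maximize while everything else keeps minimizing. A small hedged illustration with Optuna and a dummy objective:

import optuna

task_type = "rank"  # hypothetical stand-in for TaskType.rank
direction = "maximize" if task_type == "rank" else "minimize"

study = optuna.create_study(study_name="Rekt_lightgbm", direction=direction)
study.optimize(lambda trial: trial.suggest_float("x", 0.0, 1.0), n_trials=3)  # dummy objective
print(study.direction, study.best_value)
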
33 changes: 22 additions & 11 deletions rektgbm/param.py
@@ -3,6 +3,7 @@
from optuna import Trial

from rektgbm.base import MethodName
from rektgbm.metric import MetricName
from rektgbm.objective import ObjectiveName


@@ -43,21 +44,31 @@ def get_xgb_params(trial: Trial) -> Dict[str, Union[float, int]]:


def set_additional_params(
params: Dict[str, Any],
objective: ObjectiveName,
metric: str,
method: MethodName,
params: Dict[str, Any],
num_class: Optional[int],
) -> Dict[str, Any]:
_params = params.copy()
if objective == ObjectiveName.quantile:
if method == MethodName.lightgbm and "quantile_alpha" in params.keys():
params["alpha"] = params.pop("quantile_alpha")
elif method == MethodName.xgboost and "alpha" in params.keys():
params["quantile_alpha"] = params.pop("alpha")
if method == MethodName.lightgbm and "quantile_alpha" in _params.keys():
_params["alpha"] = _params.pop("quantile_alpha")
elif method == MethodName.xgboost and "alpha" in _params.keys():
_params["quantile_alpha"] = _params.pop("alpha")
elif objective == ObjectiveName.huber:
if method == MethodName.lightgbm and "huber_slope" in params.keys():
            params["alpha"] = params.pop("huber_slope")
elif method == MethodName.xgboost and "alpha" in params.keys():
params["huber_slope"] = params.pop("alpha")
if method == MethodName.lightgbm and "huber_slope" in _params.keys():
            _params["alpha"] = _params.pop("huber_slope")
elif method == MethodName.xgboost and "alpha" in _params.keys():
_params["huber_slope"] = _params.pop("alpha")
elif objective == ObjectiveName.multiclass:
params["num_class"] = num_class
return params
_params["num_class"] = num_class

if metric in {MetricName.ndcg.value, MetricName.map.value}:
_eval_at_defalut: int = 5
_eval_at = _params.pop("eval_at", _eval_at_defalut)
if method == MethodName.xgboost:
_params["eval_metric"] = f"{metric}@{_eval_at}"
elif method == MethodName.lightgbm:
_params["eval_at"] = _eval_at
return _params
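
The new branch normalizes the rank-metric cutoff: LightGBM keeps a separate eval_at parameter, while XGBoost folds the cutoff into the metric name. A standalone hedged sketch of that behaviour (rank_metric_params is a hypothetical helper, not the library function):

def rank_metric_params(metric: str, method: str, params: dict) -> dict:
    out = dict(params)
    if metric in {"ndcg", "map"}:
        eval_at = out.pop("eval_at", 5)  # default cutoff mirrors the diff
        if method == "xgboost":
            out["eval_metric"] = f"{metric}@{eval_at}"  # e.g. "ndcg@5"
        elif method == "lightgbm":
            out["eval_at"] = eval_at
    return out

print(rank_metric_params("ndcg", "lightgbm", {}))            # {'eval_at': 5}
print(rank_metric_params("map", "xgboost", {"eval_at": 3}))  # {'eval_metric': 'map@3'}
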
25 changes: 14 additions & 11 deletions rektgbm/task.py
@@ -27,19 +27,22 @@ class SklearnTaskType(BaseEnum):

def check_task_type(
target: YdataLike,
group: Optional[YdataLike],
task_type: Optional[str],
) -> TaskType:
if group is not None:
return TaskType.rank

_type_inferred: str = type_of_target(y=target)
_sklearn_task_type = SklearnTaskType.get(_type_inferred)
_task_types = SKLEARN_TASK_TYPE_MAPPER.get(_sklearn_task_type)
if task_type is not None:
_user_defined_task_type = TaskType.get(task_type)
if _user_defined_task_type not in _task_types:
            raise ValueError(
                "The provided 'task_type' does not match the inferred task type. "
                f"Expected one of '{[_.value for _ in _task_types]}'."
)
_task_type = _user_defined_task_type
else:
_task_type = _task_types[0]
return _task_type
if task_type is None:
return _task_types[0]

_user_defined_task_type = TaskType.get(task_type)
if _user_defined_task_type not in _task_types:
        raise ValueError(
            "The provided 'task_type' does not match the inferred task type. "
            f"Expected one of '{[_.value for _ in _task_types]}'."
)
return _user_defined_task_type
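
With the new group argument, task inference short-circuits to ranking whenever group sizes are supplied, before type_of_target is consulted. A hedged usage sketch, assuming TaskType and check_task_type are importable from rektgbm.task as the diff suggests:

from rektgbm.task import TaskType, check_task_type

# Any non-None group forces the rank task.
assert check_task_type(target=[0, 1, 0, 1], group=[2, 2], task_type=None) == TaskType.rank

# Without a group, the target's type drives the inference as before.
print(check_task_type(target=[0, 1, 0, 1], group=None, task_type=None))
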
