Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Feature/metadata routing default #164

Merged
merged 4 commits into from
Mar 27, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 4 additions & 5 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -153,16 +153,16 @@ full_pipe = make_meta_pipeline(preproc_pipe, crossval, pred_rud, ens, neut)
# Train
X, y = df.get_feature_target_pair(multi_target=False)
y_int = (y * 4).astype(int)
eras = df.get_era_data
era_series = df.get_era_data
features = df.get_feature_data
full_pipe.fit(X, y_int, numeraiensemble__eras=eras)
full_pipe.fit(X, y_int, era_series=era_series)

# Evaluate
val_df = create_numerframe("data/train_val/validation_int8.parquet")
val_X, _ = val_df.get_feature_target_pair(multi_target=False)
val_eras = val_df.get_era_data
val_features = val_df.get_feature_data
val_df['prediction'] = full_pipe.predict(val_X, eras=val_eras, features=val_features)
val_df['prediction'] = full_pipe.predict(val_X, era_series=val_eras, features=val_features)
val_df['example_preds'] = ExamplePredictions("v4.3/validation_example_preds.parquet").fit_transform(None)['prediction'].values
evaluator = NumeraiClassicEvaluator()
metrics = evaluator.full_evaluation(val_df,
Expand All @@ -176,7 +176,7 @@ live_df = create_numerframe(file_path="data/current_round/live_int8.parquet")
live_X, live_y = live_df.get_feature_target_pair(multi_target=False)
live_eras = live_df.get_era_data
live_features = live_df.get_feature_data
preds = full_pipe.predict(live_X, eras=live_eras, features=live_features)
preds = full_pipe.predict(live_X, era_series=live_eras, features=live_features)

# Submit
NUMERAI_PUBLIC_ID = "YOUR_PUBLIC_ID"
Expand All @@ -196,7 +196,6 @@ downloader.remove_base_directory()
submitter.remove_base_directory()
```


## 4. Contributing

Be sure to read the [How To Contribute section](https://crowdcent.github.io/numerblox/contributing/) in the documentation for detailed instructions on
Expand Down
6 changes: 0 additions & 6 deletions docs/end_to_end.md
Original file line number Diff line number Diff line change
Expand Up @@ -56,9 +56,7 @@ preproc_pipe = make_union(gpp, fncv3_selector)
xgb = XGBRegressor()
cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5))
ens = NumeraiEnsemble()
ens.set_transform_request(era_series=True)
fn = FeatureNeutralizer(proportion=0.5)
fn.set_predict_request(era_series=True, features=True)
full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn)

# Train full model
Expand Down Expand Up @@ -87,9 +85,7 @@ model = DecisionTreeClassifier()
crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba')
pred_rud = PredictionReducer(n_models=3, n_classes=5)
ens2 = NumeraiEnsemble(donate_weighted=True)
ens2.set_transform_request(era_series=True)
neut2 = FeatureNeutralizer(proportion=0.5)
neut2.set_predict_request(era_series=True, features=True)
full_pipe = make_meta_pipeline(preproc_pipe, crossval1, pred_rud, ens2, neut2)

full_pipe.fit(X, y, era_series=era_series)
Expand Down Expand Up @@ -121,9 +117,7 @@ for i in range(3):

models = make_column_transformer(*[(pipe, features.columns.tolist()) for pipe in pipes])
ens_end = NumeraiEnsemble()
ens_end.set_transform_request(era_series=True)
neut = FeatureNeutralizer(proportion=0.5)
neut.set_predict_request(era_series=True, features=True)
full_pipe = make_meta_pipeline(models, ens_end, neut)

full_pipe.fit(X, y, era_series=era_series)
Expand Down
8 changes: 4 additions & 4 deletions docs/index.md
Original file line number Diff line number Diff line change
Expand Up @@ -145,10 +145,10 @@ df = create_numerframe(file_path="data/train_val/train_int8.parquet")
val_df = create_numerframe(file_path="data/train_val/validation_int8.parquet")

X, y = df.get_feature_target_pair()
eras = df.get_era_data()
train_eras = df.get_era_data

val_X, val_y = val_df.get_feature_target_pair()
val_eras = val_df.get_era_data()
val_eras = val_df.get_era_data

fncv3_cols = nf.get_fncv3_features.columns.tolist()

Expand All @@ -164,9 +164,9 @@ ensembler = NumeraiEnsemble(donate_weighted=True)

full_pipe = make_pipeline(preproc_pipe, model, ensembler)

full_pipe.fit(X, y, numeraiensemble__eras=eras)
full_pipe.fit(X, y, era_series=train_eras)

val_preds = full_pipe.predict(val_X, eras=val_eras)
val_preds = full_pipe.predict(val_X, era_series=val_eras)
```

### 3.4. Evaluation
Expand Down
9 changes: 3 additions & 6 deletions docs/postprocessing.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,9 +26,8 @@ feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
era_data = pd.Series([1, 1, 2])

neutralizer = FeatureNeutralizer(pred_name="prediction", proportion=0.5)
neutralizer.set_predict_request(era_series=True, features=True)
neutralizer.fit()
neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, eras=era_data)
neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data)
```

Multiple column neutralization:
Expand All @@ -41,9 +40,8 @@ feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
era_data = pd.Series([1, 1, 2])

neutralizer = FeatureNeutralizer(pred_name=["prediction1", "prediction2"], proportion=[0.5, 0.7])
neutralizer.set_predict_request(era_series=True, features=True)
neutralizer.fit()
neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, eras=era_data)
neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data)
```

## FeaturePenalizer
Expand All @@ -66,7 +64,6 @@ feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
era_data = pd.Series([1, 1, 2])

penalizer = FeaturePenalizer(max_exposure=0.1, pred_name="prediction")
penalizer.set_predict_request(era_series=True, features=True)
penalizer.fit(X=predictions)
penalized_predictions = penalizer.predict(X=predictions, features=feature_data, eras=era_data)
penalized_predictions = penalizer.predict(X=predictions, features=feature_data, era_series=era_data)
```
3 changes: 0 additions & 3 deletions docs/preprocessing.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,6 @@ Using `.transform` requires passing `era_series`. This is because the quantiles
from numerblox.preprocessing import EraQuantileProcessor

eq_processor = EraQuantileProcessor(num_quantiles=50, random_state=42)
eq_processor.set_transform_request(era_series=True)
transformed_data = eq_processor.fit_transform(X, era_series=eras_series)
```

Expand Down Expand Up @@ -86,7 +85,6 @@ Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step.
from numerblox.preprocessing import LagPreProcessor

lag_processor = LagPreProcessor(windows=[5, 10, 20])
lag_processor.set_transform_request(ticker_series=True)
lag_processor.fit(X)
lagged_data = lag_processor.transform(X, ticker_series=tickers_series)

Expand All @@ -105,7 +103,6 @@ from sklearn.pipeline import make_pipeline
from numerblox.preprocessing import DifferencePreProcessor

lag = LagPreProcessor(windows=[5, 10])
lag.set_transform_request(ticker_series=True)
diff = DifferencePreProcessor(windows=[5, 10], pct_diff=True)
pipe = make_pipeline(lag, diff)
pipe.set_output(transform="pandas")
Expand Down
36 changes: 14 additions & 22 deletions examples/end_to_end.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@
"from xgboost import XGBRegressor\n",
"from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n",
"from sklearn.model_selection import TimeSeriesSplit\n",
"from sklearn.pipeline import make_pipeline, make_union\n",
"from sklearn.pipeline import make_pipeline\n",
"from sklearn.compose import make_column_transformer, ColumnTransformer\n",
"from numerblox.preprocessing import GroupStatsPreProcessor\n",
"from numerblox.meta import CrossValEstimator, make_meta_pipeline\n",
Expand Down Expand Up @@ -635,10 +635,7 @@
"xgb = DecisionTreeRegressor()\n",
"cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5))\n",
"ens = NumeraiEnsemble(donate_weighted=True)\n",
"ens.set_transform_request(era_series=True)\n",
"ens.set_predict_request(era_series=True)\n",
"fn = FeatureNeutralizer(proportion=0.5)\n",
"fn.set_predict_request(era_series=True, features=True)\n",
"full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn)\n",
"full_pipe"
]
Expand All @@ -662,17 +659,17 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 25575.02it/s]\n"
"Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 26886.56it/s]\n"
]
},
{
"data": {
"text/plain": [
"array([[0.30403528],\n",
" [0.64598246],\n",
" [0.29753909],\n",
" [0.57058209],\n",
" [0.48907478]])"
"array([[0.28655201],\n",
" [0.63724474],\n",
" [0.27848242],\n",
" [0.55815509],\n",
" [0.47477194]])"
]
},
"execution_count": 8,
Expand Down Expand Up @@ -714,9 +711,7 @@
"crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba')\n",
"pred_rud = PredictionReducer(n_models=3, n_classes=5)\n",
"ens2 = NumeraiEnsemble(donate_weighted=True)\n",
"ens2.set_transform_request(era_series=True)\n",
"neut2 = FeatureNeutralizer(proportion=0.5)\n",
"neut2.set_predict_request(era_series=True, features=True)\n",
"full_pipe = make_meta_pipeline(preproc_pipe, crossval1, pred_rud, ens2, neut2)"
]
},
Expand Down Expand Up @@ -1766,17 +1761,17 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 9892.23it/s]\n"
"Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 1893.59it/s]\n"
]
},
{
"data": {
"text/plain": [
"array([[0.29059154],\n",
" [0.64786053],\n",
" [0.28163789],\n",
" [0.56881549],\n",
" [0.48695533]])"
"array([[0.27212312],\n",
" [0.61574058],\n",
" [0.2635116 ],\n",
" [0.53971591],\n",
" [0.46098369]])"
]
},
"execution_count": 12,
Expand Down Expand Up @@ -1819,10 +1814,7 @@
"\n",
"models = make_column_transformer(*[(pipe, features.columns.tolist()) for pipe in pipes])\n",
"ens_end = NumeraiEnsemble()\n",
"ens_end.set_transform_request(era_series=True)\n",
"ens_end.set_predict_request(era_series=True)\n",
"neut = FeatureNeutralizer(proportion=0.5)\n",
"neut.set_predict_request(era_series=True, features=True)\n",
"full_pipe = make_meta_pipeline(models, ens_end, neut)"
]
},
Expand Down Expand Up @@ -2468,7 +2460,7 @@
"name": "stderr",
"output_type": "stream",
"text": [
"Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 8701.88it/s]\n"
"Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 11214.72it/s]\n"
]
},
{
Expand Down
14 changes: 6 additions & 8 deletions examples/numerai_pipeline.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -32,18 +32,16 @@
}
],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"from sklearn.pipeline import Pipeline, make_pipeline, make_union,FeatureUnion\n",
"import pandas as pd\n",
"import plotly.express as px\n",
"from sklearn.pipeline import make_pipeline, make_union\n",
"from sklearn.preprocessing import StandardScaler, MinMaxScaler, OneHotEncoder\n",
"from sklearn.linear_model import ElasticNet\n",
"from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier\n",
"from sklearn.compose import ColumnTransformer, make_column_transformer\n",
"from sklearn.ensemble import GradientBoostingRegressor\n",
"from sklearn.compose import make_column_transformer\n",
"from sklego.preprocessing import ColumnSelector\n",
"\n",
"import plotly.express as px\n",
"from numerblox.preprocessing import ReduceMemoryProcessor, GroupStatsPreProcessor, V4_2_FEATURE_GROUP_MAPPING\n",
"from numerblox.preprocessing import GroupStatsPreProcessor, V4_2_FEATURE_GROUP_MAPPING\n",
"from numerblox.meta import MetaEstimator\n",
"from numerblox.neutralizers import FeatureNeutralizer\n",
"\n",
Expand Down
2 changes: 2 additions & 0 deletions numerblox/ensemble.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ class NumeraiEnsemble(BaseEstimator, TransformerMixin):
"""
def __init__(self, weights=None, donate_weighted=False):
sklearn.set_config(enable_metadata_routing=True)
self.set_transform_request(era_series=True)
self.set_predict_request(era_series=True)
super().__init__()
self.weights = weights
if self.weights and sum(self.weights) != 1:
Expand Down
3 changes: 3 additions & 0 deletions numerblox/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
import pandas as pd
from xgboost import XGBRegressor
import sklearn
from sklearn.utils.validation import check_is_fitted

from .evaluation import NumeraiClassicEvaluator
Expand All @@ -22,6 +23,8 @@ class EraBoostedXGBRegressor(XGBRegressor):
:param num_iters: Number of total era boosting iterations.
"""
def __init__(self, proportion=0.5, trees_per_step=10, num_iters=200, **xgb_params):
sklearn.set_config(enable_metadata_routing=True)
self.set_fit_request(era_series=True)
super().__init__(**xgb_params)
if not self.n_estimators:
self.n_estimators = 100
Expand Down
2 changes: 2 additions & 0 deletions numerblox/neutralizers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,8 @@ class BaseNeutralizer(BaseEstimator, TransformerMixin):
# Store the output column names and set up default metadata-routing requests.
# :param new_col_names: list stored unchanged on `self.new_col_names`.
def __init__(self, new_col_names: list):
self.new_col_names = new_col_names
# NOTE: `sklearn.set_config` modifies scikit-learn's global configuration,
# so constructing this object has an effect beyond the instance itself.
sklearn.set_config(enable_metadata_routing=True)
# Request `features` and `era_series` for both `transform` and `predict`
# by default, so callers do not have to issue these requests per instance.
# NOTE(review): presumably these calls require routing to be enabled first,
# hence they follow `set_config` -- confirm before reordering.
self.set_transform_request(features=True, era_series=True)
self.set_predict_request(features=True, era_series=True)
super().__init__()

def fit(self, X=None, y=None):
Expand Down
1 change: 0 additions & 1 deletion numerblox/numerframe.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,5 @@
import pandas as pd
from pathlib import Path
from datetime import date, timedelta
from typing import Union, Tuple, Any, List
from numerai_era_data.date_utils import (ERA_ONE_START, get_current_era,
get_current_date, get_era_for_date,
Expand Down
10 changes: 7 additions & 3 deletions numerblox/penalizers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,10 @@
import scipy
from abc import abstractmethod
from typing import Union
import numpy as np
import pandas as pd
from typing import Union
from tqdm.auto import tqdm
from abc import abstractmethod
import sklearn
from sklearn.base import BaseEstimator, TransformerMixin

try:
Expand All @@ -19,6 +20,9 @@ class BasePenalizer(BaseEstimator, TransformerMixin):
:param new_col_name: Name of new neutralized column.
"""
# Set up default metadata-routing requests and store the output column name.
# :param new_col_name: string stored unchanged on `self.new_col_name`.
def __init__(self, new_col_name: str):
# NOTE: `sklearn.set_config` modifies scikit-learn's global configuration,
# so constructing this object has an effect beyond the instance itself.
sklearn.set_config(enable_metadata_routing=True)
# Request `features` and `era_series` for both `transform` and `predict`
# by default, so callers do not have to issue these requests per instance.
# NOTE(review): presumably these calls require routing to be enabled first,
# hence they follow `set_config` -- confirm before reordering.
self.set_transform_request(features=True, era_series=True)
self.set_predict_request(features=True, era_series=True)
self.new_col_name = new_col_name
super().__init__()

Expand All @@ -28,7 +32,7 @@ def fit(self, X=None, y=None):
@abstractmethod
def transform(
self, X: Union[np.array, pd.DataFrame],
features: pd.DataFrame, eras: pd.Series
features: pd.DataFrame, era_series: pd.Series
) -> np.array:
...

Expand Down
10 changes: 7 additions & 3 deletions numerblox/preprocessing/signals.py
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,8 @@ def __init__(
self.quantiler = QuantileTransformer(
n_quantiles=self.num_quantiles, random_state=self.random_state
)
# Metadata routing
self.set_transform_request(era_series=True)

def _quantile_transform(self, group_data: pd.Series) -> pd.Series:
"""
Expand Down Expand Up @@ -305,9 +307,9 @@ def process_feature(feature):
output_df = pd.concat(output_series_list, axis=1)
return output_df.to_numpy()

def fit_transform(self, X: Union[np.array, pd.DataFrame], eras: pd.Series):
self.fit(X=X, eras=eras)
return self.transform(X=X, eras=eras)
def fit_transform(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series):
self.fit(X=X, era_series=era_series)
return self.transform(X=X, era_series=era_series)

def get_feature_names_out(self, input_features=None) -> List[str]:
"""Return feature names."""
Expand Down Expand Up @@ -376,6 +378,8 @@ class LagPreProcessor(BasePreProcessor):
# Configure the lag windows and request `ticker_series` for `transform`.
# :param windows: list of window sizes; any falsy value (None or an empty
# list) falls back to the default [5, 10, 15, 20].
def __init__(self, windows: list = None,):
super().__init__()
self.windows = windows if windows else [5, 10, 15, 20]
# Metadata routing
# `transform` takes a `ticker_series` argument, so request it by default
# instead of requiring callers to call `set_transform_request` themselves.
self.set_transform_request(ticker_series=True)

def transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series) -> np.array:
X = pd.DataFrame(X)
Expand Down
5 changes: 4 additions & 1 deletion numerblox/targets.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from typing import List, Union
from abc import abstractmethod
from scipy.stats import rankdata
import sklearn
from sklearn.linear_model import Ridge
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.mixture import BayesianGaussianMixture
Expand All @@ -16,7 +17,8 @@ class BaseTargetProcessor(BaseEstimator, TransformerMixin):
"""Common functionality for preprocessors and postprocessors."""

def __init__(self):
...
sklearn.set_config(enable_metadata_routing=True)
self.set_transform_request(era_series=True)

def fit(self, X, y=None):
self.is_fitted_ = True
Expand Down Expand Up @@ -45,6 +47,7 @@ def __init__(
self,
n_components: int = 3,
):
self.set_fit_request(era_series=True)
super().__init__()
self.n_components = n_components
self.ridge = Ridge(fit_intercept=False)
Expand Down
Loading
Loading