diff --git a/docs/end_to_end.md b/docs/end_to_end.md
index 5d44aa2..0928865 100644
--- a/docs/end_to_end.md
+++ b/docs/end_to_end.md
@@ -56,14 +56,16 @@ preproc_pipe = make_union(gpp, fncv3_selector)
xgb = XGBRegressor()
cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5))
ens = NumeraiEnsemble()
+ens.set_transform_request(era_series=True)
fn = FeatureNeutralizer(proportion=0.5)
+fn.set_predict_request(era_series=True, features=True)
full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn)

# Train full model
-full_pipe.fit(X, y, numeraiensemble__eras=eras);
+full_pipe.fit(X, y, era_series=era_series);

# Inference on validation data
-val_preds = full_pipe.predict(val_X, eras=val_eras, features=val_features)
+val_preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features)
```

## 2. Multi Classification Ensemble

@@ -85,12 +87,14 @@ model = DecisionTreeClassifier()
crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba')
pred_rud = PredictionReducer(n_models=3, n_classes=5)
ens2 = NumeraiEnsemble(donate_weighted=True)
+ens2.set_transform_request(era_series=True)
neut2 = FeatureNeutralizer(proportion=0.5)
+neut2.set_predict_request(era_series=True, features=True)
full_pipe = make_meta_pipeline(preproc_pipe, crossval1, pred_rud, ens2, neut2)

-full_pipe.fit(X, y, numeraiensemble__eras=eras)
+full_pipe.fit(X, y, era_series=era_series)

-preds = full_pipe.predict(val_X, eras=val_eras, features=val_features)
+preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features)
```

## 3. Ensemble of ensembles of regressors

@@ -107,6 +111,7 @@ from numerblox.meta import CrossValEstimator, make_meta_pipeline
from numerblox.ensemble import NumeraiEnsemble
from numerblox.neutralizers import FeatureNeutralizer
+
pipes = []
for i in range(3):
    model = DecisionTreeRegressor()
@@ -116,12 +121,12 @@ for i in range(3):

models = make_column_transformer(*[(pipe, features.columns.tolist()) for pipe in pipes])
ens_end = NumeraiEnsemble()
+ens_end.set_transform_request(era_series=True)
neut = FeatureNeutralizer(proportion=0.5)
+neut.set_predict_request(era_series=True, features=True)
full_pipe = make_meta_pipeline(models, ens_end, neut)

-full_pipe.fit(X, y,
-              columntransformer__eras=eras,
-              numeraiensemble__eras=eras)
+full_pipe.fit(X, y, era_series=era_series)

-preds = full_pipe.predict(val_X, eras=val_eras, features=val_features)
+preds = full_pipe.predict(val_X, era_series=val_eras, features=val_features)
```

diff --git a/docs/models.md b/docs/models.md
index b5cbe60..c6abe5b 100644
--- a/docs/models.md
+++ b/docs/models.md
@@ -16,7 +16,7 @@ Make sure to include the era column as a `pd.Series` in the `fit` method.
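For example (here `X_train`, `y_train`, and `eras_train` are assumed to be your pre-split training data with an aligned era `pd.Series`, and `X_live` the live feature set):

```py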
from numerblox.models import EraBoostedXGBRegressor

model = EraBoostedXGBRegressor(proportion=0.5, trees_per_step=10, num_iters=20)
-model.fit(X=X_train, y=y_train, eras=eras_train)
+model.fit(X=X_train, y=y_train, era_series=eras_train)
predictions = model.predict(X_live)
```
\ No newline at end of file
diff --git a/docs/postprocessing.md b/docs/postprocessing.md
index 0af110e..aa9cc31 100644
--- a/docs/postprocessing.md
+++ b/docs/postprocessing.md
@@ -26,6 +26,7 @@ feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
era_data = pd.Series([1, 1, 2])

neutralizer = FeatureNeutralizer(pred_name="prediction", proportion=0.5)
+neutralizer.set_predict_request(era_series=True, features=True)
neutralizer.fit()
-neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, eras=era_data)
+neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data)
```
@@ -40,6 +41,7 @@ feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
era_data = pd.Series([1, 1, 2])

neutralizer = FeatureNeutralizer(pred_name=["prediction1", "prediction2"], proportion=[0.5, 0.7])
+neutralizer.set_predict_request(era_series=True, features=True)
neutralizer.fit()
-neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, eras=era_data)
+neutralized_predictions = neutralizer.predict(X=predictions, features=feature_data, era_series=era_data)
```
@@ -64,6 +66,7 @@ feature_data = pd.DataFrame([[0.1, 0.2], [0.3, 0.4], [0.5, 0.6]])
era_data = pd.Series([1, 1, 2])

penalizer = FeaturePenalizer(max_exposure=0.1, pred_name="prediction")
+penalizer.set_predict_request(era_series=True, features=True)
penalizer.fit(X=predictions)
-penalized_predictions = penalizer.predict(X=predictions, features=feature_data, eras=era_data)
+penalized_predictions = penalizer.predict(X=predictions, features=feature_data, era_series=era_data)
```
diff --git a/docs/preprocessing.md b/docs/preprocessing.md
index 626e3be..c0bed06 100644
--- a/docs/preprocessing.md
+++ b/docs/preprocessing.md
@@ -52,13 +52,14 @@ enhanced_data = feature_gen.fit_transform(dataf)

`EraQuantileProcessor` transforms features into quantiles by era. This can help normalize data and make patterns more distinguishable. Quantiling operations are parallelized across features for faster processing.

-Using `.transform` requires passing the era column as a `pd.Series`. This is because the quantiles are calculated per era so it needs that information along with the raw input features.
+Using `.transform` requires passing `era_series`. Quantiles are calculated per era, so the transformer needs that information alongside the raw input features.

```py
from numerblox.preprocessing import EraQuantileProcessor

eq_processor = EraQuantileProcessor(num_quantiles=50, random_state=42)
-transformed_data = eq_processor.fit_transform(X, eras=eras_series)
+eq_processor.set_transform_request(era_series=True)
+transformed_data = eq_processor.fit_transform(X, era_series=eras_series)
```

### TickerMapper

@@ -79,14 +80,15 @@ mapped_data = ticker_mapper.transform(dataf["ticker"])

`LagPreProcessor` generates lag features based on specified windows. Lag features can capture temporal patterns in time-series data.

-Note that `LagPreProcessor` needs a series of `tickers` in the `.transform` step.
+Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step.
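The `ticker_series` argument is passed through scikit-learn's metadata routing, which is why the example below first declares the request with `set_transform_request(ticker_series=True)`. Metadata routing is a scikit-learn >= 1.3 feature; NumerBlox preprocessors enable it when constructed, but it can also be switched on explicitly (a minimal sketch, nothing NumerBlox-specific):

```py
import sklearn

# Enable metadata routing so that set_transform_request / set_predict_request
# can declare custom arguments such as era_series and ticker_series.
sklearn.set_config(enable_metadata_routing=True)
```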
```py
from numerblox.preprocessing import LagPreProcessor

lag_processor = LagPreProcessor(windows=[5, 10, 20])
+lag_processor.set_transform_request(ticker_series=True)
lag_processor.fit(X)
-lagged_data = lag_processor.transform(X, tickers=tickers_series)
+lagged_data = lag_processor.transform(X, ticker_series=tickers_series)
```

@@ -96,18 +98,19 @@ lagged_data = lag_processor.transform(X, tickers=tickers_series)

### DifferencePreProcessor

WARNING: `DifferencePreProcessor` works only on `pd.DataFrame` and with columns that are generated in `LagPreProcessor`. If you are using these in a Pipeline, make sure `LagPreProcessor` is defined before `DifferencePreProcessor` and that the output API is set to Pandas (`pipeline.set_output(transform="pandas")`).

-Note that `LagPreProcessor` needs a series of `tickers` in the `.transform` step so a pipeline with both preprocessors will need a `tickers` argument in `.transform`.
+Note that `LagPreProcessor` needs a `ticker_series` in the `.transform` step, so a pipeline with both preprocessors will need a `ticker_series` argument in `.transform`.

```py
from sklearn.pipeline import make_pipeline
from numerblox.preprocessing import DifferencePreProcessor

lag = LagPreProcessor(windows=[5, 10])
+lag.set_transform_request(ticker_series=True)
diff = DifferencePreProcessor(windows=[5, 10], pct_diff=True)
pipe = make_pipeline(lag, diff)
pipe.set_output(transform="pandas")
pipe.fit(X)
-diff_data = pipe.transform(X, tickers=tickers_series)
+diff_data = pipe.transform(X, ticker_series=tickers_series)
```

### PandasTaFeatureGenerator

diff --git a/examples/end_to_end.ipynb b/examples/end_to_end.ipynb
index 7ea6002..3a3c814 100644
--- a/examples/end_to_end.ipynb
+++ b/examples/end_to_end.ipynb
@@ -18,13 +18,22 @@
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
-  "outputs": [],
+  "outputs": [
+   {
+    "name": "stderr",
+    "output_type": "stream",
+    "text": [
+     "/Users/clepelaars/miniconda3/envs/numerai-classic-numerbloxv1/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+     "  from .autonotebook import tqdm as notebook_tqdm\n"
+    ]
+   }
+  ],
   "source": [
    "from xgboost import XGBRegressor\n",
    "from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor\n",
    "from sklearn.model_selection import TimeSeriesSplit\n",
    "from sklearn.pipeline import make_pipeline, make_union\n",
-   "from sklearn.compose import make_column_transformer\n",
+   "from sklearn.compose import make_column_transformer, ColumnTransformer\n",
    "from numerblox.preprocessing import GroupStatsPreProcessor\n",
    "from numerblox.meta import CrossValEstimator, make_meta_pipeline\n",
    "from numerblox.ensemble import NumeraiEnsemble, PredictionReducer\n",
    "from numerblox.neutralizers import FeatureNeutralizer\n",
@@ -57,7 +66,7 @@
   "source": [
    "X, y = df.get_feature_target_pair(multi_target=False)\n",
    "fncv3_cols = df.get_fncv3_feature_data.columns.tolist()\n",
-   "eras = df.get_era_data\n",
+   "era_series = df.get_era_data\n",
    "features = df.get_feature_data"
   ]
  },
@@ -107,59 +116,465 @@
 {
  "data": {
   "text/html": [
MetaPipeline(steps=[('featureunion',\n",
-       "                     FeatureUnion(transformer_list=[('groupstatspreprocessor',\n",
-       "                                                     GroupStatsPreProcessor(groups=['sunshine',\n",
-       "                                                                                    'rain'])),\n",
-       "                                                    ('columnselector',\n",
-       "                                                     ColumnSelector(columns=['feature_honoured_observational_balaamite',\n",
-       "                                                                             'feature_polaroid_vadose_quinze',\n",
-       "                                                                             'feature_untidy_withdrawn_bargeman',\n",
-       "                                                                             'feature_genuine_kyphotic_trehala',\n",
-       "                                                                             'feature_unenthr...\n",
-       "                                                                             'feature_ungenuine_sporophytic_evangelist',\n",
-       "                                                                             'feature_supercelestial_telic_dyfed',\n",
-       "                                                                             'feature_inconsiderate_unbooted_ricer', ...]))])),\n",
+       "
MetaPipeline(steps=[('columntransformer',\n",
+       "                     ColumnTransformer(transformers=[('gpp',\n",
+       "                                                      GroupStatsPreProcessor(groups=['sunshine',\n",
+       "                                                                                     'rain']),\n",
+       "                                                      ['feature_honoured_observational_balaamite',\n",
+       "                                                       'feature_polaroid_vadose_quinze',\n",
+       "                                                       'feature_untidy_withdrawn_bargeman',\n",
+       "                                                       'feature_genuine_kyphotic_trehala',\n",
+       "                                                       'feature_unenthralled_sportful_schoolhouse',\n",
+       "                                                       'feature_divulsive_explanat...\n",
+       "                                                       'feature_supercelestial_telic_dyfed',\n",
+       "                                                       'feature_inconsiderate_unbooted_ricer', ...])])),\n",
        "                    ('crossvalestimator',\n",
        "                     CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n",
        "                                       estimator=DecisionTreeRegressor())),\n",
        "                    ('numeraiensemble', NumeraiEnsemble(donate_weighted=True)),\n",
-       "                    ('featureneutralizer', FeatureNeutralizer())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n",
+       "                  estimator=DecisionTreeRegressor())
DecisionTreeRegressor()
DecisionTreeRegressor()
NumeraiEnsemble(donate_weighted=True)
FeatureNeutralizer(pred_name=['prediction'], proportion=[0.5])
" ], "text/plain": [ - "MetaPipeline(steps=[('featureunion',\n", - " FeatureUnion(transformer_list=[('groupstatspreprocessor',\n", - " GroupStatsPreProcessor(groups=['sunshine',\n", - " 'rain'])),\n", - " ('columnselector',\n", - " ColumnSelector(columns=['feature_honoured_observational_balaamite',\n", - " 'feature_polaroid_vadose_quinze',\n", - " 'feature_untidy_withdrawn_bargeman',\n", - " 'feature_genuine_kyphotic_trehala',\n", - " 'feature_unenthr...\n", - " 'feature_ungenuine_sporophytic_evangelist',\n", - " 'feature_supercelestial_telic_dyfed',\n", - " 'feature_inconsiderate_unbooted_ricer', ...]))])),\n", + "MetaPipeline(steps=[('columntransformer',\n", + " ColumnTransformer(transformers=[('gpp',\n", + " GroupStatsPreProcessor(groups=['sunshine',\n", + " 'rain']),\n", + " ['feature_honoured_observational_balaamite',\n", + " 'feature_polaroid_vadose_quinze',\n", + " 'feature_untidy_withdrawn_bargeman',\n", + " 'feature_genuine_kyphotic_trehala',\n", + " 'feature_unenthralled_sportful_schoolhouse',\n", + " 'feature_divulsive_explanat...\n", + " 'feature_supercelestial_telic_dyfed',\n", + " 'feature_inconsiderate_unbooted_ricer', ...])])),\n", " ('crossvalestimator',\n", " CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n", " estimator=DecisionTreeRegressor())),\n", " ('numeraiensemble', NumeraiEnsemble(donate_weighted=True)),\n", - " ('featureneutralizer', FeatureNeutralizer())])" + " ('featureneutralizer',\n", + " FeatureNeutralizer(pred_name=['prediction'],\n", + " proportion=[0.5]))])" ] }, "execution_count": 6, @@ -210,13 +626,19 @@ "gpp = GroupStatsPreProcessor(groups=['sunshine', 'rain'])\n", "fncv3_selector = ColumnSelector(fncv3_cols)\n", "\n", - "preproc_pipe = make_union(gpp, fncv3_selector)\n", + "preproc_pipe = ColumnTransformer([\n", + " ('gpp', gpp, X.columns.tolist()),\n", + " ('fncv3_selector', fncv3_selector, fncv3_cols)\n", + " ])\n", "\n", "# Model\n", "xgb = DecisionTreeRegressor()\n", "cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5))\n", "ens = NumeraiEnsemble(donate_weighted=True)\n", + "ens.set_transform_request(era_series=True)\n", + "ens.set_predict_request(era_series=True)\n", "fn = FeatureNeutralizer(proportion=0.5)\n", + "fn.set_predict_request(era_series=True, features=True)\n", "full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn)\n", "full_pipe" ] @@ -228,7 +650,7 @@ "outputs": [], "source": [ "# Train full model\n", - "full_pipe.fit(X, y, numeraiensemble__eras=eras, featureneutralizer__eras=eras, featureneutralizer__features=features);" + "full_pipe.fit(X, y, era_series=era_series);" ] }, { @@ -237,48 +659,30 @@ "metadata": {}, "outputs": [ { - "data": { - "text/plain": [ - "array([[0.05046453],\n", - " [0.75096525],\n", - " [0.05094715],\n", - " [0.75144788],\n", - " [0.25009049]])" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# Unneutralized predictions\n", - "full_pipe[:-1].predict(X, eras=eras)[:5]" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 25575.02it/s]\n" + ] + }, { "data": { "text/plain": [ - "array([[0.29074905],\n", - " [0.64394998],\n", - " [0.28693911],\n", - " [0.55793662],\n", - " [0.47588494]])" + "array([[0.30403528],\n", + " [0.64598246],\n", + " [0.29753909],\n", + " [0.57058209],\n", + " [0.48907478]])" ] }, - 
"execution_count": 9, + "execution_count": 8, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# End to end predictions\n", - "preds = full_pipe.predict(X=X, features=features, eras=eras)\n", + "preds = full_pipe.predict(X=X, features=features, era_series=era_series)\n", "preds[:5]" ] }, @@ -302,7 +706,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 9, "metadata": {}, "outputs": [], "source": [ @@ -310,73 +714,479 @@ "crossval1 = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba')\n", "pred_rud = PredictionReducer(n_models=3, n_classes=5)\n", "ens2 = NumeraiEnsemble(donate_weighted=True)\n", + "ens2.set_transform_request(era_series=True)\n", "neut2 = FeatureNeutralizer(proportion=0.5)\n", + "neut2.set_predict_request(era_series=True, features=True)\n", "full_pipe = make_meta_pipeline(preproc_pipe, crossval1, pred_rud, ens2, neut2)" ] }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
MetaPipeline(steps=[('featureunion',\n",
-       "                     FeatureUnion(transformer_list=[('groupstatspreprocessor',\n",
-       "                                                     GroupStatsPreProcessor(groups=['sunshine',\n",
-       "                                                                                    'rain'])),\n",
-       "                                                    ('columnselector',\n",
-       "                                                     ColumnSelector(columns=['feature_honoured_observational_balaamite',\n",
-       "                                                                             'feature_polaroid_vadose_quinze',\n",
-       "                                                                             'feature_untidy_withdrawn_bargeman',\n",
-       "                                                                             'feature_genuine_kyphotic_trehala',\n",
-       "                                                                             'feature_unenthr...\n",
-       "                                                                             'feature_inconsiderate_unbooted_ricer', ...]))])),\n",
-       "                    ('crossvalestimator',\n",
+       "
MetaPipeline(steps=[('columntransformer',\n",
+       "                     ColumnTransformer(transformers=[('gpp',\n",
+       "                                                      GroupStatsPreProcessor(groups=['sunshine',\n",
+       "                                                                                     'rain']),\n",
+       "                                                      ['feature_honoured_observational_balaamite',\n",
+       "                                                       'feature_polaroid_vadose_quinze',\n",
+       "                                                       'feature_untidy_withdrawn_bargeman',\n",
+       "                                                       'feature_genuine_kyphotic_trehala',\n",
+       "                                                       'feature_unenthralled_sportful_schoolhouse',\n",
+       "                                                       'feature_divulsive_explanat...\n",
        "                     CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),\n",
        "                                       estimator=DecisionTreeClassifier(),\n",
        "                                       predict_func='predict_proba')),\n",
        "                    ('predictionreducer',\n",
        "                     PredictionReducer(n_classes=5, n_models=3)),\n",
        "                    ('numeraiensemble', NumeraiEnsemble(donate_weighted=True)),\n",
-       "                    ('featureneutralizer', FeatureNeutralizer())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
DecisionTreeClassifier()
PredictionReducer(n_classes=5, n_models=3)
NumeraiEnsemble(donate_weighted=True)
FeatureNeutralizer(pred_name=['prediction'], proportion=[0.5])
" ], "text/plain": [ - "MetaPipeline(steps=[('featureunion',\n", - " FeatureUnion(transformer_list=[('groupstatspreprocessor',\n", - " GroupStatsPreProcessor(groups=['sunshine',\n", - " 'rain'])),\n", - " ('columnselector',\n", - " ColumnSelector(columns=['feature_honoured_observational_balaamite',\n", - " 'feature_polaroid_vadose_quinze',\n", - " 'feature_untidy_withdrawn_bargeman',\n", - " 'feature_genuine_kyphotic_trehala',\n", - " 'feature_unenthr...\n", - " 'feature_inconsiderate_unbooted_ricer', ...]))])),\n", - " ('crossvalestimator',\n", + "MetaPipeline(steps=[('columntransformer',\n", + " ColumnTransformer(transformers=[('gpp',\n", + " GroupStatsPreProcessor(groups=['sunshine',\n", + " 'rain']),\n", + " ['feature_honoured_observational_balaamite',\n", + " 'feature_polaroid_vadose_quinze',\n", + " 'feature_untidy_withdrawn_bargeman',\n", + " 'feature_genuine_kyphotic_trehala',\n", + " 'feature_unenthralled_sportful_schoolhouse',\n", + " 'feature_divulsive_explanat...\n", " CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),\n", " estimator=DecisionTreeClassifier(),\n", " predict_func='predict_proba')),\n", " ('predictionreducer',\n", " PredictionReducer(n_classes=5, n_models=3)),\n", " ('numeraiensemble', NumeraiEnsemble(donate_weighted=True)),\n", - " ('featureneutralizer', FeatureNeutralizer())])" + " ('featureneutralizer',\n", + " FeatureNeutralizer(pred_name=['prediction'],\n", + " proportion=[0.5]))])" ] }, - "execution_count": 11, + "execution_count": 10, "metadata": {}, "output_type": "execute_result" } @@ -430,67 +1240,471 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
MetaPipeline(steps=[('featureunion',\n",
-       "                     FeatureUnion(transformer_list=[('groupstatspreprocessor',\n",
-       "                                                     GroupStatsPreProcessor(groups=['sunshine',\n",
-       "                                                                                    'rain'])),\n",
-       "                                                    ('columnselector',\n",
-       "                                                     ColumnSelector(columns=['feature_honoured_observational_balaamite',\n",
-       "                                                                             'feature_polaroid_vadose_quinze',\n",
-       "                                                                             'feature_untidy_withdrawn_bargeman',\n",
-       "                                                                             'feature_genuine_kyphotic_trehala',\n",
-       "                                                                             'feature_unenthr...\n",
-       "                                                                             'feature_inconsiderate_unbooted_ricer', ...]))])),\n",
-       "                    ('crossvalestimator',\n",
+       "
MetaPipeline(steps=[('columntransformer',\n",
+       "                     ColumnTransformer(transformers=[('gpp',\n",
+       "                                                      GroupStatsPreProcessor(groups=['sunshine',\n",
+       "                                                                                     'rain']),\n",
+       "                                                      ['feature_honoured_observational_balaamite',\n",
+       "                                                       'feature_polaroid_vadose_quinze',\n",
+       "                                                       'feature_untidy_withdrawn_bargeman',\n",
+       "                                                       'feature_genuine_kyphotic_trehala',\n",
+       "                                                       'feature_unenthralled_sportful_schoolhouse',\n",
+       "                                                       'feature_divulsive_explanat...\n",
        "                     CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),\n",
        "                                       estimator=DecisionTreeClassifier(),\n",
        "                                       predict_func='predict_proba')),\n",
        "                    ('predictionreducer',\n",
        "                     PredictionReducer(n_classes=5, n_models=3)),\n",
        "                    ('numeraiensemble', NumeraiEnsemble(donate_weighted=True)),\n",
-       "                    ('featureneutralizer', FeatureNeutralizer())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
DecisionTreeClassifier()
PredictionReducer(n_classes=5, n_models=3)
NumeraiEnsemble(donate_weighted=True)
FeatureNeutralizer(pred_name=['prediction'], proportion=[0.5])
" ], "text/plain": [ - "MetaPipeline(steps=[('featureunion',\n", - " FeatureUnion(transformer_list=[('groupstatspreprocessor',\n", - " GroupStatsPreProcessor(groups=['sunshine',\n", - " 'rain'])),\n", - " ('columnselector',\n", - " ColumnSelector(columns=['feature_honoured_observational_balaamite',\n", - " 'feature_polaroid_vadose_quinze',\n", - " 'feature_untidy_withdrawn_bargeman',\n", - " 'feature_genuine_kyphotic_trehala',\n", - " 'feature_unenthr...\n", - " 'feature_inconsiderate_unbooted_ricer', ...]))])),\n", - " ('crossvalestimator',\n", + "MetaPipeline(steps=[('columntransformer',\n", + " ColumnTransformer(transformers=[('gpp',\n", + " GroupStatsPreProcessor(groups=['sunshine',\n", + " 'rain']),\n", + " ['feature_honoured_observational_balaamite',\n", + " 'feature_polaroid_vadose_quinze',\n", + " 'feature_untidy_withdrawn_bargeman',\n", + " 'feature_genuine_kyphotic_trehala',\n", + " 'feature_unenthralled_sportful_schoolhouse',\n", + " 'feature_divulsive_explanat...\n", " CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=3, test_size=None),\n", " estimator=DecisionTreeClassifier(),\n", " predict_func='predict_proba')),\n", " ('predictionreducer',\n", " PredictionReducer(n_classes=5, n_models=3)),\n", " ('numeraiensemble', NumeraiEnsemble(donate_weighted=True)),\n", - " ('featureneutralizer', FeatureNeutralizer())])" + " ('featureneutralizer',\n", + " FeatureNeutralizer(pred_name=['prediction'],\n", + " proportion=[0.5]))])" ] }, - "execution_count": 12, + "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "y_int = (y * 4).astype(int)\n", - "full_pipe.fit(X, y_int, numeraiensemble__eras=eras)" + "full_pipe.fit(X, y_int, era_series=era_series)" ] }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 12, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 9892.23it/s]\n" + ] + }, { "data": { "text/plain": [ - "array([[0.29116765],\n", - " [0.64820649],\n", - " [0.28812946],\n", - " [0.5631892 ],\n", - " [0.48462551]])" + "array([[0.29059154],\n", + " [0.64786053],\n", + " [0.28163789],\n", + " [0.56881549],\n", + " [0.48695533]])" ] }, - "execution_count": 13, + "execution_count": 12, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds = full_pipe.predict(X, eras=eras, features=features)\n", + "preds = full_pipe.predict(X, era_series=era_series, features=features)\n", "preds[:5]" ] }, @@ -585,7 +1806,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -598,19 +1819,426 @@ "\n", "models = make_column_transformer(*[(pipe, features.columns.tolist()) for pipe in pipes])\n", "ens_end = NumeraiEnsemble()\n", + "ens_end.set_transform_request(era_series=True)\n", + "ens_end.set_predict_request(era_series=True)\n", "neut = FeatureNeutralizer(proportion=0.5)\n", + "neut.set_predict_request(era_series=True, features=True)\n", "full_pipe = make_meta_pipeline(models, ens_end, neut)" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 14, "metadata": {}, "outputs": [ { "data": { "text/html": [ - "
MetaPipeline(steps=[('columntransformer',\n",
+       "
MetaPipeline(steps=[('columntransformer',\n",
        "                     ColumnTransformer(transformers=[('pipeline-1',\n",
        "                                                      Pipeline(steps=[('crossvalestimator',\n",
        "                                                                       CrossValEstimator(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),\n",
@@ -621,7 +2249,6 @@
        "                                                                                                                colsample_bynode=None,\n",
        "                                                                                                                colsample_bytree=None,\n",
        "                                                                                                                de...\n",
-       "                                                       'feature_midmost_perspiratory_hubert',\n",
        "                                                       'feature_laminable_unspecified_gynoecium',\n",
        "                                                       'feature_bally_bathymetrical_isadora',\n",
        "                                                       'feature_skim_unmeant_bandsman',\n",
@@ -629,7 +2256,9 @@
        "                                                       'feature_supercelestial_telic_dyfed',\n",
        "                                                       'feature_inconsiderate_unbooted_ricer', ...])])),\n",
        "                    ('numeraiensemble', NumeraiEnsemble()),\n",
-       "                    ('featureneutralizer', FeatureNeutralizer())])
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
NumeraiEnsemble()
FeatureNeutralizer(pred_name=['prediction'], proportion=[0.5])
" ], "text/plain": [ "MetaPipeline(steps=[('columntransformer',\n", @@ -799,7 +2429,6 @@ " colsample_bynode=None,\n", " colsample_bytree=None,\n", " de...\n", - " 'feature_midmost_perspiratory_hubert',\n", " 'feature_laminable_unspecified_gynoecium',\n", " 'feature_bally_bathymetrical_isadora',\n", " 'feature_skim_unmeant_bandsman',\n", @@ -807,10 +2436,12 @@ " 'feature_supercelestial_telic_dyfed',\n", " 'feature_inconsiderate_unbooted_ricer', ...])])),\n", " ('numeraiensemble', NumeraiEnsemble()),\n", - " ('featureneutralizer', FeatureNeutralizer())])" + " ('featureneutralizer',\n", + " FeatureNeutralizer(pred_name=['prediction'],\n", + " proportion=[0.5]))])" ] }, - "execution_count": 15, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -821,35 +2452,42 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "full_pipe.fit(X, y, columntransformer__eras=eras, numeraiensemble__eras=eras);" + "full_pipe.fit(X, y, era_series=era_series);" ] }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 16, "metadata": {}, "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "Processing feature neutralizations: 100%|██████████| 1/1 [00:00<00:00, 8701.88it/s]\n" + ] + }, { "data": { "text/plain": [ - "array([[0.38389541],\n", - " [0.66075521],\n", - " [0.40429224],\n", - " [0.61309102],\n", - " [0.64847805]])" + "array([[0.38385137],\n", + " [0.65767811],\n", + " [0.39945052],\n", + " [0.61573322],\n", + " [0.64903178]])" ] }, - "execution_count": 17, + "execution_count": 16, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "preds = full_pipe.predict(X, eras=eras, features=features)\n", + "preds = full_pipe.predict(X, era_series=era_series, features=features)\n", "preds[:5]" ] } @@ -870,7 +2508,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.9.12" + "version": "3.11.5" }, "orig_nbformat": 4 }, diff --git a/examples/numerai_pipeline.ipynb b/examples/numerai_pipeline.ipynb index b732922..4fc5200 100644 --- a/examples/numerai_pipeline.ipynb +++ b/examples/numerai_pipeline.ipynb @@ -1871,7 +1871,7 @@ "outputs": [], "source": [ "X = sample[features]\n", - "eras = sample[\"era\"]" + "era_series = sample[\"era\"]" ] }, { @@ -1903,7 +1903,7 @@ } ], "source": [ - "eras" + "era_series" ] }, { @@ -3969,7 +3969,7 @@ ], "source": [ "preds = test.predict(X, features=X,\n", - " eras=eras)" + " era_series=era_series)" ] }, { diff --git a/numerblox/ensemble.py b/numerblox/ensemble.py index ef25b42..32b7bba 100644 --- a/numerblox/ensemble.py +++ b/numerblox/ensemble.py @@ -2,9 +2,8 @@ import warnings import numpy as np import pandas as pd - from typing import Union, List -from sklearn.utils.validation import check_is_fitted +import sklearn from sklearn.base import BaseEstimator, TransformerMixin @@ -22,25 +21,26 @@ class NumeraiEnsemble(BaseEstimator, TransformerMixin): Example donate weighting for 5 folds: [0.0625, 0.0625, 0.125, 0.25, 0.5] """ def __init__(self, weights=None, donate_weighted=False): + sklearn.set_config(enable_metadata_routing=True) super().__init__() self.weights = weights if self.weights and sum(self.weights) != 1: warnings.warn(f"Warning: Weights do not sum to 1. 
Got {sum(self.weights)}.") self.donate_weighted = donate_weighted - def fit(self, X=None, y=None, **kwargs): + def fit(self, X: Union[np.array, pd.DataFrame], y=None): self.is_fitted_ = True return self - def transform(self, X: Union[np.array, pd.DataFrame], eras: pd.Series) -> np.array: + def transform(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series) -> np.array: """ Standardize by era and ensemble. :param X: Input data where each column contains predictions from an estimator. - :param eras: Era labels (strings) for each row in X. + :param era_series: Era labels (strings) for each row in X. :return: Ensembled predictions. """ - check_is_fitted(self) - assert len(X) == len(eras), f"input X and eras must have the same length. Got {len(X)} != {len(eras)}." + assert not era_series is None, "Era series must be provided for NumeraiEnsemble." + assert len(X) == len(era_series), f"input X and era_series must have the same length. Got {len(X)} != {len(era_series)}." if len(X.shape) == 1: raise ValueError("NumeraiEnsemble requires at least 2 prediction columns. Got 1.") @@ -67,7 +67,7 @@ def transform(self, X: Union[np.array, pd.DataFrame], eras: pd.Series) -> np.arr if np.all(pred == pred[0]): warnings.warn(f"Warning: Predictions in column '{i}' are all constant. Consider checking your estimators. Skipping these estimator predictions in ensembling.") else: - standardized_pred = self._standardize_by_era(pred, eras) + standardized_pred = self._standardize_by_era(pred, era_series) standardized_pred_list.append(standardized_pred) standardized_pred_arr = np.asarray(standardized_pred_list).T @@ -78,11 +78,15 @@ def transform(self, X: Union[np.array, pd.DataFrame], eras: pd.Series) -> np.arr ensembled_predictions = np.average(standardized_pred_arr, axis=1, weights=weights) return ensembled_predictions.reshape(-1, 1) - def predict(self, X: Union[np.array, pd.DataFrame], eras: pd.Series) -> np.array: + def fit_transform(self, X: Union[np.array, pd.DataFrame], y=None, era_series: pd.Series = None) -> np.array: + self.fit(X, y) + return self.transform(X, era_series) + + def predict(self, X: Union[np.array, pd.DataFrame], era_series: pd.Series) -> np.array: """ For if a NumeraiEnsemble happens to be the last step in the pipeline. Has same behavior as transform. """ - return self.transform(X, eras) + return self.transform(X, era_series=era_series) def _standardize(self, X: np.array) -> np.array: """ @@ -93,16 +97,16 @@ def _standardize(self, X: np.array) -> np.array: percentile_X = (scipy.stats.rankdata(X, method="ordinal") - 0.5) / len(X) return percentile_X - def _standardize_by_era(self, X: np.array, eras: Union[np.array, pd.Series, pd.DataFrame]) -> np.array: + def _standardize_by_era(self, X: np.array, era_series: Union[np.array, pd.Series, pd.DataFrame]) -> np.array: """ Standardize predictions of a single estimator by era. :param X: All predictions of a single estimator. - :param eras: Era labels (strings) for each row in X. + :param era_series: Era labels (strings) for each row in X. :return: Standardized predictions. 
""" - if isinstance(eras, (pd.Series, pd.DataFrame)): - eras = eras.to_numpy().flatten() - df = pd.DataFrame({'prediction': X, 'era': eras}) + if isinstance(era_series, (pd.Series, pd.DataFrame)): + era_series = era_series.to_numpy().flatten() + df = pd.DataFrame({'prediction': X, 'era': era_series}) df['standardized_prediction'] = df.groupby('era')['prediction'].transform(self._standardize) return df['standardized_prediction'].values.flatten() @@ -144,8 +148,7 @@ def __init__(self, n_models: int, n_classes: int): self.n_classes = n_classes self.dot_array = [i for i in range(self.n_classes)] - def fit(self, X, y=None): - self.is_fitted_ = True + def fit(self, X: np.array, y=None): return self def transform(self, X: np.array): @@ -153,7 +156,6 @@ def transform(self, X: np.array): :param X: Input predictions. :return: Reduced predictions of shape (X.shape[0], self.n_models). """ - check_is_fitted(self) reduced = [] expected_n_cols = self.n_models * self.n_classes if len(X.shape) != 2: diff --git a/numerblox/evaluation.py b/numerblox/evaluation.py index 2045a24..b9da564 100644 --- a/numerblox/evaluation.py +++ b/numerblox/evaluation.py @@ -1,4 +1,5 @@ import time +import sklearn import numpy as np import pandas as pd from scipy import stats @@ -73,6 +74,7 @@ def __init__( if self.custom_functions is not None: self.check_custom_functions() self.show_detailed_progress_bar = show_detailed_progress_bar + sklearn.set_config(enable_metadata_routing=True) def full_evaluation( self, @@ -513,8 +515,9 @@ def feature_neutral_mean_std_sharpe( More info: https://docs.numer.ai/tournament/feature-neutral-correlation """ fn = FeatureNeutralizer(pred_name=pred_col, proportion=1.0) + fn.set_predict_request(features=True, era_series=True) neutralized_preds = fn.predict( - dataf[pred_col], features=dataf[feature_names], eras=dataf[self.era_col] + dataf[pred_col], features=dataf[feature_names], era_series=dataf[self.era_col] ) # Construct new DataFrame with era col, target col and preds neutralized_dataf = pd.DataFrame(columns=[self.era_col, target_col, pred_col]) diff --git a/numerblox/meta.py b/numerblox/meta.py index 9dac04d..f17712f 100644 --- a/numerblox/meta.py +++ b/numerblox/meta.py @@ -1,23 +1,19 @@ -import inspect import numpy as np import pandas as pd from tqdm import tqdm from typing import Union, List +import sklearn from sklearn import clone from sklearn.compose import ColumnTransformer -from sklearn.pipeline import Pipeline, FeatureUnion, _name_estimators, _final_estimator_has +from sklearn.pipeline import Pipeline, FeatureUnion, _name_estimators from sklearn.utils.validation import check_is_fitted from sklearn.base import BaseEstimator, TransformerMixin, MetaEstimatorMixin from sklearn.model_selection import BaseCrossValidator from sklearn.utils.validation import ( check_is_fitted, check_X_y, - check_memory, FLOAT_DTYPES, ) -from sklearn.utils.metaestimators import available_if - -from sklearn.utils import _print_elapsed_time class MetaEstimator(BaseEstimator, TransformerMixin, MetaEstimatorMixin): @@ -32,6 +28,7 @@ class MetaEstimator(BaseEstimator, TransformerMixin, MetaEstimatorMixin): """ def __init__(self, estimator, predict_func="predict", model_type="regressor"): + sklearn.set_config(enable_metadata_routing=True) self.estimator = estimator if predict_func not in ["predict", "predict_proba", "predict_log_proba", "transform"]: raise ValueError("predict_func must be 'predict', 'predict_proba', 'predict_log_proba' or 'transform'.") @@ -91,6 +88,7 @@ class CrossValEstimator(BaseEstimator, 
TransformerMixin): :param verbose: Whether to print progress. """ def __init__(self, estimator: BaseEstimator, cv: BaseCrossValidator, evaluation_func=None, predict_func="predict", verbose=False): + sklearn.set_config(enable_metadata_routing=True) super().__init__() self.cv = cv if not hasattr(self.cv, "split") or isinstance(self.cv, str): @@ -215,6 +213,7 @@ class MetaPipeline(Pipeline): :param predict_func: Name of the function that will be used for prediction. """ def __init__(self, steps, memory=None, verbose=False, predict_func="predict"): + sklearn.set_config(enable_metadata_routing=True) self.predict_func = predict_func self.modified_steps = self.wrap_estimators_as_transformers(steps) self.steps = self.modified_steps @@ -237,7 +236,6 @@ def wrap_estimators_as_transformers(self, steps): else: name, step = step_tuple transformed_steps.append(self._wrap_step(name, step, is_last_step=is_last_step)) - return transformed_steps def _wrap_step(self, name, step, columns=None, is_last_step=False): @@ -262,78 +260,6 @@ def _wrap_step(self, name, step, columns=None, is_last_step=False): return (name, MetaEstimator(step, predict_func=self.predict_func)) return (name, step, columns) if columns else (name, step) - - def _fit(self, X, y=None, **fit_params_steps): - # shallow copy of steps - this should really be steps_ - self.steps = list(self.steps) - self._validate_steps() - # Setup the memory - memory = check_memory(self.memory) - - for step_idx, name, transformer in self._iter( - with_final=False, filter_passthrough=False - ): - if transformer is None or transformer == "passthrough": - with _print_elapsed_time("Pipeline", self._log_message(step_idx)): - continue - - if hasattr(memory, "location") and memory.location is None: - # we do not clone when caching is disabled to - # preserve backward compatibility - cloned_transformer = transformer - else: - cloned_transformer = clone(transformer) - # Fit or load from cache the current transformer - X, fitted_transformer = _fit_transform_one( - cloned_transformer, - X, - y, - None, - message_clsname="Pipeline", - message=self._log_message(step_idx), - **fit_params_steps[name], - ) - # Replace the transformer of the step with the fitted - # transformer. This is necessary when loading the transformer - # from the cache. - self.steps[step_idx] = (name, fitted_transformer) - return X - - @available_if(_final_estimator_has("predict")) - def predict(self, X, **predict_params): - """ Overrides predict method in Pipeline so it also parses arguments to transforms.""" - Xt = X - for _, name, transform in self._iter(with_final=False): - # Check if predict params is needed in transform. If so parse along - sig = inspect.signature(transform.transform) - params_needed = {k: v for k, v in predict_params.items() if k in sig.parameters} - # Use the needed parameters when calling transform - Xt = transform.transform(Xt, **params_needed) - return self.steps[-1][1].predict(Xt, **predict_params) - - -def _fit_transform_one( - transformer, X, y, weight, message_clsname="", message=None, **fit_params - ): - """ - Fits ``transformer`` to ``X`` and ``y``. The transformed result is returned - with the fitted transformer. If ``weight`` is not ``None``, the result will - be multiplied by ``weight``. 
- """ - with _print_elapsed_time(message_clsname, message): - sig = inspect.signature(transformer.transform) - fit_params_needed = {k: v for k, v in fit_params.items() if k in sig.parameters} - # Use the needed parameters when calling fit - transformer.fit(X, y, **fit_params_needed) - - sig = inspect.signature(transformer.transform) - transform_params_needed = {k: v for k, v in fit_params.items() if k in sig.parameters} - # Use the needed parameters when calling transform - res = transformer.transform(X, **transform_params_needed) - - if weight is None: - return res, transformer - return res * weight, transformer def make_meta_pipeline(*steps, memory=None, verbose=False) -> MetaPipeline: diff --git a/numerblox/models.py b/numerblox/models.py index f94aeee..04fb4d3 100644 --- a/numerblox/models.py +++ b/numerblox/models.py @@ -34,13 +34,13 @@ def __init__(self, proportion=0.5, trees_per_step=10, num_iters=200, **xgb_param assert num_iters >= 2, "num_iters must be at least 2." self.num_iters = num_iters - def fit(self, X, y, eras: pd.Series, **fit_params): + def fit(self, X, y, era_series: pd.Series, **fit_params): super().fit(X, y, **fit_params) evaluator = NumeraiClassicEvaluator(era_col="era") self.feature_names = self.get_booster().feature_names iter_df = pd.DataFrame(X, columns=self.feature_names) iter_df["target"] = y - iter_df["era"] = eras + iter_df["era"] = era_series for _ in range(self.num_iters - 1): preds = self.predict(X) diff --git a/numerblox/neutralizers.py b/numerblox/neutralizers.py index 8a00e6c..db28972 100644 --- a/numerblox/neutralizers.py +++ b/numerblox/neutralizers.py @@ -5,6 +5,7 @@ import scipy.stats as sp from abc import abstractmethod from joblib import Parallel, delayed +import sklearn from sklearn.preprocessing import MinMaxScaler from sklearn.base import BaseEstimator, TransformerMixin @@ -16,34 +17,29 @@ class BaseNeutralizer(BaseEstimator, TransformerMixin): """ def __init__(self, new_col_names: list): self.new_col_names = new_col_names + sklearn.set_config(enable_metadata_routing=True) super().__init__() - def fit(self, X=None, y=None, **kwargs): + def fit(self, X=None, y=None): return self @abstractmethod def transform( self, X: Union[np.array, pd.DataFrame], - features: pd.DataFrame, eras: pd.Series, **kwargs + features: pd.DataFrame, era_series: pd.Series ) -> np.array: ... - def predict(self, X: np.array, features: pd.DataFrame, eras: Union[np.array, pd.Series]) -> np.array: + def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: """ Convenience function for scikit-learn compatibility. """ - return self.transform(X=X, features=features, eras=eras) + return self.transform(X=X, features=features, era_series=era_series) - def fit_transform(self, X: np.array, features: pd.DataFrame, eras: Union[np.array, pd.Series]) -> np.array: + def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: """ Convenience function for scikit-learn compatibility. Needed because fit and transform except different arguments here. 
""" - return self.fit().transform(X=X, features=features, eras=eras) - - def __call__( - self, X: Union[np.array, pd.DataFrame], - features: pd.DataFrame, eras: pd.Series, **kwargs - ) -> np.array: - return self.predict(X=X, features=features, eras=eras, **kwargs) + return self.fit().transform(X=X, features=features, era_series=era_series) def get_feature_names_out(self, input_features: list = None) -> list: """ @@ -92,17 +88,19 @@ def __init__( self.num_cores = num_cores def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], - features: pd.DataFrame, eras: Union[np.array, pd.Series]) -> np.array: + features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: """ Main transform function. :param X: Input predictions to neutralize. \n :param features: DataFrame with features for neutralization. \n - :param eras: Series with era labels for each row in features. \n - Features, eras and the prediction column must all have the same length. + :param era_series: Series with era labels for each row in features. \n + Features, era_series and the prediction column must all have the same length. :return: Neutralized predictions NumPy array. """ + if features is None or era_series is None: + raise ValueError("Features and era_series must be provided.") assert len(X) == len(features), "Input predictions must have same length as features." - assert len(X) == len(eras), "Input predictions must have same length as eras." + assert len(X) == len(era_series), "Input predictions must have same length as eras." df = features.copy() if not isinstance(X, np.ndarray): X = np.array(X) @@ -114,7 +112,7 @@ def transform(self, X: Union[np.array, pd.Series, pd.DataFrame], assert len(self.pred_name) == X.shape[1], "Number of prediction columns given in X does not match 'pred_name'." for i, pred_name in enumerate(self.pred_name): df[pred_name] = X[:, i] - df["era"] = eras + df["era"] = era_series feature_cols = list(features.columns) tasks = [ @@ -192,5 +190,5 @@ def _get_raw_exposures(exposures: np.array, scores: pd.DataFrame) -> np.array: :param scores: DataFrame with predictions. :return: Raw exposures for each era. """ - return exposures.dot(np.linalg.pinv(exposures).dot(scores)) + return exposures.dot(np.linalg.pinv(exposures).dot(scores)) \ No newline at end of file diff --git a/numerblox/penalizers.py b/numerblox/penalizers.py index 5b40c52..fcaac7c 100644 --- a/numerblox/penalizers.py +++ b/numerblox/penalizers.py @@ -22,32 +22,26 @@ def __init__(self, new_col_name: str): self.new_col_name = new_col_name super().__init__() - def fit(self, X=None, y=None, **kwargs): + def fit(self, X=None, y=None): return self @abstractmethod def transform( self, X: Union[np.array, pd.DataFrame], - features: pd.DataFrame, eras: pd.Series, **kwargs + features: pd.DataFrame, eras: pd.Series ) -> np.array: ... - def predict(self, X: np.array, features: pd.DataFrame, eras: Union[np.array, pd.Series]) -> np.array: + def predict(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: """ Convenience function for scikit-learn compatibility. """ - return self.transform(X=X, features=features, eras=eras) + return self.transform(X=X, features=features, era_series=era_series) - def fit_transform(self, X: np.array, features: pd.DataFrame, eras: Union[np.array, pd.Series]) -> np.array: + def fit_transform(self, X: np.array, features: pd.DataFrame, era_series: Union[np.array, pd.Series]) -> np.array: """ Convenience function for scikit-learn compatibility. 
        Needed because fit and transform expect different arguments here.
        """
-        return self.fit().transform(X=X, features=features, eras=eras)
-
-    def __call__(
-        self, X: Union[np.array, pd.DataFrame],
-        features: pd.DataFrame, eras: pd.Series, **kwargs
-    ) -> np.array:
-        return self.predict(X=X, features=features, eras=eras, **kwargs)
+        return self.fit().transform(X=X, features=features, era_series=era_series)

    def get_feature_names_out(self, input_features: list = None) -> list:
        """
@@ -89,20 +83,20 @@ def __init__(
        super().__init__(new_col_name=new_col_name)
        self.suffix = suffix

-    def transform(self, X: pd.DataFrame, features: pd.DataFrame, eras: pd.Series) -> np.array:
+    def transform(self, X: pd.DataFrame, features: pd.DataFrame, era_series: pd.Series) -> np.array:
        """
        Main transform method.
        :param X: Input predictions to neutralize.
        :param features: DataFrame with features for neutralization.
-        :param eras: Series with era labels for each row in features.
-        Features, eras and the prediction column must all have the same length.
+        :param era_series: Series with era labels for each row in features.
+        Features, era_series and the prediction column must all have the same length.
        :return: Penalized predictions.
        """
        assert len(X) == len(features), "Input predictions must have same length as features."
-        assert len(X) == len(eras), "Input predictions must have same length as eras."
+        assert len(X) == len(era_series), "Input predictions must have same length as era_series."
        df = features.copy()
        df["prediction"] = X
-        df["era"] = eras
+        df["era"] = era_series
        penalized_data = self._reduce_all_exposures(
            dataf=df, column=self.pred_name, neutralizers=list(features.columns)
        )
@@ -173,7 +167,6 @@ def _train_loop(self, model, optimizer, feats, pred, target_exps):
            if loss < 1e-7:
                break

-    @tf.function(experimental_relax_shapes=True)
    def __train_loop_body(self, model, feats, pred, target_exps):
        with tf.GradientTape() as tape:
            exps = self.__exposures(feats, pred[:, None] - model(feats, training=True))
@@ -184,7 +177,6 @@
        return loss, tape.gradient(loss, model.trainable_variables)

    @staticmethod
-    @tf.function(experimental_relax_shapes=True, experimental_compile=True)
    def __exposures(x, y):
        x = x - tf.math.reduce_mean(x, axis=0)
        x = x / tf.norm(x, axis=0)
diff --git a/numerblox/preprocessing/base.py b/numerblox/preprocessing/base.py
index c157d75..6eebe24 100644
--- a/numerblox/preprocessing/base.py
+++ b/numerblox/preprocessing/base.py
@@ -3,6 +3,7 @@ from typing import Union, List
from abc import abstractmethod

+import sklearn
from sklearn.base import BaseEstimator, TransformerMixin
@@ -10,9 +11,10 @@ class BasePreProcessor(BaseEstimator, TransformerMixin):
    """Common functionality for preprocessors and postprocessors."""

    def __init__(self):
-        ...
+        sklearn.set_config(enable_metadata_routing=True)

-    def fit(self, X, y=None, **kwargs):
+    def fit(self, X, y=None):
+        self.is_fitted_ = True
        return self

    @abstractmethod
    def transform(
        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
    ) -> pd.DataFrame:
        ...
-
-    def __call__(
-        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
-    ) -> pd.DataFrame:
-        return self.transform(X=X, y=y, **kwargs)

    @abstractmethod
    def get_feature_names_out(self, input_features=None) -> List[str]:
diff --git a/numerblox/preprocessing/signals.py b/numerblox/preprocessing/signals.py
index fd2eb12..a387422 100644
--- a/numerblox/preprocessing/signals.py
+++ b/numerblox/preprocessing/signals.py
@@ -278,13 +278,10 @@ def _quantile_transform(self, group_data: pd.Series) -> pd.Series:
        """
        transformed_data = self.quantiler.fit_transform(group_data.to_frame()).ravel()
        return pd.Series(transformed_data, index=group_data.index)
-
-    def fit(self, X: Union[np.array, pd.DataFrame], y=None, eras: pd.Series = None):
-        return self

    def transform(
        self, X: Union[np.array, pd.DataFrame],
-        eras: pd.Series,
+        era_series: pd.Series,
    ) -> np.array:
        """
        Quantile all features by era.
        :param X: Array or DataFrame containing features.
        :return: Quantiled features.
        """
        X = pd.DataFrame(X)
-        assert X.shape[0] == eras.shape[0], "Input X and eras must have the same number of rows for quantiling."
+        assert X.shape[0] == era_series.shape[0], "Input X and era_series must have the same number of rows for quantiling."
        self.features = [col for col in X.columns]
-        X.loc[:, "era"] = eras
+        X.loc[:, "era"] = era_series
        date_groups = X.groupby('era', group_keys=False)

        def process_feature(feature):
@@ -380,13 +377,10 @@ def __init__(self, windows: list = None,):
        super().__init__()
        self.windows = windows if windows else [5, 10, 15, 20]

-    def fit(self, X: Union[np.array, pd.DataFrame], y=None, tickers: pd.Series = None):
-        return self
-
-    def transform(self, X: Union[np.array, pd.DataFrame], tickers: pd.Series) -> np.array:
+    def transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series) -> np.array:
        X = pd.DataFrame(X)
        feature_cols = X.columns.tolist()
-        X["ticker"] = tickers
+        X["ticker"] = ticker_series
        ticker_groups = X.groupby("ticker")
        output_features = []
        for feature in tqdm(feature_cols, desc="Lag feature generation"):
@@ -398,9 +392,9 @@
            self.output_features = output_features
        return X[output_features].to_numpy()

-    def fit_transform(self, X: Union[np.array, pd.DataFrame], tickers: pd.Series):
-        self.fit(X=X, tickers=tickers)
-        return self.transform(X=X, tickers=tickers)
+    def fit_transform(self, X: Union[np.array, pd.DataFrame], ticker_series: pd.Series):
+        self.fit(X=X)
+        return self.transform(X=X, ticker_series=ticker_series)

    def get_feature_names_out(self, input_features=None) -> List[str]:
        """Return feature names."""
@@ -561,6 +555,7 @@ def __init__(self, open_col="open", high_col="high", low_col="low",
    def fit(self, X: pd.DataFrame, y=None):
        self.ratio_ = X[self.close_col] / X[self.adj_close_col]
+        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame) -> np.array:
@@ -601,6 +596,7 @@ def __init__(self, min_samples_date: int = 200, min_samples_ticker: int = 1200,
    def fit(self, X: pd.DataFrame, y=None):
        self.feature_names_out_ = X.columns.tolist()
+        self.is_fitted_ = True
        return self

    def transform(self, X: pd.DataFrame) -> np.array:
diff --git a/numerblox/targets.py b/numerblox/targets.py
index 821c34c..fffdb1c 100644
--- a/numerblox/targets.py
+++ b/numerblox/targets.py
@@ -1,7 +1,6 @@
import numpy as np
import pandas as pd
from tqdm import tqdm
-from copy import deepcopy
from typing import List, Union
from abc import abstractmethod
from scipy.stats import rankdata
@@ -20,18 +19,14 @@ def __init__(self):
        ...

    def fit(self, X, y=None):
+        self.is_fitted_ = True
        return self

    @abstractmethod
    def transform(
-        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
+        self, X: Union[np.array, pd.DataFrame], y=None
    ) -> pd.DataFrame:
        ...
-
-    def __call__(
-        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
-    ) -> pd.DataFrame:
-        return self.transform(X=X, y=y, **kwargs)

    @abstractmethod
    def get_feature_names_out(self, input_features=None) -> List[str]:
@@ -55,47 +50,48 @@ def __init__(
        self.ridge = Ridge(fit_intercept=False)
        self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1]

-    def fit(self, X: pd.DataFrame, y: pd.Series, eras: pd.Series):
+    def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series):
        """
        Fit Bayesian Gaussian Mixture model on coefficients and normalize.
        :param X: DataFrame containing features.
        :param y: Series containing real target.
-        :param eras: Series containing era information.
+        :param era_series: Series containing era information.
        """
        bgmm = BayesianGaussianMixture(n_components=self.n_components)
-        coefs = self._get_coefs(dataf=X, y=y, eras=eras)
+        coefs = self._get_coefs(dataf=X, y=y, era_series=era_series)
        bgmm.fit(coefs)
        # make probability of sampling each component equal to better balance rare regimes
        bgmm.weights_[:] = 1 / self.n_components
        self.bgmm_ = bgmm
+        self.is_fitted_ = True
        return self

-    def transform(self, X: pd.DataFrame, eras: pd.Series) -> np.array:
+    def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array:
        """
        Main method for generating fake target.
        :param X: DataFrame containing features.
-        :param eras: Series containing era information.
+        :param era_series: Series containing era information.
        """
        check_is_fitted(self, "bgmm_")
-        assert len(X) == len(eras), "X and eras must be same length."
-        all_eras = eras.unique().tolist()
+        assert len(X) == len(era_series), "X and era_series must be same length."
diff --git a/numerblox/targets.py b/numerblox/targets.py
index 821c34c..fffdb1c 100644
--- a/numerblox/targets.py
+++ b/numerblox/targets.py
@@ -1,7 +1,6 @@
 import numpy as np
 import pandas as pd
 from tqdm import tqdm
-from copy import deepcopy
 from typing import List, Union
 from abc import abstractmethod
 from scipy.stats import rankdata
@@ -20,18 +19,14 @@ def __init__(self):
         ...
 
     def fit(self, X, y=None):
+        self.is_fitted_ = True
         return self
 
     @abstractmethod
     def transform(
-        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
+        self, X: Union[np.array, pd.DataFrame], y=None
     ) -> pd.DataFrame:
         ...
-
-    def __call__(
-        self, X: Union[np.array, pd.DataFrame], y=None, **kwargs
-    ) -> pd.DataFrame:
-        return self.transform(X=X, y=y, **kwargs)
 
     @abstractmethod
     def get_feature_names_out(self, input_features=None) -> List[str]:
@@ -55,47 +50,48 @@ def __init__(
         self.ridge = Ridge(fit_intercept=False)
         self.bins = [0, 0.05, 0.25, 0.75, 0.95, 1]
 
-    def fit(self, X: pd.DataFrame, y: pd.Series, eras: pd.Series):
+    def fit(self, X: pd.DataFrame, y: pd.Series, era_series: pd.Series):
         """
         Fit Bayesian Gaussian Mixture model on coefficients and normalize.
         :param X: DataFrame containing features.
         :param y: Series containing real target.
-        :param eras: Series containing era information.
+        :param era_series: Series containing era information.
         """
         bgmm = BayesianGaussianMixture(n_components=self.n_components)
-        coefs = self._get_coefs(dataf=X, y=y, eras=eras)
+        coefs = self._get_coefs(dataf=X, y=y, era_series=era_series)
         bgmm.fit(coefs)
         # make probability of sampling each component equal to better balance rare regimes
         bgmm.weights_[:] = 1 / self.n_components
         self.bgmm_ = bgmm
+        self.is_fitted_ = True
         return self
 
-    def transform(self, X: pd.DataFrame, eras: pd.Series) -> np.array:
+    def transform(self, X: pd.DataFrame, era_series: pd.Series) -> np.array:
         """
         Main method for generating fake target.
         :param X: DataFrame containing features.
-        :param eras: Series containing era information.
+        :param era_series: Series containing era information.
         """
         check_is_fitted(self, "bgmm_")
-        assert len(X) == len(eras), "X and eras must be same length."
-        all_eras = eras.unique().tolist()
+        assert len(X) == len(era_series), "X and era_series must be same length."
+        all_eras = era_series.unique().tolist()
         # Scale data between 0 and 1
         X = X.astype(float)
         X /= X.max()
         X -= 0.5
-        X.loc[:, 'era'] = eras
+        X.loc[:, 'era'] = era_series
         fake_target = self._generate_target(dataf=X, all_eras=all_eras)
         return fake_target
 
-    def _get_coefs(self, dataf: pd.DataFrame, y: pd.Series, eras: pd.Series) -> np.ndarray:
+    def _get_coefs(self, dataf: pd.DataFrame, y: pd.Series, era_series: pd.Series) -> np.ndarray:
         """
         Generate coefficients for BGMM.
         :param dataf: DataFrame containing features.
         :param y: Series containing real target.
+        :param era_series: Series containing era information.
         """
         coefs = []
-        dataf.loc[:, 'era'] = eras
+        dataf.loc[:, 'era'] = era_series
         dataf.loc[:, 'target'] = y
         all_eras = dataf['era'].unique().tolist()
         for era in all_eras:
@@ -155,12 +151,12 @@ def __init__(
         self.bins = bins if bins else [0, 0.05, 0.25, 0.75, 0.95, 1]
         self.labels = labels if labels else [0, 0.25, 0.50, 0.75, 1]
 
-    def transform(self, dataf: pd.DataFrame, eras: pd.Series) -> np.array:
+    def transform(self, dataf: pd.DataFrame, era_series: pd.Series) -> np.array:
         for window in tqdm(self.windows, desc="Signals target engineering windows"):
             dataf.loc[:, f"target_{window}d_raw"] = (
                 dataf[self.price_col].pct_change(periods=window).shift(-window)
             )
-            era_groups = dataf.groupby(eras)
+            era_groups = dataf.groupby(era_series)
 
             dataf.loc[:, f"target_{window}d_rank"] = era_groups[
                 f"target_{window}d_raw"
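The same renaming applies to the target processors: every `eras=` keyword becomes `era_series=`. A small sketch of the updated `BayesianGMMTargetProcessor` flow on synthetic data (all names and shapes are illustrative):

```py
import numpy as np
import pandas as pd
from numerblox.targets import BayesianGMMTargetProcessor

X = pd.DataFrame(np.random.uniform(size=(100, 2)), columns=["f1", "f2"])
y = pd.Series(np.random.uniform(size=100))
era_series = pd.Series(["era1"] * 50 + ["era2"] * 50)

bgmm = BayesianGMMTargetProcessor(n_components=2)
bgmm.fit(X, y, era_series=era_series)                   # was: eras=...
fake_target = bgmm.transform(X, era_series=era_series)  # synthetic target per era
```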
"sha256:7a9ebb85bfc9dd964490612b6fee2afbde91eee6bfaa590b731c7868d225210b"}, +] + +[package.extras] +adbc = ["adbc-driver-manager", "adbc-driver-sqlite"] +all = ["polars[adbc,cloudpickle,connectorx,deltalake,fastexcel,fsspec,gevent,numpy,pandas,plot,pyarrow,pydantic,pyiceberg,sqlalchemy,timezone,xlsx2csv,xlsxwriter]"] +cloudpickle = ["cloudpickle"] +connectorx = ["connectorx (>=0.3.2)"] +deltalake = ["deltalake (>=0.14.0)"] +fastexcel = ["fastexcel (>=0.9)"] +fsspec = ["fsspec"] +gevent = ["gevent"] +matplotlib = ["matplotlib"] +numpy = ["numpy (>=1.16.0)"] +openpyxl = ["openpyxl (>=3.0.0)"] +pandas = ["pandas", "pyarrow (>=7.0.0)"] +plot = ["hvplot (>=0.9.1)"] +pyarrow = ["pyarrow (>=7.0.0)"] +pydantic = ["pydantic"] +pyiceberg = ["pyiceberg (>=0.5.0)"] +pyxlsb = ["pyxlsb (>=1.0)"] +sqlalchemy = ["pandas", "sqlalchemy"] +timezone = ["backports-zoneinfo", "tzdata"] +xlsx2csv = ["xlsx2csv (>=0.8.0)"] +xlsxwriter = ["xlsxwriter"] + [[package]] name = "proto-plus" version = "1.23.0" @@ -2679,50 +2717,45 @@ pyasn1 = ">=0.1.3" [[package]] name = "scikit-learn" -version = "1.3.2" +version = "1.4.1.post1" description = "A set of python modules for machine learning and data mining" optional = false -python-versions = ">=3.8" +python-versions = ">=3.9" files = [ - {file = "scikit-learn-1.3.2.tar.gz", hash = "sha256:a2f54c76accc15a34bfb9066e6c7a56c1e7235dda5762b990792330b52ccfb05"}, - {file = "scikit_learn-1.3.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:e326c0eb5cf4d6ba40f93776a20e9a7a69524c4db0757e7ce24ba222471ee8a1"}, - {file = "scikit_learn-1.3.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:535805c2a01ccb40ca4ab7d081d771aea67e535153e35a1fd99418fcedd1648a"}, - {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1215e5e58e9880b554b01187b8c9390bf4dc4692eedeaf542d3273f4785e342c"}, - {file = "scikit_learn-1.3.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:0ee107923a623b9f517754ea2f69ea3b62fc898a3641766cb7deb2f2ce450161"}, - {file = "scikit_learn-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:35a22e8015048c628ad099da9df5ab3004cdbf81edc75b396fd0cff8699ac58c"}, - {file = "scikit_learn-1.3.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6fb6bc98f234fda43163ddbe36df8bcde1d13ee176c6dc9b92bb7d3fc842eb66"}, - {file = "scikit_learn-1.3.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:18424efee518a1cde7b0b53a422cde2f6625197de6af36da0b57ec502f126157"}, - {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:3271552a5eb16f208a6f7f617b8cc6d1f137b52c8a1ef8edf547db0259b2c9fb"}, - {file = "scikit_learn-1.3.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:fc4144a5004a676d5022b798d9e573b05139e77f271253a4703eed295bde0433"}, - {file = "scikit_learn-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:67f37d708f042a9b8d59551cf94d30431e01374e00dc2645fa186059c6c5d78b"}, - {file = "scikit_learn-1.3.2-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:8db94cd8a2e038b37a80a04df8783e09caac77cbe052146432e67800e430c028"}, - {file = "scikit_learn-1.3.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:61a6efd384258789aa89415a410dcdb39a50e19d3d8410bd29be365bcdd512d5"}, - {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:cb06f8dce3f5ddc5dee1715a9b9f19f20d295bed8e3cd4fa51e1d050347de525"}, - {file = "scikit_learn-1.3.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:5b2de18d86f630d68fe1f87af690d451388bb186480afc719e5f770590c2ef6c"}, - {file = "scikit_learn-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:0402638c9a7c219ee52c94cbebc8fcb5eb9fe9c773717965c1f4185588ad3107"}, - {file = "scikit_learn-1.3.2-cp38-cp38-macosx_10_9_x86_64.whl", hash = "sha256:a19f90f95ba93c1a7f7924906d0576a84da7f3b2282ac3bfb7a08a32801add93"}, - {file = "scikit_learn-1.3.2-cp38-cp38-macosx_12_0_arm64.whl", hash = "sha256:b8692e395a03a60cd927125eef3a8e3424d86dde9b2370d544f0ea35f78a8073"}, - {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:15e1e94cc23d04d39da797ee34236ce2375ddea158b10bee3c343647d615581d"}, - {file = "scikit_learn-1.3.2-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:785a2213086b7b1abf037aeadbbd6d67159feb3e30263434139c98425e3dcfcf"}, - {file = "scikit_learn-1.3.2-cp38-cp38-win_amd64.whl", hash = "sha256:64381066f8aa63c2710e6b56edc9f0894cc7bf59bd71b8ce5613a4559b6145e0"}, - {file = "scikit_learn-1.3.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:6c43290337f7a4b969d207e620658372ba3c1ffb611f8bc2b6f031dc5c6d1d03"}, - {file = "scikit_learn-1.3.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:dc9002fc200bed597d5d34e90c752b74df516d592db162f756cc52836b38fe0e"}, - {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:1d08ada33e955c54355d909b9c06a4789a729977f165b8bae6f225ff0a60ec4a"}, - {file = "scikit_learn-1.3.2-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:763f0ae4b79b0ff9cca0bf3716bcc9915bdacff3cebea15ec79652d1cc4fa5c9"}, - {file = "scikit_learn-1.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:ed932ea780517b00dae7431e031faae6b49b20eb6950918eb83bd043237950e0"}, -] - -[package.dependencies] -joblib = ">=1.1.1" -numpy = ">=1.17.3,<2.0" -scipy = ">=1.5.0" + {file = "scikit-learn-1.4.1.post1.tar.gz", hash = "sha256:93d3d496ff1965470f9977d05e5ec3376fb1e63b10e4fda5e39d23c2d8969a30"}, + {file = "scikit_learn-1.4.1.post1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:c540aaf44729ab5cd4bd5e394f2b375e65ceaea9cdd8c195788e70433d91bbc5"}, + {file = "scikit_learn-1.4.1.post1-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:4310bff71aa98b45b46cd26fa641309deb73a5d1c0461d181587ad4f30ea3c36"}, + {file = "scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:9f43dd527dabff5521af2786a2f8de5ba381e182ec7292663508901cf6ceaf6e"}, + {file = "scikit_learn-1.4.1.post1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c02e27d65b0c7dc32f2c5eb601aaf5530b7a02bfbe92438188624524878336f2"}, + {file = "scikit_learn-1.4.1.post1-cp310-cp310-win_amd64.whl", hash = "sha256:629e09f772ad42f657ca60a1a52342eef786218dd20cf1369a3b8d085e55ef8f"}, + {file = "scikit_learn-1.4.1.post1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6145dfd9605b0b50ae72cdf72b61a2acd87501369a763b0d73d004710ebb76b5"}, + {file = "scikit_learn-1.4.1.post1-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:1afed6951bc9d2053c6ee9a518a466cbc9b07c6a3f9d43bfe734192b6125d508"}, + {file = "scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ce03506ccf5f96b7e9030fea7eb148999b254c44c10182ac55857bc9b5d4815f"}, + {file = "scikit_learn-1.4.1.post1-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:4ba516fcdc73d60e7f48cbb0bccb9acbdb21807de3651531208aac73c758e3ab"}, + {file = "scikit_learn-1.4.1.post1-cp311-cp311-win_amd64.whl", 
hash = "sha256:78cd27b4669513b50db4f683ef41ea35b5dddc797bd2bbd990d49897fd1c8a46"}, + {file = "scikit_learn-1.4.1.post1-cp312-cp312-macosx_10_9_x86_64.whl", hash = "sha256:a1e289f33f613cefe6707dead50db31930530dc386b6ccff176c786335a7b01c"}, + {file = "scikit_learn-1.4.1.post1-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:0df87de9ce1c0140f2818beef310fb2e2afdc1e66fc9ad587965577f17733649"}, + {file = "scikit_learn-1.4.1.post1-cp312-cp312-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:712c1c69c45b58ef21635360b3d0a680ff7d83ac95b6f9b82cf9294070cda710"}, + {file = "scikit_learn-1.4.1.post1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:1754b0c2409d6ed5a3380512d0adcf182a01363c669033a2b55cca429ed86a81"}, + {file = "scikit_learn-1.4.1.post1-cp312-cp312-win_amd64.whl", hash = "sha256:1d491ef66e37f4e812db7e6c8286520c2c3fc61b34bf5e59b67b4ce528de93af"}, + {file = "scikit_learn-1.4.1.post1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:aa0029b78ef59af22cfbd833e8ace8526e4df90212db7ceccbea582ebb5d6794"}, + {file = "scikit_learn-1.4.1.post1-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:14e4c88436ac96bf69eb6d746ac76a574c314a23c6961b7d344b38877f20fee1"}, + {file = "scikit_learn-1.4.1.post1-cp39-cp39-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:d7cd3a77c32879311f2aa93466d3c288c955ef71d191503cf0677c3340ae8ae0"}, + {file = "scikit_learn-1.4.1.post1-cp39-cp39-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:2a3ee19211ded1a52ee37b0a7b373a8bfc66f95353af058a210b692bd4cda0dd"}, + {file = "scikit_learn-1.4.1.post1-cp39-cp39-win_amd64.whl", hash = "sha256:234b6bda70fdcae9e4abbbe028582ce99c280458665a155eed0b820599377d25"}, +] + +[package.dependencies] +joblib = ">=1.2.0" +numpy = ">=1.19.5,<2.0" +scipy = ">=1.6.0" threadpoolctl = ">=2.0.0" [package.extras] -benchmark = ["matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "pandas (>=1.0.5)"] -docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.1.3)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.10.1)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] -examples = ["matplotlib (>=3.1.3)", "pandas (>=1.0.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.16.2)", "seaborn (>=0.9.0)"] -tests = ["black (>=23.3.0)", "matplotlib (>=3.1.3)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.0.5)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.16.2)"] +benchmark = ["matplotlib (>=3.3.4)", "memory-profiler (>=0.57.0)", "pandas (>=1.1.5)"] +docs = ["Pillow (>=7.1.2)", "matplotlib (>=3.3.4)", "memory-profiler (>=0.57.0)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)", "sphinx (>=6.0.0)", "sphinx-copybutton (>=0.5.2)", "sphinx-gallery (>=0.15.0)", "sphinx-prompt (>=1.3.0)", "sphinxext-opengraph (>=0.4.2)"] +examples = ["matplotlib (>=3.3.4)", "pandas (>=1.1.5)", "plotly (>=5.14.0)", "pooch (>=1.6.0)", "scikit-image (>=0.17.2)", "seaborn (>=0.9.0)"] +tests = ["black (>=23.3.0)", "matplotlib (>=3.3.4)", "mypy (>=1.3)", "numpydoc (>=1.2.0)", "pandas (>=1.1.5)", "polars (>=0.19.12)", "pooch (>=1.6.0)", "pyamg (>=4.0.0)", "pyarrow (>=12.0.0)", "pytest (>=7.1.2)", "pytest-cov (>=2.9.0)", "ruff (>=0.0.272)", "scikit-image (>=0.17.2)"] 
 
 [[package]]
 name = "scikit-lego"
@@ -3323,4 +3356,4 @@ optional = []
 [metadata]
 lock-version = "2.0"
 python-versions = ">=3.9,<3.12"
-content-hash = "f8a897fe5aa55ef566be82680244872e80228c0b7a43bb9b7f040c69105f3258"
+content-hash = "8fad8c43684a0426e4b86376c5cffb7970a6f345ac9bb3128177da591a2bce3e"
diff --git a/pyproject.toml b/pyproject.toml
index 6f35147..dcdc09b 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "numerblox"
-version = "1.2.2"
+version = "1.3.0"
 description = "Solid Numerai Pipelines"
 authors = ["CrowdCent "]
 license = "MIT License"
@@ -18,11 +18,12 @@ joblib = "^1.3.2"
 pyarrow = "^14.0.1"
 numerapi = "^2.17.0"
 matplotlib = "^3.4.0"
-scikit-learn = "^1.3.2"
+scikit-learn = "^1.4.1"
 python-dateutil = "^2.8.2"
 google-cloud-storage = "^2.11.0"
 numerai-era-data = "^0.1.1"
-numerai-tools = "^0.0.19"
+numerai-tools = "^0.0.20"
+polars = "^0.20.16"
 
 [tool.poetry.group.dev.dependencies]
 pytest = "^7.4.2"
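The scikit-learn bump to ^1.4 is what the rest of this changeset leans on: the `set_transform_request` / `set_predict_request` calls are scikit-learn's metadata-routing API, and they are only usable once routing is switched on for the process. A minimal sketch of the opt-in:

```py
import sklearn

# Metadata routing is off by default; the set_*_request methods
# raise a RuntimeError until it is enabled.
sklearn.set_config(enable_metadata_routing=True)

from numerblox.ensemble import NumeraiEnsemble

ens = NumeraiEnsemble()
# Declare that this step consumes `era_series` in transform, so a
# surrounding pipeline forwards a single `era_series=...` kwarg to it.
ens.set_transform_request(era_series=True)
```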
diff --git a/tests/test_end_to_end.py b/tests/test_end_to_end.py
index bfd7a63..ad9a992 100644
--- a/tests/test_end_to_end.py
+++ b/tests/test_end_to_end.py
@@ -3,7 +3,7 @@
 from sklego.preprocessing import ColumnSelector
 from sklearn.tree import DecisionTreeClassifier
 from sklearn.model_selection import TimeSeriesSplit
-from sklearn.pipeline import make_union, make_pipeline, FeatureUnion
+from sklearn.pipeline import make_union, make_pipeline
 from sklearn.compose import ColumnTransformer
 
 from numerblox.numerframe import create_numerframe
@@ -22,26 +22,31 @@ def test_neutralized_xgboost_pipeline(setup_data):
     X, y = df.get_feature_target_pair(multi_target=False)
     fncv3_cols = df.get_fncv3_feature_data.columns.tolist()
-    eras = df.get_era_data
+    era_series = df.get_era_data
     features = df.get_feature_data
 
     # Preprocessing
     gpp = GroupStatsPreProcessor(groups=['sunshine', 'rain'])
     fncv3_selector = ColumnSelector(fncv3_cols)
-    preproc_pipe = make_union(gpp, fncv3_selector)
+    # TODO Test with preproc FeatureUnion
+    preproc_pipe = ColumnTransformer([
+        ("gpp", gpp, features.columns.tolist()),
+        ("selector", fncv3_selector, fncv3_cols)
+    ])
 
     # Model
     xgb = XGBRegressor()
     cve = CrossValEstimator(estimator=xgb, cv=TimeSeriesSplit(n_splits=5))
-    fn = FeatureNeutralizer(proportion=0.5)
     ens = NumeraiEnsemble()
+    ens.set_transform_request(era_series=True)
+    fn = FeatureNeutralizer(proportion=0.5)
+    fn.set_predict_request(era_series=True, features=True)
     full_pipe = make_meta_pipeline(preproc_pipe, cve, ens, fn)
 
     # Train full model
-    full_pipe.fit(X, y, numeraiensemble__eras=eras)
-
+    full_pipe.fit(X, y, era_series=era_series)
     # Inference
-    preds = full_pipe.predict(X, eras=eras, features=features)
+    preds = full_pipe.predict(X, era_series=era_series, features=features)
     assert preds.min() >= 0
     assert abs(preds.max() - 1) <= 1e-9
     assert preds.shape[0] == X.shape[0]
@@ -50,37 +55,44 @@ def test_multi_classification_ensemble(setup_data):
     df = setup_data
     X, y = df.get_feature_target_pair(multi_target=False)
-    eras = df.get_era_data
+    era_series = df.get_era_data
     features = df.get_feature_data
     fncv3_cols = df.get_fncv3_feature_data.columns.tolist()
 
-    preproc_pipe = make_union(GroupStatsPreProcessor(groups=['sunshine', 'rain']), ColumnSelector(fncv3_cols))
+    # TODO Test with preproc FeatureUnion in sklearn 1.5+
+    preproc_pipe = ColumnTransformer([
+        ("gpp", GroupStatsPreProcessor(groups=['sunshine', 'rain']), features.columns.tolist()),
+        ("selector", ColumnSelector(fncv3_cols), fncv3_cols)
+    ])
 
     model = DecisionTreeClassifier()
     crossval = CrossValEstimator(estimator=model, cv=TimeSeriesSplit(n_splits=3), predict_func='predict_proba')
     pred_rud = PredictionReducer(n_models=3, n_classes=5)
     ens = NumeraiEnsemble(donate_weighted=True)
-    neut = FeatureNeutralizer(proportion=0.5)
-    full_pipe = make_meta_pipeline(preproc_pipe, crossval, pred_rud, ens, neut)
+    ens.set_transform_request(era_series=True)
+    fn = FeatureNeutralizer(proportion=0.5)
+    fn.set_predict_request(era_series=True, features=True)
+    full_pipe = make_meta_pipeline(preproc_pipe, crossval, pred_rud, ens, fn)
 
     y_int = (y * 4).astype(int)
-    full_pipe.fit(X, y_int, numeraiensemble__eras=eras)
+    full_pipe.fit(X, y_int, era_series=era_series)
 
-    preds = full_pipe.predict(X, eras=eras, features=features)
+    preds = full_pipe.predict(X, era_series=era_series, features=features)
     assert preds.min() >= 0
     assert abs(preds.max() - 1) <= 1e-9
     assert preds.shape[0] == X.shape[0]
     assert len(preds.shape) == 2
 
+@pytest.mark.xfail(reason="Can only be tested with sklearn 1.5+")
 def test_feature_union_pipeline(setup_data):
     df = setup_data
     X, y = df.get_feature_target_pair(multi_target=False)
-    eras = df.get_era_data
+    era_series = df.get_era_data
     features = df.get_feature_data
     fncv3_cols = df.get_fncv3_feature_data.columns.tolist()
 
     gpp = GroupStatsPreProcessor(groups=['sunshine', 'rain'])
     fncv3_selector = ColumnSelector(fncv3_cols)
-    preproc_pipe = FeatureUnion([("gpp", gpp), ("selector", fncv3_selector)])
+    preproc_pipe = make_union(gpp, fncv3_selector)
 
     xgb = MetaEstimator(XGBRegressor())
     fn = FeatureNeutralizer(proportion=0.5)
@@ -88,7 +100,7 @@ def test_feature_union_pipeline(setup_data):
 
     model_pipe.fit(X, y)
 
-    preds = model_pipe.predict(X, eras=eras, features=features)
+    preds = model_pipe.predict(X, era_series=era_series, features=features)
     assert preds.min() >= 0
     assert abs(preds.max() - 1) <= 1e-9
     assert preds.shape[0] == X.shape[0]
@@ -97,7 +109,7 @@ def test_column_transformer_pipeline(setup_data):
     df = setup_data
 
     X, y = df.get_feature_target_pair(multi_target=False)
-    eras = df.get_era_data
+    era_series = df.get_era_data
     features = df.get_feature_data
     fncv3_cols = df.get_fncv3_feature_data.columns.tolist()
 
@@ -106,11 +118,12 @@ def test_column_transformer_pipeline(setup_data):
         ("selector", "passthrough", fncv3_cols[2:])])
     xgb = MetaEstimator(XGBRegressor())
     fn = FeatureNeutralizer(proportion=0.5)
+    fn.set_predict_request(era_series=True, features=True)
     model_pipe = make_pipeline(preproc_pipe, xgb, fn)
 
     model_pipe.fit(X, y)
 
-    preds = model_pipe.predict(X, eras=eras, features=features)
+    preds = model_pipe.predict(X, era_series=era_series, features=features)
     assert preds.min() >= 0
     assert abs(preds.max() - 1) <= 1e-9
     assert preds.shape[0] == X.shape[0]
diff --git a/tests/test_ensemble.py b/tests/test_ensemble.py
index 5aaa10b..e893489 100644
--- a/tests/test_ensemble.py
+++ b/tests/test_ensemble.py
@@ -15,7 +15,7 @@ def sample_data():
 
 @pytest.fixture
 def ensemble():
-    return NumeraiEnsemble()
+    return NumeraiEnsemble().set_transform_request(era_series=True)
 
 ##### NumeraiEnsemble #####
@@ -32,7 +32,7 @@ def test_numeraiensemble_predict(ensemble, sample_data):
     eras = np.array([1]*50 + [2]*50)
 
     input_preds = np.random.uniform(size=(100, 5))
-    ensemble_preds = ensemble.predict(input_preds, eras)
+    ensemble_preds = ensemble.predict(input_preds, era_series=eras)
     # The length of output should have the same shape as input preds
     assert len(ensemble_preds) == len(input_preds)
     # Output should be a numpy array with values between 0 and 1
@@ -82,7 +82,7 @@ def test_numeraiensemble_standardize_by_era(ensemble):
 
 def test_numeraiensemble_predict_with_constant_values(ensemble):
     # Create an instance of your ensemble with mock estimators
-    eras = np.random.randint(1, 5, size=100)
+    era_series = np.random.randint(1, 5, size=100)
 
     X_fit = np.random.rand(100, 3)
     y_fit = np.random.rand(100)
@@ -92,11 +92,11 @@ def test_numeraiensemble_predict_with_constant_values(ensemble):
 
     with pytest.raises(ValueError, match="Predictions for all columns are constant. No valid predictions to ensemble."):
         with pytest.warns(UserWarning, match="Some estimator predictions are constant. Consider checking your estimators. Skipping these estimator predictions in ensembling."):
-            ensemble.predict(constant_preds, eras)
+            ensemble.predict(constant_preds, era_series)
 
 def test_numeraiensemble_predict_with_nans(ensemble):
     # Create an instance of your ensemble with mock estimators
-    eras = np.random.randint(1, 5, size=100)
+    era_series = np.random.randint(1, 5, size=100)
 
     X_fit = np.random.rand(100, 3)
     y_fit = np.random.rand(100)
@@ -107,7 +107,7 @@ def test_numeraiensemble_predict_with_nans(ensemble):
     nan_preds[:5, 1] = np.nan
 
     with pytest.warns(UserWarning, match="Predictions in column"):
-        ensemble_preds = ensemble.predict(nan_preds, eras)
+        ensemble_preds = ensemble.predict(nan_preds, era_series)
     assert len(ensemble_preds) == len(nan_preds)
     # Output should be a numpy array with values between 0 and 1
     assert isinstance(ensemble_preds, np.ndarray)
@@ -141,15 +141,15 @@ def test_numeraiensemble_get_feature_names_out(ensemble):
 
 def test_numeraiensemble_set_output(ensemble, sample_data):
     X, y = sample_data
-    eras = np.array([1]*50 + [2]*50)
+    era_series = np.array([1]*50 + [2]*50)
     ens_ins = ensemble
     ens_ins.fit(X, y)
 
     ens_ins.set_output(transform="pandas")
-    preds = ens_ins.predict(X, eras=eras)
+    preds = ens_ins.predict(X, era_series=era_series)
     assert isinstance(preds, pd.DataFrame)
 
     ens_ins.set_output(transform="default")
-    preds = ens_ins.predict(X, eras=eras)
+    preds = ens_ins.predict(X, era_series=era_series)
     assert isinstance(preds, np.ndarray)
 
 ##### PredictionReducer #####
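Outside a pipeline the ensemble can still be driven directly; the tests above pin down the shape contract. A sketch, assuming as the tests do that `fit` is just bookkeeping and that the `X` passed to `predict` is a matrix of per-model prediction columns:

```py
import numpy as np
import sklearn

sklearn.set_config(enable_metadata_routing=True)
from numerblox.ensemble import NumeraiEnsemble

ens = NumeraiEnsemble().set_transform_request(era_series=True)
ens.fit(np.random.rand(100, 3), np.random.rand(100))

era_series = np.array([1] * 50 + [2] * 50)
preds = np.random.uniform(size=(100, 5))   # columns = candidate models
ensemble_preds = ens.predict(preds, era_series=era_series)
assert ensemble_preds.shape[0] == 100      # one ensembled value per row
```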
diff --git a/tests/test_meta.py b/tests/test_meta.py
index d8622cd..4e0bb1d 100644
--- a/tests/test_meta.py
+++ b/tests/test_meta.py
@@ -259,19 +259,21 @@ def test_binary_class_postprocess():
 ##### MetaPipeline #####
 
 class MockTransform(BaseEstimator, TransformerMixin):
-    """A mock transformer that requires 'eras' as an argument in its transform method."""
+    """A mock transformer that requires 'era_series' as an argument in its transform method."""
     def fit(self, X, y=None):
         return self
 
-    def predict(self, X, eras):
-        return self.transform(X, eras)
+    def predict(self, X, era_series):
+        assert era_series is not None, "era_series should be provided."
+        return self.transform(X, era_series)
 
 class MockFinalStep(BaseEstimator, RegressorMixin):
-    """A mock final step for the pipeline that requires 'features' and 'eras' in its predict method."""
+    """A mock final step for the pipeline that requires 'features' and 'era_series' in its predict method."""
     def fit(self, X, y=None):
         return self
 
-    def predict(self, X, features, eras):
+    def predict(self, X, features, era_series):
+        assert features is not None and era_series is not None, "features and era_series should be provided."
         return X
 
 class MockEstimator:
@@ -286,49 +288,54 @@ def predict(self, X):
 
 def test_feature_neutralizer_pipeline(setup_data):
     lr1 = Ridge()
     fn = FeatureNeutralizer(proportion=0.5)
+    fn.set_predict_request(features=True, era_series=True)
     pipeline = make_meta_pipeline(lr1, fn)
 
     X, y = setup_data[["feature1", "feature2"]], setup_data["target"]
     pipeline.fit(X, y)
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
 
-    result = pipeline.predict(X, features=X, eras=eras)
+    result = pipeline.predict(X, features=X, era_series=era_series)
     assert isinstance(result, np.ndarray)
     assert len(result) == len(setup_data)
     assert result.min() >= 0
     assert result.max() <= 1
 
 def test_meta_pipeline_missing_eras(setup_data):
-    # Create a pipeline where a step requires the 'eras' argument.
+    # Create a pipeline where a step requires the 'era_series' argument.
     steps = [("mock_transform", MockTransform()), ("final_step", MockFinalStep())]
     pipeline = MetaPipeline(steps)
 
     X = setup_data[["feature1", "feature2"]]
     y = setup_data["target"]
 
-    # Predict without providing 'eras' should raise a TypeError from MetaEstimator.
+    # Predict without providing 'era_series' should raise a TypeError from MetaEstimator.
     with pytest.raises(TypeError):
         pipeline.fit(X, y).predict(X, features=[])
 
 def test_meta_pipeline_missing_features(setup_data):
-    # Create a pipeline with a final step that requires 'features' and 'eras' arguments.
-    steps = [("ridge", Ridge()), ("final_step", MockFinalStep())]
+    # Create a pipeline with a final step that requires 'features' and 'era_series' arguments.
+    final_step = MockFinalStep()
+    final_step.set_predict_request(features=True, era_series=True)
+    steps = [("ridge", Ridge()), ("final_step", final_step)]
     pipeline = MetaPipeline(steps)
 
     X = setup_data[["feature1", "feature2"]]
     y = setup_data["target"]
 
     # Predict without providing 'features' should raise an error.
     with pytest.raises(TypeError, match=re.escape("predict() missing 1 required positional argument: 'features'")):
-        pipeline.fit(X, y).predict(X, eras=[])
+        pipeline.fit(X, y).predict(X, era_series=[])
 
 def test_meta_pipeline_missing_eras_for_final_step(setup_data):
-    # Create a pipeline with a final step that requires 'features' and 'eras' arguments.
-    steps = [("ridge", Ridge()), ("final_step", MockFinalStep())]
+    # Create a pipeline with a final step that requires 'features' and 'era_series' arguments.
+    final_step = MockFinalStep()
+    final_step.set_predict_request(features=True, era_series=True)
+    steps = [("ridge", Ridge()), ("final_step", final_step)]
     pipeline = MetaPipeline(steps)
 
     X = setup_data[["feature1", "feature2"]]
     y = setup_data["target"]
 
-    # Predict without providing 'eras' for the final step should raise an error.
-    with pytest.raises(TypeError, match=re.escape("predict() missing 1 required positional argument: 'eras'")):
+    # Predict without providing 'era_series' for the final step should raise an error.
+    with pytest.raises(TypeError, match=re.escape("predict() missing 1 required positional argument: 'era_series'")):
         pipeline.fit(X, y).predict(X, features=[])
 
 def test_do_not_wrap_transformer():
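When a custom step should take part in routing, declaring the request is all `MetaPipeline` needs in order to pass the metadata through, and the declared requests can be inspected afterwards — which is what the neutralizer tests below rely on. A short sketch:

```py
import sklearn
sklearn.set_config(enable_metadata_routing=True)

from numerblox.neutralizers import FeatureNeutralizer

fn = FeatureNeutralizer()
fn.set_predict_request(era_series=True, features=True)

# consumes() reports which of the given kwargs the method will accept.
routing = fn.get_metadata_routing()
assert routing.consumes("predict", ["era_series", "features"]) == {"era_series", "features"}
```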
diff --git a/tests/test_models.py b/tests/test_models.py
index c2866ab..06c213d 100644
--- a/tests/test_models.py
+++ b/tests/test_models.py
@@ -23,10 +23,10 @@ def test_initialization():
 
 def test_fit_method(setup_data):
     model = EraBoostedXGBRegressor(proportion=0.5, num_iters=5, n_estimators=100, max_depth=3, learning_rate=0.1)
-    X, y, eras = setup_data[["feature1", "feature2"]], setup_data['target'], setup_data['era']
+    X, y, era_series = setup_data[["feature1", "feature2"]], setup_data['target'], setup_data['era']
 
     initial_tree_count = model.n_estimators
-    model.fit(X, y, eras=eras, verbose=500)
+    model.fit(X, y, era_series=era_series, verbose=500)
     assert model.n_estimators > initial_tree_count
 
     # Check if the final number of trees is as expected
@@ -36,8 +36,8 @@ def test_fit_method(setup_data):
 
 def test_predictions(setup_data):
     model = EraBoostedXGBRegressor(num_iters=5, proportion=0.5, n_estimators=100, learning_rate=0.1, max_depth=3)
-    X, y, eras = setup_data[["feature1", "feature2"]], setup_data['target'], setup_data['era']
-    model.fit(X, y, eras=eras)
+    X, y, era_series = setup_data[["feature1", "feature2"]], setup_data['target'], setup_data['era']
+    model.fit(X, y, era_series=era_series)
 
     predictions = model.predict(X)
     assert len(predictions) == len(X)
@@ -53,8 +53,8 @@ def test_get_feature_names_out(setup_data):
     with pytest.raises(NotFittedError):
         model.get_feature_names_out()
 
-    X, y, eras = setup_data[["feature1", "feature2"]], setup_data['target'], setup_data['era']
-    model.fit(X, y, eras=eras)
+    X, y, era_series = setup_data[["feature1", "feature2"]], setup_data['target'], setup_data['era']
+    model.fit(X, y, era_series=era_series)
 
     # Test after fitting
     feature_names = model.get_feature_names_out()
diff --git a/tests/test_neutralizers.py b/tests/test_neutralizers.py
index dc14d6f..6991878 100644
--- a/tests/test_neutralizers.py
+++ b/tests/test_neutralizers.py
@@ -1,23 +1,30 @@
 import pytest
+import sklearn
 import numpy as np
 import pandas as pd
+from sklearn.utils._metadata_requests import MetadataRequest
 
 from numerblox.neutralizers import BaseNeutralizer, FeatureNeutralizer
 
 from utils import create_classic_sample_data
 
+sklearn.set_config(enable_metadata_routing=True)
+
 setup_data = create_classic_sample_data
 
 def test_base_neutralizer_initialization():
     bn = BaseNeutralizer(new_col_names=["test"])
     assert bn.new_col_names == ["test"]
 
+
 def test_base_neutralizer_fit(setup_data):
     obj = BaseNeutralizer(new_col_names=["test"]).fit(setup_data)
     assert isinstance(obj, BaseNeutralizer)
 
 def test_feature_neutralizer_initialization():
     fn = FeatureNeutralizer()
+    fn.set_transform_request(features=True, era_series=True)
+    fn.set_predict_request(features=True, era_series=True)
     assert fn.new_col_names[0].startswith("prediction_neutralized_")
 
     # Proportion must be between 0 and 1
@@ -26,49 +33,62 @@ def test_feature_neutralizer_initialization():
     with pytest.raises(AssertionError):
         FeatureNeutralizer(proportion=[-0.1])
 
+    # Test routing
+    routing = fn.get_metadata_routing()
+    assert isinstance(routing, MetadataRequest)
+    assert routing.consumes("transform", ["features", "era_series"]) == set({"features", "era_series"})
+    assert routing.consumes("predict", ["features", "era_series"]) == set({"features", "era_series"})
+
+
 def test_feature_neutralizer_length_mismatch_X_features(setup_data):
     fn = FeatureNeutralizer()
+    fn.set_transform_request(features=True, era_series=True)
+    fn.set_predict_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     X = setup_data["prediction"][:-1]  # Remove one element to cause mismatch
     with pytest.raises(AssertionError):
-        fn.transform(X, features, eras)
+        fn.transform(X, features=features, era_series=era_series)
 
 def test_feature_neutralizer_length_mismatch_X_eras(setup_data):
     fn = FeatureNeutralizer()
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"][:-1]  # Remove one element to cause mismatch
+    era_series = setup_data["era"][:-1]  # Remove one element to cause mismatch
     X = setup_data["prediction"]
     with pytest.raises(AssertionError):
-        fn.transform(X, features, eras)
+        fn.transform(X, features=features, era_series=era_series)
 
 def test_feature_neutralizer_incorrect_dim_X_single_pred(setup_data):
     fn = FeatureNeutralizer(pred_name=["prediction1", "prediction2"])
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     X = setup_data["prediction"]
     # X is 1D, but two prediction names are provided
     with pytest.raises(AssertionError):
-        fn.transform(X, features, eras)
+        fn.transform(X, features=features, era_series=era_series)
 
 def test_feature_neutralizer_incorrect_dim_X_multi_pred(setup_data):
     fn = FeatureNeutralizer(pred_name=["prediction1", "prediction2"])
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     setup_data["prediction2"] = np.random.uniform(size=len(setup_data))
     X = setup_data[["prediction"]]
     # Only one column provided, but two expected
     with pytest.raises(AssertionError):
-        fn.transform(X, features, eras)
+        fn.transform(X, features=features, era_series=era_series)
 
 def test_feature_neutralizer_predict(setup_data):
     fn = FeatureNeutralizer(pred_name="prediction", proportion=0.5)
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     X = setup_data["prediction"]
-    result = fn.transform(X, features=features, eras=eras)
+    result = fn.transform(X, features=features, era_series=era_series)
     assert len(result) == len(setup_data)
     assert result.shape[1] == 1
     assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0))
@@ -76,11 +96,12 @@ def test_feature_neutralizer_predict(setup_data):
 
 def test_feature_neutralizer_predict_multi_pred(setup_data):
     fn = FeatureNeutralizer(pred_name=["prediction", "prediction2"], proportion=[0.5])
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     setup_data["prediction2"] = np.random.uniform(size=len(setup_data))
     X = setup_data[["prediction", "prediction2"]]
-    result = fn.transform(X, features=features, eras=eras)
+    result = fn.transform(X, features=features, era_series=era_series)
     assert len(result) == len(setup_data)
     assert result.shape[1] == 2
     assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0))
@@ -88,10 +109,11 @@ def test_feature_neutralizer_predict_multi_pred(setup_data):
 
 def test_feature_neutralizer_predict_multi_prop(setup_data):
     fn = FeatureNeutralizer(pred_name="prediction", proportion=[0.5, 0.7])
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     X = setup_data["prediction"]
-    result = fn.transform(X, features=features, eras=eras)
+    result = fn.transform(X, features=features, era_series=era_series)
     assert len(result) == len(setup_data)
     assert result.shape[1] == 2
     assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0))
@@ -99,18 +121,19 @@ def test_feature_neutralizer_predict_multi_prop(setup_data):
 
 def test_feature_neutralizer_multi_pred_multi_prop(setup_data):
     fn = FeatureNeutralizer(pred_name=["prediction", "prediction2"], proportion=[0.5, 0.7, 0.9])
+    fn.set_transform_request(features=True, era_series=True)
     features = setup_data[["feature1", "feature2"]]
-    eras = setup_data["era"]
+    era_series = setup_data["era"]
     setup_data["prediction2"] = np.random.uniform(size=len(setup_data))
     X = setup_data[["prediction", "prediction2"]]
-    result = fn.transform(X, features=features, eras=eras)
+    result = fn.transform(X, features=features, era_series=era_series)
     assert len(result) == len(setup_data)
     assert result.shape[1] == 6
     assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0))
     assert np.all(np.isclose(result, 1, atol=1e-8) | (result <= 1))
 
     # Test with numpy X
-    result = fn.transform(X.to_numpy(), features=features, eras=eras)
+    result = fn.transform(X.to_numpy(), features=features, era_series=era_series)
     assert len(result) == len(setup_data)
     assert result.shape[1] == 6
     assert np.all(np.isclose(result, 0, atol=1e-8) | (result >= 0))
diff --git a/tests/test_penalizers.py b/tests/test_penalizers.py
index 8065f9d..ffbc8fe 100644
--- a/tests/test_penalizers.py
+++ b/tests/test_penalizers.py
@@ -32,10 +32,11 @@ def test_feature_penalizer_get_feature_names_out_with_input_features():
 
 # TODO Fast FeaturePenalizer tests
 # def test_feature_penalizer_predict(setup_data):
 #     fp = FeaturePenalizer(max_exposure=0.5)
+#     fp.set_transform_request(features=True, era_series=True)
 #     features = setup_data[["feature1", "feature2"]]
-#     eras = setup_data["era"]
+#     era_series = setup_data["era"]
 #     X = setup_data["prediction"]
-#     result = fp.predict(X, features=features, eras=eras)
+#     result = fp.predict(X, features=features, era_series=era_series)
 #     assert len(result) == len(setup_data)
 #     assert result['prediction'].min() >= 0
 #     assert result['prediction'].max() <= 1
@@ -46,10 +47,10 @@ def test_feature_penalizer_get_feature_names_out_with_input_features():
 #     pipeline = make_numerai_pipeline(lr1, fp)
 #     pipeline.fit(setup_data[["feature1", "feature2"]], setup_data["target"])
 #     features = setup_data[["feature1", "feature2"]]
-#     eras = setup_data["era"]
+#     era_series = setup_data["era"]
 
 #     result = pipeline.predict(setup_data[["feature1", "feature2"]],
-#                               features=features, eras=eras)
+#                               features=features, era_series=era_series)
 #     assert isinstance(result, np.ndarray)
 #     assert len(result) == len(setup_data)
 #     assert result.min() >= 0
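The multi-prediction tests above encode a broadcasting rule worth spelling out: the neutralizer emits one output column per `pred_name` × `proportion` pair, so two prediction columns with three proportions yield six columns. A toy sketch (column names are illustrative):

```py
import numpy as np
import pandas as pd
from numerblox.neutralizers import FeatureNeutralizer

n = 100
df = pd.DataFrame({
    "p1": np.random.uniform(size=n),
    "p2": np.random.uniform(size=n),
    "f1": np.random.uniform(size=n),
    "f2": np.random.uniform(size=n),
    "era": [1] * (n // 2) + [2] * (n // 2),
})

fn = FeatureNeutralizer(pred_name=["p1", "p2"], proportion=[0.5, 0.7, 0.9])
fn.fit()
out = fn.transform(df[["p1", "p2"]], features=df[["f1", "f2"]], era_series=df["era"])
assert out.shape == (n, 6)  # 2 prediction columns x 3 proportions
```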
diff --git a/tests/test_preprocessing.py b/tests/test_preprocessing.py
index 3672916..1a834b8 100644
--- a/tests/test_preprocessing.py
+++ b/tests/test_preprocessing.py
@@ -1,10 +1,11 @@
 import warnings
 import numpy as np
+import polars as pl
 import pandas as pd
 from tqdm import tqdm
 from sklearn.pipeline import Pipeline, FeatureUnion
 from sklearn.decomposition import PCA
-from sklearn.base import BaseEstimator, TransformerMixin
+from sklearn.base import BaseEstimator, TransformerMixin, check_is_fitted
 
 from numerblox.preprocessing.base import BasePreProcessor
 from numerblox.preprocessing import (ReduceMemoryProcessor,
                                      GroupStatsPreProcessor,
@@ -58,6 +59,7 @@ def test_processors_sklearn(dummy_signals_data):
 
     # Test fit returns self
     assert processor.fit(X=X, y=y) == processor
+    check_is_fitted(processor)
 
     # Inherits from BasePreProcessor
     assert issubclass(processor_cls, BasePreProcessor)
@@ -96,6 +98,12 @@ def test_reduce_memory_preprocessor(dummy_signals_data):
     reduced_data = rmp.transform(dummy_signals_data.to_numpy())
     assert isinstance(reduced_data, np.ndarray)
 
+    # Test polars Output
+    rmp.set_output(transform="polars")
+    reduced_data = rmp.transform(dummy_signals_data)
+    assert isinstance(reduced_data, pl.DataFrame)
+
+
 def test_group_stats_preprocessor():
     # Test with part groups selects
     test_group_processor = GroupStatsPreProcessor(groups=["sunshine", "rain"])
@@ -151,6 +159,10 @@ def test_group_stats_preprocessor():
     result = processor.transform(dataset)
     assert isinstance(result, np.ndarray)
 
+    processor.set_output(transform="polars")
+    result = processor.transform(dataset)
+    assert isinstance(result, pl.DataFrame)
+
     # Test get_feature_names_out
     assert test_group_processor.get_feature_names_out() == expected_cols
     assert test_group_processor.get_feature_names_out(["fancy"]) == ["fancy"]
@@ -183,7 +195,7 @@ def test_era_quantile_processor(dummy_signals_data):
     eqp.set_output(transform="pandas")
     X = dummy_signals_data[["close", "volume"]]
     eqp.fit(X)
-    result = eqp.transform(X, eras=dummy_signals_data["date"])
+    result = eqp.transform(X, era_series=dummy_signals_data["date"])
     quantile_cols = [col for col in result.columns if "quantile" in col]
     assert len(result.columns) == 2
     for col in quantile_cols:
@@ -192,15 +204,19 @@ def test_era_quantile_processor(dummy_signals_data):
     assert eqp.get_feature_names_out() == quantile_cols
 
     # Numpy input
-    result = eqp.transform(X.to_numpy(), eras=dummy_signals_data["date"])
+    result = eqp.transform(X.to_numpy(), era_series=dummy_signals_data["date"])
     assert len(result.shape) == 2
     assert isinstance(result, pd.DataFrame)
 
     # Test set_output API
     eqp.set_output(transform="default")
-    result = eqp.transform(X, eras=dummy_signals_data["date"])
+    result = eqp.transform(X, era_series=dummy_signals_data["date"])
     assert isinstance(result, np.ndarray)
 
+    eqp.set_output(transform="polars")
+    result = eqp.transform(X, era_series=dummy_signals_data["date"])
+    assert isinstance(result, pl.DataFrame)
+
 def test_ticker_mapper():
     # Basic
     test_dataf = pd.Series(["AAPL", "MSFT"])
@@ -219,13 +235,18 @@ def test_ticker_mapper():
     mapper.set_output(transform="default")
     result = mapper.transform(test_dataf)
     assert isinstance(result, np.ndarray)
+
+    mapper.set_output(transform="polars")
+    result = mapper.transform(test_dataf)
+    assert isinstance(result, pl.DataFrame)
+
 
 def test_lag_preprocessor(dummy_signals_data):
     lpp = LagPreProcessor(windows=[20, 40])
     lpp.set_output(transform="pandas")
     lpp.fit(dummy_signals_data[['close', 'volume']])
     # DataFrame input
-    result = lpp.transform(dummy_signals_data[['close', 'volume']], tickers=dummy_signals_data["ticker"])
+    result = lpp.transform(dummy_signals_data[['close', 'volume']], ticker_series=dummy_signals_data["ticker"])
     expected_cols = [
         "close_lag20",
         "close_lag40",
@@ -236,7 +257,7 @@ def test_lag_preprocessor(dummy_signals_data):
     assert lpp.get_feature_names_out() == expected_cols
 
     # Numpy input
-    result = lpp.transform(dummy_signals_data[['close', 'volume']].to_numpy(), tickers=dummy_signals_data["ticker"])
+    result = lpp.transform(dummy_signals_data[['close', 'volume']].to_numpy(), ticker_series=dummy_signals_data["ticker"])
     expected_cols = [
         "0_lag20",
         "0_lag40",
@@ -246,15 +267,20 @@ def test_lag_preprocessor(dummy_signals_data):
 
     # Test set_output API
     lpp.set_output(transform="default")
-    result = lpp.transform(dummy_signals_data[['close', 'volume']], tickers=dummy_signals_data["ticker"])
+    result = lpp.transform(dummy_signals_data[['close', 'volume']], ticker_series=dummy_signals_data["ticker"])
     assert isinstance(result, np.ndarray)
 
+    lpp.set_output(transform="polars")
+    result = lpp.transform(dummy_signals_data[['close', 'volume']], ticker_series=dummy_signals_data["ticker"])
+    assert isinstance(result, pl.DataFrame)
+
+
 def test_difference_preprocessor(dummy_signals_data):
     lpp = LagPreProcessor(windows=[20, 40])
     lpp.set_output(transform="pandas")
     lpp.fit(dummy_signals_data[['close', 'volume']])
     lags = lpp.transform(dummy_signals_data[['close', 'volume']],
-                         tickers=dummy_signals_data["ticker"])
+                         ticker_series=dummy_signals_data["ticker"])
     dpp = DifferencePreProcessor(windows=[20, 40], abs_diff=True)
     dpp.set_output(transform="pandas")
     result = dpp.fit_transform(lags)
@@ -268,6 +294,10 @@ def test_difference_preprocessor(dummy_signals_data):
     result = dpp.transform(lags)
     assert isinstance(result, np.ndarray)
 
+    dpp.set_output(transform="polars")
+    result = dpp.transform(lags)
+    assert isinstance(result, pl.DataFrame)
+
 def test_pandasta_feature_generator(dummy_signals_data):
     ptfg = PandasTaFeatureGenerator()
     result = ptfg.fit_transform(dummy_signals_data)
@@ -295,6 +325,15 @@ def test_hlocv_adjuster_basic(dummy_signals_data):
     assert np.isclose(original_row["open"] / ratio, adjusted_row["adjusted_open"])
     assert np.isclose(original_row["volume"] * ratio, adjusted_row["adjusted_volume"])
 
+    # Test set_output API
+    adjuster.set_output(transform="default")
+    result = adjuster.transform(dummy_signals_data)
+    assert isinstance(result, np.ndarray)
+
+    adjuster.set_output(transform="polars")
+    result = adjuster.transform(dummy_signals_data)
+    assert isinstance(result, pl.DataFrame)
+
 def test_minimum_data_filter(dummy_signals_data):
     before_tickers = dummy_signals_data["ticker"].unique().tolist()
     for tick in ["XYZ.US", "RST.US", "UVW.US"]:
@@ -317,3 +356,7 @@ def test_minimum_data_filter(dummy_signals_data):
     result = filter.transform(dummy_signals_data)
     assert isinstance(result, np.ndarray)
 
+    filter.set_output(transform="polars")
+    result = filter.transform(dummy_signals_data)
+    assert isinstance(result, pl.DataFrame)
+
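Every `set_output(transform="polars")` block above follows the same pattern, available because of the new polars dependency together with the polars output support in scikit-learn >= 1.4. One representative sketch (toy frame, illustrative names):

```py
import numpy as np
import pandas as pd
import polars as pl
from numerblox.preprocessing import ReduceMemoryProcessor

df = pd.DataFrame(np.random.rand(10, 3), columns=["a", "b", "c"])

rmp = ReduceMemoryProcessor()
rmp.fit(df)
rmp.set_output(transform="polars")  # also accepts "pandas" and "default"
result = rmp.transform(df)
assert isinstance(result, pl.DataFrame)
```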
diff --git a/tests/test_targets.py b/tests/test_targets.py
index 0b49ded..d558a72 100644
--- a/tests/test_targets.py
+++ b/tests/test_targets.py
@@ -1,6 +1,7 @@
 import numpy as np
 import pandas as pd
 from tqdm import tqdm
+import sklearn
 from sklearn.pipeline import Pipeline
 from sklearn.base import BaseEstimator, TransformerMixin
 
@@ -13,11 +14,12 @@
 ALL_PROCESSORS = [BayesianGMMTargetProcessor,
                   SignalsTargetProcessor]
 
+sklearn.set_config(enable_metadata_routing=True)
+
 def test_processors_sklearn():
     data = dataset.sample(50)
     data = data.drop(columns=["data_type"])
     y = data["target_jerome_v4_20"].fillna(0.5)
-    eras = data["era"]
     feature_names = ["feature_tallish_grimier_tumbrel",
                      "feature_partitive_labyrinthine_sard"]
     X = data[feature_names].fillna(0.5)
@@ -25,65 +27,53 @@ def test_processors_sklearn():
     for processor_cls in tqdm(ALL_PROCESSORS, desc="Testing target processors for scikit-learn compatibility"):
         # Initialization
         processor = processor_cls()
-
-        # Test fit returns self
-        try:
-            assert processor.fit(X=X, y=y, eras=eras) == processor
-        except TypeError:
-            assert processor.fit(X=X, y=y) == processor
+        processor.set_transform_request(era_series=True)
 
         # Inherits from Sklearn classes
         assert issubclass(processor_cls, (BaseEstimator, TransformerMixin))
 
-        # Pipeline
-        pipeline = Pipeline([
-            ('processor', processor),
-        ])
-        try:
-            _ = pipeline.fit(X, y=y, processor__eras=eras)
-        except TypeError:
-            _ = pipeline.fit(X, y=y)
-
         # Test every processor has get_feature_names_out
         assert hasattr(processor, 'get_feature_names_out'), "Processor {processor.__name__} does not have get_feature_names_out. Every implemented preprocessors should have this method."
 
 def test_bayesian_gmm_target_preprocessor():
     bgmm = BayesianGMMTargetProcessor(n_components=2)
+    bgmm.set_transform_request(era_series=True)
     y = dataset["target_jerome_v4_20"].fillna(0.5)
-    eras = dataset["era"]
+    era_series = dataset["era"]
     feature_names = ["feature_tallish_grimier_tumbrel",
                      "feature_partitive_labyrinthine_sard"]
     X = dataset[feature_names]
-    bgmm.fit(X, y, eras=eras)
+    bgmm.fit(X, y, era_series=era_series)
 
-    result = bgmm.transform(X, eras=eras)
+    result = bgmm.transform(X, era_series=era_series)
     assert bgmm.get_feature_names_out() == ["fake_target"]
     assert len(result) == len(dataset)
     assert result.min() >= 0.0
     assert result.max() <= 1.0
 
     # _get_coefs
-    coefs = bgmm._get_coefs(X, y, eras=eras)
+    coefs = bgmm._get_coefs(X, y, era_series=era_series)
     assert coefs.shape == (5, 2)
     assert coefs.min() >= 0.0
     assert coefs.max() <= 1.0
 
     # Test set_output API
     bgmm.set_output(transform="pandas")
-    result = bgmm.transform(X, eras=eras)
+    result = bgmm.transform(X, era_series=era_series)
     assert isinstance(result, pd.DataFrame)
 
     bgmm.set_output(transform="default")
-    result = bgmm.transform(X, eras=eras)
+    result = bgmm.transform(X, era_series=era_series)
     assert isinstance(result, np.ndarray)
 
 def test_signals_target_processor(dummy_signals_data):
     stp = SignalsTargetProcessor()
+    stp.set_transform_request(era_series=True)
     stp.set_output(transform="pandas")
-    eras = dummy_signals_data["date"]
+    era_series = dummy_signals_data["date"]
     stp.fit(dummy_signals_data)
-    result = stp.transform(dummy_signals_data, eras=eras)
+    result = stp.transform(dummy_signals_data, era_series=era_series)
     expected_target_cols = ["target_10d_raw", "target_10d_rank", "target_10d_group",
                             "target_20d_raw", "target_20d_rank", "target_20d_group"]
     for col in expected_target_cols:
         assert col in result.columns
@@ -91,5 +81,5 @@ def test_signals_target_processor(dummy_signals_data):
 
     # Test set_output API
     stp.set_output(transform="default")
-    result = stp.transform(dummy_signals_data, eras=eras)
+    result = stp.transform(dummy_signals_data, era_series=era_series)
     assert isinstance(result, np.ndarray)
\ No newline at end of file