fix: Fix pipeline_test that partially stopped working due to introduction of shap and confusion matrix. #30

Merged
21 commits
e9953e1
build: Add a shap package.
Oct 26, 2023
3c4e0da
Merge branch 'main' into main
AkiraUra Oct 26, 2023
ca36674
Merge branch 'sapientml:main' into main
arima-tsukasa Oct 27, 2023
34ea903
Merge branch 'sapientml:main' into main
arima-tsukasa Oct 30, 2023
1c99e81
fix: Fix input error passing to confusion matrix.
Oct 31, 2023
0a9e498
style: Source formatting performed.
Oct 31, 2023
d862dd1
fix: Change to not graph shaps for multiclass classification.
Nov 2, 2023
be9e062
Merge branch 'main' into main
kimusaku Nov 8, 2023
51b9a34
fix: Fixed a flaw in returning converted values when using LabelEncoder.
Nov 8, 2023
f24bbc8
fix: Fix shap plotting to work for LightGBM classification.
Nov 10, 2023
5ceaff2
style: Formatted the code.
Nov 10, 2023
a9faa53
Merge branch 'sapientml:main' into main
arima-tsukasa Nov 17, 2023
5f2b2d8
Merge branch 'main' of https://github.com/arima-tsukasa/core into main
Nov 17, 2023
6e63c2e
Merge branch 'main' into bugs_pipeline_test_adding_shap_and_confusion…
Nov 17, 2023
e603a4e
Merge branch 'sapientml:main' into main
arima-tsukasa Nov 22, 2023
9146c78
feat: Added None to the prediction method specification.
Nov 22, 2023
412dd22
Merge branch 'main' of https://github.com/arima-tsukasa/core into main
Nov 22, 2023
e87410d
Merge branch 'main' into bugs_pipeline_test_adding_shap_and_confusion…
Nov 22, 2023
ffacb05
Merge branch 'sapientml:main' into main
arima-tsukasa Nov 24, 2023
f466da9
Merge branch 'main' of https://github.com/arima-tsukasa/core into main
Nov 24, 2023
2a351eb
Merge branch 'main' into bugs_pipeline_test_adding_shap_and_confusion…
Nov 24, 2023
15 changes: 6 additions & 9 deletions sapientml_core/adaptation/generation/pipeline_template.py
@@ -189,11 +189,12 @@ def generate(self):

# Adding Shap Visualization data
tpl = env.get_template("other_templates/shap.py.jinja")
pipeline.pipeline_json["shap"]["code"] = self._render(tpl, pipeline=pipeline)
pipeline.pipeline_json["shap"]["code"] = self._render(tpl, pipeline=pipeline, model_name=model_name)

tpl = env.get_template("other_templates/prediction_result.py.jinja")
pipeline.pipeline_json["output_prediction"]["code"] = self._render(tpl, pipeline=pipeline, macros=macros)
pipeline.pipeline_json["output_prediction"]["code_test"] = self._render(tpl, pipeline=pipeline, macros=macros)
pipeline.pipeline_json["output_prediction"]["code"] = self._render(
tpl, pipeline=pipeline, model_name=model_name, macros=macros
)

if flag_hyperparameter_tuning:
tpl = env.get_template("model_templates/hyperparameters.py.jinja")
@@ -220,14 +221,10 @@ def generate(self):
if pipeline.adaptation_metric and (
pipeline.adaptation_metric in macros.metric_needing_predict_proba
or pipeline.adaptation_metric.startswith(macros.Metric.MAP_K.value)
or pipeline.config.predict_option == macros.PRED_PROBABILITY
):
pipeline.pipeline_json["evaluation"]["code_test"] = pipeline.pipeline_json["evaluation"][
"code_test"
].replace("y_pred", "y_prob")
pipeline.pipeline_json["output_prediction"]["code_test"] = pipeline.pipeline_json["output_prediction"][
"code_test"
].replace("y_pred", "y_prob")

if pipeline.config.permutation_importance:
tpl = env.get_template("other_templates/permutation_importance.py.jinja")
@@ -517,6 +514,7 @@ def populate_model(self):
is_multioutput_regression=_is_multioutput_regression,
is_multioutput_classification=_is_multioutput_classification,
metric_needing_predict_proba=macros.metric_needing_predict_proba,
macros=macros,
)
snippet_test = self._render(
tpl_test,
@@ -541,9 +539,8 @@ def populate_model(self):
tpl = env.get_template("model_templates/classification_post_process.jinja")
snippet += "\n" + self._render(tpl, pipeline=pipeline)

snippet_predict = snippet_predict.replace("predict", "predict_proba")
tpl_predict = env.get_template("model_templates/classification_post_process.jinja")
snippet_predict += "\n" + self._render(tpl_predict, pipeline=pipeline)
snippet_predict += "\n" + self._render(tpl_predict, pipeline=pipeline).replace("y_pred", "y_prob")

tpl_test = env.get_template("model_templates/classification_post_process.jinja")
snippet_test += "\n" + self._render(tpl_test, pipeline=pipeline).replace("y_pred", "y_prob")
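Why model_name now has to reach _render(): the shap and prediction templates branch on it, and rendering them without it would silently fall through to the generic branch. Below is a small, standalone sketch of that mechanism (not part of the PR); the template string is a toy stand-in for the real .jinja files.

from jinja2 import Environment

# Toy stand-in for shap.py.jinja: the real template branches on model_name the same way.
env = Environment()
tpl = env.from_string(
    "{% if model_name == 'LGBMClassifier' %}"
    "explainer = shap.Explainer(model, feature_shap)"
    "{% else %}"
    "explainer = shap.Explainer(model)"
    "{% endif %}"
)

# If model_name is missing from the render context, the {% if %} quietly takes the
# else branch, which is why generate() now passes model_name=model_name to _render().
print(tpl.render(model_name="LGBMClassifier"))  # explainer = shap.Explainer(model, feature_shap)
print(tpl.render(model_name="XGBRegressor"))    # explainer = shap.Explainer(model)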
6 changes: 3 additions & 3 deletions sapientml_core/params.py
@@ -67,8 +67,8 @@ class SapientMLConfig(Config):
Ignored when hyperparameter_tuning is False.
hyperparameter_tuning_random_state: int, default 1023
Random seed for hyperparameter tuning.
predict_option: Literal["default", "probability"], default "default"
Specify predict method (default: predict(), probability: predict_proba().)
predict_option: Literal["default", "probability", None], default None
Specify predict method (default: predict(), probability: predict_proba(), None: Comply with metric requirements.)
permutation_importance: bool, default True
On/Off of outputting permutation importance calculation code.
add_explanation: bool, default False
@@ -84,7 +84,7 @@ class SapientMLConfig(Config):
hyperparameter_tuning_n_trials: int = 10
hyperparameter_tuning_timeout: int = 0
hyperparameter_tuning_random_state: int = 1023
predict_option: Literal["default", "probability"] = "default"
predict_option: Optional[Literal["default", "probability"]] = None
permutation_importance: bool = True
add_explanation: bool = False

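The practical effect of the new None default: when the caller does not set predict_option, the generated pipeline follows what the metric needs. A hedged sketch of that resolution, mirroring the docstring above; the metric set here is an illustrative subset, the authoritative list is macros.metric_needing_predict_proba.

from typing import Literal, Optional

# Illustrative subset only; the real list lives in macros.metric_needing_predict_proba.
METRICS_NEEDING_PREDICT_PROBA = {"LogLoss", "ROC_AUC", "Gini"}

def resolve_predict_method(
    predict_option: Optional[Literal["default", "probability"]],
    adaptation_metric: str,
) -> str:
    """Sketch of how predict_option=None defers to the metric."""
    if predict_option == "probability":
        return "predict_proba"
    if predict_option == "default":
        return "predict"
    # predict_option is None: comply with the metric requirements.
    if adaptation_metric in METRICS_NEEDING_PREDICT_PROBA or adaptation_metric.startswith("MAP_"):
        return "predict_proba"
    return "predict"

assert resolve_predict_method(None, "LogLoss") == "predict_proba"       # metric decides
assert resolve_predict_method(None, "F1") == "predict"
assert resolve_predict_method("probability", "F1") == "predict_proba"   # explicit override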
@@ -5,13 +5,16 @@ import numpy as np
with open('model.pkl', 'rb') as f:
model = pickle.load(f)

{% if (pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT) %}
y_pred = model.predict(feature_test)
{% if flag_predict_proba and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
y_pred = model.classes_[np.argmax(y_pred, axis=1)].reshape(-1, 1)
{% endif %}
{% if model_name == xgbclassifier and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
{% if pipeline.adaptation_metric and flag_predict_proba %}
y_prob = model.predict_proba(feature_test)
{% endif %}
{% if model_name == xgbclassifier %}
with open('target_LabelEncoder.pkl', 'rb') as f:
label_encoder = pickle.load(f)

{% endif %}
{% if model_name == xgbclassifier and ((pipeline.adaptation_metric not in macros.metric_needing_predict_proba) or (pipeline.config.predict_option == macros.PRED_DEFAULT)) %}
y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)
{% endif %}
@@ -43,12 +43,6 @@ y_pred = model.predict(feature_test)
{% if pipeline.adaptation_metric and flag_predict_proba %}
y_prob = model.predict_proba(feature_test)
{% endif %}
{% if flag_predict_proba and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
y_prob = model.classes_[np.argmax(y_prob, axis=1)].reshape(-1, 1)
{% endif %}
{% if flag_predict_proba and model_name == xgbclassifier and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)
y_prob = label_encoder.inverse_transform(y_prob).reshape(-1, 1)
{% elif model_name == xgbclassifier and (not pipeline.adaptation_metric.startswith("MAP_")) and (not pipeline.adaptation_metric == "LogLoss") and (pipeline.adaptation_metric not in metric_needing_predict_proba) %}
{% if model_name == xgbclassifier %}
y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)
{% endif %}
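The surviving XGBClassifier branch above exists because XGBoost's scikit-learn classifier is trained on integer-encoded targets, so the generated pipeline keeps the fitted LabelEncoder and maps predictions back to the original labels. A hedged, self-contained sketch of that round trip (toy data; the names mirror the template but nothing here is taken from the generated pipeline itself):

import numpy as np
from sklearn.preprocessing import LabelEncoder
from xgboost import XGBClassifier

rng = np.random.default_rng(0)
feature_train = rng.random((60, 3))
target_raw = rng.choice(["bird", "cat", "dog"], size=60)

# XGBClassifier expects classes encoded as 0..n-1, so the labels are encoded up front;
# the fitted encoder is what the pipeline pickles as target_LabelEncoder.pkl.
label_encoder = LabelEncoder()
target = label_encoder.fit_transform(target_raw)

model = XGBClassifier(n_estimators=10)
model.fit(feature_train, target)

feature_test = rng.random((5, 3))
y_pred = model.predict(feature_test)
# Back to the original string labels, reshaped to a column as in the template.
y_pred = label_encoder.inverse_transform(y_pred).reshape(-1, 1)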
@@ -1,11 +1,20 @@
# OUTPUT PREDICTION
{% if pipeline.task.is_multiclass == True and pipeline.adaptation_metric == "LogLoss"%}
prediction = pd.DataFrame(y_pred, columns=model.classes_, index=feature_test.index)
{% set xgbclassifier = "XGBClassifier" %}
{% if pipeline.config.predict_option == macros.PRED_PROBABILITY and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index)
{% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index)
{% elif pipeline.config.predict_option == macros.PRED_PROBABILITY and (pipeline.adaptation_metric in macros.metrics_for_classification) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index)
{% elif pipeline.config.predict_option is none and model_name == xgbclassifier and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
prediction = pd.DataFrame(y_prob, columns=label_encoder.inverse_transform(model.classes_), index=feature_test.index)
{% elif pipeline.config.predict_option is none and pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index)
{% elif pipeline.config.predict_option is none and (pipeline.adaptation_metric in macros.metric_needing_predict_proba) and (not pipeline.adaptation_metric.startswith("MAP_"))%}
prediction = pd.DataFrame(y_prob, columns=TARGET_COLUMNS, index=feature_test.index)
{% elif pipeline.adaptation_metric.startswith("MAP_") %}
{% set k = pipeline.adaptation_metric.split("_")[1] %}
prediction = pd.DataFrame(y_pred, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_pred.shape[1] + 1)], index=feature_test.index)
{% elif pipeline.task.is_multiclass == True and (pipeline.adaptation_metric in ["auc", "ROC_AUC", "Gini"]) %}
prediction = pd.DataFrame(model.classes_[np.argmax(y_pred, axis=1)], columns=TARGET_COLUMNS, index=feature_test.index)
prediction = pd.DataFrame(y_prob, columns=[TARGET_COLUMNS[0] + "_" +str(i) for i in range(1, y_prob.shape[1] + 1)], index=feature_test.index)
{% else %}
prediction = pd.DataFrame(y_pred, columns=TARGET_COLUMNS, index=feature_test.index)
{% endif %}
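The new branches above all reduce to building the prediction frame from y_prob with one column per class whenever probabilities are requested or required by the metric. A minimal sketch of the multiclass probability case (toy model and data; feature_test, y_prob, and model.classes_ stand in for the names in the generated code):

import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression

rng = np.random.default_rng(0)
feature_train = pd.DataFrame(rng.random((60, 3)), columns=["f0", "f1", "f2"])
target_train = rng.choice(["low", "mid", "high"], size=60)
feature_test = pd.DataFrame(rng.random((5, 3)), columns=["f0", "f1", "f2"])

model = LogisticRegression(max_iter=200).fit(feature_train, target_train)

y_prob = model.predict_proba(feature_test)
# One probability column per class, indexed like the test features,
# mirroring the y_prob / model.classes_ branch in the template.
prediction = pd.DataFrame(y_prob, columns=model.classes_, index=feature_test.index)
print(prediction.head())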
7 changes: 6 additions & 1 deletion sapientml_core/templates/other_templates/shap.py.jinja
@@ -1,9 +1,14 @@
# Models are restricted because of execution time.
{% set lgbmclassifier = "LGBMClassifier" %}
models_for_shap = ['XGBClassifier', 'XGBRegressor', 'LGBMClassifier', 'LGBMRegressor', 'GradientBoostingClassifier', 'GradientBoostingRegressor']
if model.__class__.__name__ in models_for_shap:
import shap
feature_shap = feature_train.sample(1000) if feature_train.shape[0] > 1000 else feature_train
explainer = shap.Explainer(model, feature_shap)
{% if model_name == lgbmclassifier %}
explainer = shap.Explainer(model,feature_shap)
{% else %}
explainer = shap.Explainer(model)
{% endif %}
shap_values = explainer(feature_shap)

# summarize the effects of all the features
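A hedged, standalone sketch of the branch the template now takes: for LGBMClassifier the explainer is built with background data, while the other supported models work from the model alone. Toy dataset; the final plotting call is illustrative and not taken from the template.

import pandas as pd
import shap
from lightgbm import LGBMClassifier
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=300, n_features=5, random_state=0)
feature_train = pd.DataFrame(X, columns=[f"f{i}" for i in range(5)])

model = LGBMClassifier(n_estimators=30).fit(feature_train, y)

# Same sampling guard as the template: cap the SHAP sample for execution time.
feature_shap = feature_train.sample(100) if feature_train.shape[0] > 100 else feature_train
if model.__class__.__name__ == "LGBMClassifier":
    # LightGBM classification needs the background data to get a compatible explainer.
    explainer = shap.Explainer(model, feature_shap)
else:
    explainer = shap.Explainer(model)
shap_values = explainer(feature_shap)

# Summarize the effects of all the features (illustrative plot choice).
shap.plots.beeswarm(shap_values)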
4 changes: 2 additions & 2 deletions sapientml_core/templates/pipeline_test.py.jinja
@@ -66,14 +66,14 @@ if set(TARGET_COLUMNS).issubset(test_dataset.columns.tolist()):
{% endif %}
{% if 'output_prediction' in pipeline_json %}

{{ pipeline_json['output_prediction']['code_test'] }}
{{ pipeline_json['output_prediction']['code'] }}
{% endif %}
{% if 'permutation_importance' in pipeline_json %}

{{ pipeline_json['permutation_importance']['code'] }}
{% endif %}

{% if 'shap' in pipeline_json %}
{% if 'shap' in pipeline_json and not pipeline.task.is_multiclass %}

{{ pipeline_json['shap']['code'] }}
{% endif %}