From eaa3aadafcf40dbca1ad25bb8b3d8a9e853a1920 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Tue, 11 Jun 2024 13:43:51 +0530 Subject: [PATCH 01/19] [ENH] test_methods_p handling shuffle --- skpro/distributions/tests/test_all_distrs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/distributions/tests/test_all_distrs.py b/skpro/distributions/tests/test_all_distrs.py index b9c2fe23b..1fed1c5c9 100644 --- a/skpro/distributions/tests/test_all_distrs.py +++ b/skpro/distributions/tests/test_all_distrs.py @@ -167,7 +167,7 @@ def test_methods_p(self, object_instance, method, shuffled): else: p = np_unif - res = getattr(object_instance, method)(p) + res = getattr(d, method)(p) _check_output_format(res, d, method) From 2906e1143799690fcee38dd90c2e1ea6f0d0ab8a Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Wed, 12 Jun 2024 14:44:37 +0530 Subject: [PATCH 02/19] [ENH] GLM with multiple family and link support --- skpro/regression/linear/_glm.py | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 4f6637c6e..43f112d6f 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -167,8 +167,20 @@ class GLMRegressor(BaseProbaRegressor): "y_inner_mtype": "pd_DataFrame_Table", } + def _str_to_sm_family(self, family): + from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson + + sm_fmly = { + "Gaussian": Gaussian, + "Poisson": Poisson, + "Gamma": Gamma, + } + + return sm_fmly[family] + def __init__( self, + family=None, missing="none", start_params=None, maxiter=100, @@ -184,9 +196,10 @@ def __init__( add_constant=False, ): super().__init__() - from statsmodels.genmod.families.family import Gaussian - self._family = Gaussian() + if family is None: + family = "Gaussian" + self.family = family self.missing = missing self.start_params = start_params self.maxiter = maxiter @@ -231,10 +244,13 @@ def _fit(self, X, y): y_col = y.columns + family = self.family + sm_family = self._str_to_sm_family(family) + glm_estimator = GLM( endog=y, exog=X_, - family=self._family, + family=sm_family, missing=self.missing, ) From 29756283de89f2cdf4e60fa2601f3bebb40e0657 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Wed, 12 Jun 2024 18:44:03 +0530 Subject: [PATCH 03/19] [ENH] GLMs with multiple link and distribution support --- skpro/regression/linear/_glm.py | 45 ++++++++++++++++++++++++++++----- 1 file changed, 39 insertions(+), 6 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 43f112d6f..4e1b3f204 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -1,6 +1,8 @@ -"""Interface adapter for the Generalized Linear Model Regressor with Gaussian Link.""" +"""Interface adapter for the Generalized Linear Model Regressor.""" # copyright: skpro developers, BSD-3-Clause License (see LICENSE file) +__author__ = ["ShreeshaM07", "julian-fong"] + import pandas as pd from skpro.regression.base import BaseProbaRegressor @@ -18,6 +20,11 @@ class GLMRegressor(BaseProbaRegressor): Parameters ---------- + family : str + Available options are + 1.Normal + 2.Poisson + 3.Gamma missing : str Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. @@ -157,8 +164,8 @@ class GLMRegressor(BaseProbaRegressor): """ _tags = { - "authors": ["julian-fong"], - "maintainers": ["julian-fong"], + "authors": ["ShreeshaM07", "julian-fong"], + "maintainers": ["ShreeshaM07", "julian-fong"], "python_version": None, "python_dependencies": "statsmodels", "capability:multioutput": False, @@ -168,15 +175,20 @@ class GLMRegressor(BaseProbaRegressor): } def _str_to_sm_family(self, family): + """Convert the string to a statsmodel object. + + If the link function is also explcitly mentioned then include then + that must be passed to the family/distribution object. + """ from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson sm_fmly = { - "Gaussian": Gaussian, + "Normal": Gaussian, "Poisson": Poisson, "Gamma": Gamma, } - return sm_fmly[family] + return sm_fmly[family]() def __init__( self, @@ -198,7 +210,7 @@ def __init__( super().__init__() if family is None: - family = "Gaussian" + family = "Normal" self.family = family self.missing = missing self.start_params = start_params @@ -329,6 +341,19 @@ def _predict(self, X): return y_pred + def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): + """Convert the statsmodels output to equivalent skpro distribution.""" + # from skpro.distributions.gamma import Gamma + # from skpro.distributions.normal import Normal + # from skpro.distributions.poisson import Poisson + + # skpro_distr = { + # "Normal": Normal, + # "Poisson": Poisson, + # "Gamma": Gamma, + # } + # params = {} + def _predict_proba(self, X): """Predict distribution over labels for data from features. @@ -357,6 +382,14 @@ def _predict_proba(self, X): # the prediction and prediction variance i.e mu and sigma y_column = self.y_col y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame() + + # convert the returned values to skpro equivalent distribution + family = self.family + index = X_.index + columns = y_column + + y_pred = self._params_sm_to_skpro(y_predictions_df, family, index, columns) + y_mu = y_predictions_df["mean"].rename("mu").to_frame() y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() params = { From c5802a3a6e432d11bbc44d36847231ecf50c8f0b Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 13 Jun 2024 01:03:11 +0530 Subject: [PATCH 04/19] [ENH] GLMs with multiple distributions and links --- skpro/regression/linear/_glm.py | 57 ++++++++++++++++++++------------- 1 file changed, 35 insertions(+), 22 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 4e1b3f204..1b703422d 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -343,16 +343,41 @@ def _predict(self, X): def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): """Convert the statsmodels output to equivalent skpro distribution.""" - # from skpro.distributions.gamma import Gamma - # from skpro.distributions.normal import Normal - # from skpro.distributions.poisson import Poisson + from skpro.distributions.gamma import Gamma + from skpro.distributions.normal import Normal + from skpro.distributions.poisson import Poisson + + skpro_distr = { + "Normal": Normal, + "Poisson": Poisson, + "Gamma": Gamma, + } - # skpro_distr = { - # "Normal": Normal, - # "Poisson": Poisson, - # "Gamma": Gamma, - # } - # params = {} + params = {} + skp_dist = Normal + + if family in skpro_distr: + skp_dist = skpro_distr[family] + + if skp_dist == Normal: + y_mu = y_predictions_df["mean"].rename("mu").to_frame() + y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() + params["mu"] = y_mu + params["sigma"] = y_sigma + elif skp_dist == Poisson: + y_mu = y_predictions_df["mean"].rename("mu").to_frame() + params["mu"] = y_mu + elif skp_dist == Gamma: + y_alpha = y_predictions_df["mean"].rename("alpha").to_frame() + y_beta = y_predictions_df["mean_se"].rename("beta").to_frame() + params["alpha"] = y_alpha + params["beta"] = y_beta + + params["index"] = index + params["columns"] = columns + + y_pred = skp_dist(**params) + return y_pred def _predict_proba(self, X): """Predict distribution over labels for data from features. @@ -373,8 +398,6 @@ def _predict_proba(self, X): y_pred : skpro BaseDistribution, same length as `X` labels predicted for `X` """ - from skpro.distributions.normal import Normal - X_ = self._prep_x(X) # instead of using the conventional predict() method, we use statsmodels @@ -388,17 +411,7 @@ def _predict_proba(self, X): index = X_.index columns = y_column - y_pred = self._params_sm_to_skpro(y_predictions_df, family, index, columns) - - y_mu = y_predictions_df["mean"].rename("mu").to_frame() - y_sigma = y_predictions_df["mean_se"].rename("sigma").to_frame() - params = { - "mu": y_mu, - "sigma": y_sigma, - "index": X_.index, - "columns": y_column, - } - y_pred = Normal(**params) + y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, family) return y_pred def _prep_x(self, X): From 38385d831702582f961f148c15fc814c4ac6dd5a Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 13 Jun 2024 01:27:26 +0530 Subject: [PATCH 05/19] default "Normal" --- skpro/regression/linear/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 1b703422d..b5b210cab 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -192,7 +192,7 @@ def _str_to_sm_family(self, family): def __init__( self, - family=None, + family="Normal", missing="none", start_params=None, maxiter=100, From 888d67abaed59eafe51c27c2018c6526cee81712 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 13 Jun 2024 17:03:34 +0530 Subject: [PATCH 06/19] modified gamma params --- skpro/regression/linear/_glm.py | 14 +++++++++++--- 1 file changed, 11 insertions(+), 3 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index b5b210cab..ee401a2f3 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -368,8 +368,11 @@ def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): y_mu = y_predictions_df["mean"].rename("mu").to_frame() params["mu"] = y_mu elif skp_dist == Gamma: - y_alpha = y_predictions_df["mean"].rename("alpha").to_frame() - y_beta = y_predictions_df["mean_se"].rename("beta").to_frame() + y_mean = y_predictions_df["mean"] + y_sd = y_predictions_df["mean_se"] + y_alpha = (y_mean / y_sd) ** 2 + y_beta = (y_alpha / y_mean).rename("beta").to_frame() + y_alpha = y_alpha.rename("alpha").to_frame() params["alpha"] = y_alpha params["beta"] = y_beta @@ -457,5 +460,10 @@ def get_test_params(cls, parameter_set="default"): """ params1 = {} params2 = {"add_constant": True} + params3 = { + "family": "Poisson", + "add_constant": True, + } + params4 = {"family": "Gamma"} - return [params1, params2] + return [params1, params2, params3, params4] From f6ff39e0fe3897a460ad266358639d8e1aebb612 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 13 Jun 2024 18:31:34 +0530 Subject: [PATCH 07/19] link function support --- skpro/regression/linear/_glm.py | 41 ++++++++++++++++++++++++++++++--- 1 file changed, 38 insertions(+), 3 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index ee401a2f3..0d46d80cb 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -25,6 +25,11 @@ class GLMRegressor(BaseProbaRegressor): 1.Normal 2.Poisson 3.Gamma + link : str + Available safe options are + Normal : Log, Identity, InversePower + Poisson : Log, Identity, Sqrt + Gamma : Log, Identity, InversePower missing : str Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. @@ -174,13 +179,16 @@ class GLMRegressor(BaseProbaRegressor): "y_inner_mtype": "pd_DataFrame_Table", } - def _str_to_sm_family(self, family): + def _str_to_sm_family(self, family, link): """Convert the string to a statsmodel object. If the link function is also explcitly mentioned then include then that must be passed to the family/distribution object. """ + from warnings import warn + from statsmodels.genmod.families.family import Gamma, Gaussian, Poisson + from statsmodels.genmod.families.links import Identity, InversePower, Log, Sqrt sm_fmly = { "Normal": Gaussian, @@ -188,11 +196,27 @@ def _str_to_sm_family(self, family): "Gamma": Gamma, } + links = { + "Log": Log, + "Identity": Identity, + "InversePower": InversePower, + "Sqrt": Sqrt, + } + + if link in links: + link_function = links[link]() + try: + return sm_fmly[family](link_function) + except Exception: + msg = "Invalid link for family, default link will be used" + warn(msg) + return sm_fmly[family]() def __init__( self, family="Normal", + link=None, missing="none", start_params=None, maxiter=100, @@ -212,6 +236,7 @@ def __init__( if family is None: family = "Normal" self.family = family + self.link = link self.missing = missing self.start_params = start_params self.maxiter = maxiter @@ -257,7 +282,8 @@ def _fit(self, X, y): y_col = y.columns family = self.family - sm_family = self._str_to_sm_family(family) + link = self.link + sm_family = self._str_to_sm_family(family=family, link=link) glm_estimator = GLM( endog=y, @@ -465,5 +491,14 @@ def get_test_params(cls, parameter_set="default"): "add_constant": True, } params4 = {"family": "Gamma"} + params5 = { + "family": "Normal", + "link": "InversePower", + } + params6 = { + "family": "Poisson", + "link": "Log", + "add_constant": True, + } - return [params1, params2, params3, params4] + return [params1, params2, params3, params4, params5, params6] From 4bdd71d9634ee926149aca03e11d27be072750c2 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Mon, 17 Jun 2024 00:31:52 +0530 Subject: [PATCH 08/19] offset and exposure added as bool and part of X --- skpro/regression/linear/_glm.py | 48 +++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 0d46d80cb..f7ed455db 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -3,6 +3,7 @@ __author__ = ["ShreeshaM07", "julian-fong"] +import numpy as np import pandas as pd from skpro.regression.base import BaseProbaRegressor @@ -34,7 +35,16 @@ class GLMRegressor(BaseProbaRegressor): Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - + offset : bool, default = False + If True, then the exog or ``X`` passed while ``fit``ting must have an additional + column with column name ``offset`` with any values against each row. + When ``predict``ing have an additional column with name ``offset`` + in X with all the ``offset`` values stored in the column for each row. + exposure : bool, default = False + If True, then the exog or ``X`` passed while ``fit``ting must have an additional + column with column name ``exposure`` with any values against each row. + When ``predict``ing have an additional column with name ``exposure`` + in X with all the ``exposure`` values stored in the column for each row. start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -218,6 +228,8 @@ def __init__( family="Normal", link=None, missing="none", + offset=False, + exposure=False, start_params=None, maxiter=100, method="IRLS", @@ -238,6 +250,8 @@ def __init__( self.family = family self.link = link self.missing = missing + self.offset = offset + self.exposure = exposure self.start_params = start_params self.maxiter = maxiter self.method = method @@ -277,6 +291,15 @@ def _fit(self, X, y): """ from statsmodels.genmod.generalized_linear_model import GLM + # remove the offset and exposure columns which + # was inserted to maintain the shape + offset = self.offset + exposure = self.exposure + if offset is True: + X = X.drop(["offset"], axis=1) + if exposure is True: + X = X.drop(["exposure"], axis=1) + X_ = self._prep_x(X) y_col = y.columns @@ -358,11 +381,23 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ + offset = self.offset + exposure = self.exposure + offset_arr = None + exposure_arr = None + if offset is True: + offset_arr = np.array(X["offset"]) + X = X.drop(["offset"], axis=1) + if exposure is True: + exposure_arr = np.array(X["exposure"]) + X = X.drop(["exposure"], axis=1) X_ = self._prep_x(X) index = X_.index y_column = self.y_col - y_pred_series = self.glm_fit_.predict(X_) + y_pred_series = self.glm_fit_.predict( + X_, offset=offset_arr, exposure=exposure_arr + ) y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred @@ -427,6 +462,15 @@ def _predict_proba(self, X): y_pred : skpro BaseDistribution, same length as `X` labels predicted for `X` """ + # remove the offset and exposure columns + # which was inserted to maintain the shape + offset = self.offset + exposure = self.exposure + if offset is True: + X = X.drop(["offset"], axis=1) + if exposure is True: + X = X.drop(["exposure"], axis=1) + X_ = self._prep_x(X) # instead of using the conventional predict() method, we use statsmodels From e6b20d182c17dc4e84f2cdf85c2a3580e3f95b28 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Mon, 17 Jun 2024 00:42:44 +0530 Subject: [PATCH 09/19] offset and exposure initialized in constructor itself with size --- skpro/regression/linear/_glm.py | 53 +++++++-------------------------- 1 file changed, 10 insertions(+), 43 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index f7ed455db..ce001614c 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -3,7 +3,6 @@ __author__ = ["ShreeshaM07", "julian-fong"] -import numpy as np import pandas as pd from skpro.regression.base import BaseProbaRegressor @@ -35,16 +34,11 @@ class GLMRegressor(BaseProbaRegressor): Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - offset : bool, default = False - If True, then the exog or ``X`` passed while ``fit``ting must have an additional - column with column name ``offset`` with any values against each row. - When ``predict``ing have an additional column with name ``offset`` - in X with all the ``offset`` values stored in the column for each row. - exposure : bool, default = False - If True, then the exog or ``X`` passed while ``fit``ting must have an additional - column with column name ``exposure`` with any values against each row. - When ``predict``ing have an additional column with name ``exposure`` - in X with all the ``exposure`` values stored in the column for each row. + offset : 1D float array or None (optional) + 1D array same size as X or exog while ``predict``ing. + exposure : 1D float array or None (optional) + 1D array same size as X or exog while ``predict``ing. + used only when link is 'Log'. start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -228,8 +222,8 @@ def __init__( family="Normal", link=None, missing="none", - offset=False, - exposure=False, + offset=None, + exposure=None, start_params=None, maxiter=100, method="IRLS", @@ -291,15 +285,6 @@ def _fit(self, X, y): """ from statsmodels.genmod.generalized_linear_model import GLM - # remove the offset and exposure columns which - # was inserted to maintain the shape - offset = self.offset - exposure = self.exposure - if offset is True: - X = X.drop(["offset"], axis=1) - if exposure is True: - X = X.drop(["exposure"], axis=1) - X_ = self._prep_x(X) y_col = y.columns @@ -381,23 +366,14 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ + X_ = self._prep_x(X) + offset = self.offset exposure = self.exposure - offset_arr = None - exposure_arr = None - if offset is True: - offset_arr = np.array(X["offset"]) - X = X.drop(["offset"], axis=1) - if exposure is True: - exposure_arr = np.array(X["exposure"]) - X = X.drop(["exposure"], axis=1) - X_ = self._prep_x(X) index = X_.index y_column = self.y_col - y_pred_series = self.glm_fit_.predict( - X_, offset=offset_arr, exposure=exposure_arr - ) + y_pred_series = self.glm_fit_.predict(X_, offset=offset, exposure=exposure) y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred @@ -462,15 +438,6 @@ def _predict_proba(self, X): y_pred : skpro BaseDistribution, same length as `X` labels predicted for `X` """ - # remove the offset and exposure columns - # which was inserted to maintain the shape - offset = self.offset - exposure = self.exposure - if offset is True: - X = X.drop(["offset"], axis=1) - if exposure is True: - X = X.drop(["exposure"], axis=1) - X_ = self._prep_x(X) # instead of using the conventional predict() method, we use statsmodels From 8945ebda9ec2e00a9a0136a919a7557f326da624 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Mon, 17 Jun 2024 01:16:25 +0530 Subject: [PATCH 10/19] Back to no `offset`/`exposure` --- skpro/regression/linear/_glm.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index ce001614c..0d46d80cb 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -34,11 +34,7 @@ class GLMRegressor(BaseProbaRegressor): Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - offset : 1D float array or None (optional) - 1D array same size as X or exog while ``predict``ing. - exposure : 1D float array or None (optional) - 1D array same size as X or exog while ``predict``ing. - used only when link is 'Log'. + start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -222,8 +218,6 @@ def __init__( family="Normal", link=None, missing="none", - offset=None, - exposure=None, start_params=None, maxiter=100, method="IRLS", @@ -244,8 +238,6 @@ def __init__( self.family = family self.link = link self.missing = missing - self.offset = offset - self.exposure = exposure self.start_params = start_params self.maxiter = maxiter self.method = method @@ -368,12 +360,9 @@ def _predict(self, X): """ X_ = self._prep_x(X) - offset = self.offset - exposure = self.exposure - index = X_.index y_column = self.y_col - y_pred_series = self.glm_fit_.predict(X_, offset=offset, exposure=exposure) + y_pred_series = self.glm_fit_.predict(X_) y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred From 064517c3707728833f7ad6de05c348851b23b965 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Mon, 17 Jun 2024 23:15:57 +0530 Subject: [PATCH 11/19] Revert "offset and exposure initialized in constructor itself with size" This reverts commit e6b20d182c17dc4e84f2cdf85c2a3580e3f95b28. --- skpro/regression/linear/_glm.py | 46 +++++++++++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 0d46d80cb..b7f6eb6f9 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -3,6 +3,7 @@ __author__ = ["ShreeshaM07", "julian-fong"] +import numpy as np import pandas as pd from skpro.regression.base import BaseProbaRegressor @@ -34,7 +35,16 @@ class GLMRegressor(BaseProbaRegressor): Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - + offset : bool, default = False + If True, then the exog or ``X`` passed while ``fit``ting must have an additional + column with column name ``offset`` with any values against each row. + When ``predict``ing have an additional column with name ``offset`` + in X with all the ``offset`` values stored in the column for each row. + exposure : bool, default = False + If True, then the exog or ``X`` passed while ``fit``ting must have an additional + column with column name ``exposure`` with any values against each row. + When ``predict``ing have an additional column with name ``exposure`` + in X with all the ``exposure`` values stored in the column for each row. start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -218,6 +228,8 @@ def __init__( family="Normal", link=None, missing="none", + offset=False, + exposure=False, start_params=None, maxiter=100, method="IRLS", @@ -277,6 +289,15 @@ def _fit(self, X, y): """ from statsmodels.genmod.generalized_linear_model import GLM + # remove the offset and exposure columns which + # was inserted to maintain the shape + offset = self.offset + exposure = self.exposure + if offset is True: + X = X.drop(["offset"], axis=1) + if exposure is True: + X = X.drop(["exposure"], axis=1) + X_ = self._prep_x(X) y_col = y.columns @@ -358,11 +379,23 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ + offset = self.offset + exposure = self.exposure + offset_arr = None + exposure_arr = None + if offset is True: + offset_arr = np.array(X["offset"]) + X = X.drop(["offset"], axis=1) + if exposure is True: + exposure_arr = np.array(X["exposure"]) + X = X.drop(["exposure"], axis=1) X_ = self._prep_x(X) index = X_.index y_column = self.y_col - y_pred_series = self.glm_fit_.predict(X_) + y_pred_series = self.glm_fit_.predict( + X_, offset=offset_arr, exposure=exposure_arr + ) y_pred = pd.DataFrame(y_pred_series, index=index, columns=y_column) return y_pred @@ -427,6 +460,15 @@ def _predict_proba(self, X): y_pred : skpro BaseDistribution, same length as `X` labels predicted for `X` """ + # remove the offset and exposure columns + # which was inserted to maintain the shape + offset = self.offset + exposure = self.exposure + if offset is True: + X = X.drop(["offset"], axis=1) + if exposure is True: + X = X.drop(["exposure"], axis=1) + X_ = self._prep_x(X) # instead of using the conventional predict() method, we use statsmodels From e79cdd4d87b218147541c78724eae5de7974b3ae Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Tue, 18 Jun 2024 00:37:02 +0530 Subject: [PATCH 12/19] offset_var and exposure_var implemented --- skpro/regression/linear/_glm.py | 98 +++++++++++++++++++++++---------- 1 file changed, 68 insertions(+), 30 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index b7f6eb6f9..8e0aa8313 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -35,16 +35,20 @@ class GLMRegressor(BaseProbaRegressor): Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - offset : bool, default = False - If True, then the exog or ``X`` passed while ``fit``ting must have an additional - column with column name ``offset`` with any values against each row. - When ``predict``ing have an additional column with name ``offset`` + offset : pd.Index([string]) or int, default = None + If ``pd.Index([string])``, then the exog or ``X`` passed while ``fit``ting + must have an additional column with column name passed through + ``offset`` with any values against each row. When ``predict``ing + have an additional column with name same as string passed through ``offset`` in X with all the ``offset`` values stored in the column for each row. - exposure : bool, default = False - If True, then the exog or ``X`` passed while ``fit``ting must have an additional - column with column name ``exposure`` with any values against each row. - When ``predict``ing have an additional column with name ``exposure`` + If ``int`` it corresponding column number will be considered. + exposure : pd.Index([string]) or int, default = None + If ```pd.Index([string])``, then the exog or ``X`` passed while ``fit``ting + must have an additional column with column name passed through + ``exposure`` with any values against each row. When ``predict``ing + have an additional column with name same as string passed through ``exposure`` in X with all the ``exposure`` values stored in the column for each row. + If ``int`` it corresponding column number will be considered. start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -228,8 +232,8 @@ def __init__( family="Normal", link=None, missing="none", - offset=False, - exposure=False, + offset_var=None, + exposure_var=None, start_params=None, maxiter=100, method="IRLS", @@ -249,6 +253,8 @@ def __init__( family = "Normal" self.family = family self.link = link + self.offset_var = offset_var + self.exposure_var = exposure_var self.missing = missing self.start_params = start_params self.maxiter = maxiter @@ -291,12 +297,21 @@ def _fit(self, X, y): # remove the offset and exposure columns which # was inserted to maintain the shape - offset = self.offset - exposure = self.exposure - if offset is True: - X = X.drop(["offset"], axis=1) - if exposure is True: - X = X.drop(["exposure"], axis=1) + offset_var = self.offset_var + exposure_var = self.exposure_var + + if offset_var is not None: + if isinstance(offset_var, int): + offset_var = pd.Index([X.iloc[:, offset_var].name]) + if exposure_var is not None: + if isinstance(exposure_var, int): + exposure_var = pd.Index([X.iloc[:, exposure_var].name]) + if offset_var is not None and exposure_var is not None: + X = X.drop([offset_var[0], exposure_var[0]], axis=1) + elif offset_var is not None: + X = X.drop(offset_var, axis=1) + elif exposure_var is not None: + X = X.drop(exposure_var, axis=1) X_ = self._prep_x(X) @@ -379,16 +394,30 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ - offset = self.offset - exposure = self.exposure + offset_var = self.offset_var + exposure_var = self.exposure_var offset_arr = None exposure_arr = None - if offset is True: - offset_arr = np.array(X["offset"]) - X = X.drop(["offset"], axis=1) - if exposure is True: - exposure_arr = np.array(X["exposure"]) - X = X.drop(["exposure"], axis=1) + + if offset_var is not None: + if isinstance(offset_var, pd.Index): + offset_arr = np.array(X[offset_var]).flatten() + elif isinstance(offset_var, int): + offset_arr = np.array(X.iloc[:, offset_var]).flatten() + offset_var = pd.Index([X.iloc[:, offset_var].name]) + if exposure_var is not None: + if isinstance(exposure_var, pd.Index): + offset_arr = np.array(X[exposure_var]).flatten() + elif isinstance(exposure_var, int): + exposure_arr = np.array(X.iloc[:, exposure_var]).flatten() + exposure_var = pd.Index([X.iloc[:, exposure_var].name]) + if offset_var is not None and exposure_var is not None: + X = X.drop([offset_var[0], exposure_var[0]], axis=1) + elif offset_var is not None: + X = X.drop(offset_var, axis=1) + elif exposure_var is not None: + X = X.drop(exposure_var, axis=1) + X_ = self._prep_x(X) index = X_.index @@ -462,12 +491,21 @@ def _predict_proba(self, X): """ # remove the offset and exposure columns # which was inserted to maintain the shape - offset = self.offset - exposure = self.exposure - if offset is True: - X = X.drop(["offset"], axis=1) - if exposure is True: - X = X.drop(["exposure"], axis=1) + offset_var = self.offset_var + exposure_var = self.exposure_var + + if offset_var is not None: + if isinstance(offset_var, int): + offset_var = pd.Index([X.iloc[:, offset_var].name]) + if exposure_var is not None: + if isinstance(exposure_var, int): + exposure_var = pd.Index([X.iloc[:, exposure_var].name]) + if offset_var is not None and exposure_var is not None: + X = X.drop([offset_var[0], exposure_var[0]], axis=1) + elif offset_var is not None: + X = X.drop(offset_var, axis=1) + elif exposure_var is not None: + X = X.drop(exposure_var, axis=1) X_ = self._prep_x(X) From 53ec0d11b004b354ac89aef38717153856679482 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Tue, 18 Jun 2024 00:39:17 +0530 Subject: [PATCH 13/19] params order chaged for deprecation handling --- skpro/regression/linear/_glm.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 8e0aa8313..54cfe6a8b 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -229,11 +229,7 @@ def _str_to_sm_family(self, family, link): def __init__( self, - family="Normal", - link=None, missing="none", - offset_var=None, - exposure_var=None, start_params=None, maxiter=100, method="IRLS", @@ -246,6 +242,10 @@ def __init__( disp=False, max_start_irls=3, add_constant=False, + family="Normal", + link=None, + offset_var=None, + exposure_var=None, ): super().__init__() From f0e9bccab56396aa1a8c6f9aeb7410e10deb6e1e Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Wed, 19 Jun 2024 00:25:16 +0530 Subject: [PATCH 14/19] added test_glm for offset and exposure --- skpro/regression/linear/_glm.py | 20 +++++----- skpro/regression/tests/test_glm.py | 60 ++++++++++++++++++++++++++++++ 2 files changed, 70 insertions(+), 10 deletions(-) create mode 100644 skpro/regression/tests/test_glm.py diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 54cfe6a8b..fc4015151 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -35,19 +35,19 @@ class GLMRegressor(BaseProbaRegressor): Available options are 'none', 'drop' and 'raise'. If 'none', no nan checking is done. If 'drop', any observations with nans are dropped. If 'raise', an error is raised. Default = 'none' - offset : pd.Index([string]) or int, default = None - If ``pd.Index([string])``, then the exog or ``X`` passed while ``fit``ting + offset_var : pd.Index([str]) or int, default = None + If ``pd.Index([str])``, then the exog or ``X`` passed while ``fit``-ting must have an additional column with column name passed through - ``offset`` with any values against each row. When ``predict``ing - have an additional column with name same as string passed through ``offset`` - in X with all the ``offset`` values stored in the column for each row. + ``offset_var`` with any values against each row. When ``predict``ing + have an additional column with name same as string passed through ``offset_var`` + in X with all the ``offset_var`` values stored in the column for each row. If ``int`` it corresponding column number will be considered. - exposure : pd.Index([string]) or int, default = None - If ```pd.Index([string])``, then the exog or ``X`` passed while ``fit``ting + exposure_var : pd.Index([str]) or int, default = None + If ```pd.Index([str])``, then the exog or ``X`` passed while ``fit``-ting must have an additional column with column name passed through - ``exposure`` with any values against each row. When ``predict``ing - have an additional column with name same as string passed through ``exposure`` - in X with all the ``exposure`` values stored in the column for each row. + ``exposure_var`` with any values against each row. When ``predict``ing + have additional column with name same as string passed through ``exposure_var`` + in X with all the ``exposure_var`` values stored in the column for each row. If ``int`` it corresponding column number will be considered. start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. diff --git a/skpro/regression/tests/test_glm.py b/skpro/regression/tests/test_glm.py new file mode 100644 index 000000000..8ca263ac3 --- /dev/null +++ b/skpro/regression/tests/test_glm.py @@ -0,0 +1,60 @@ +"""Tests Generalized Linear Model regressor.""" + +import pandas as pd +import pytest + +from skpro.regression.linear import GLMRegressor +from skpro.tests.test_switch import run_test_for_class + + +@pytest.mark.skipif( + not run_test_for_class(GLMRegressor), + reason="run test only if softdeps are present and incrementally (if requested)", +) +def test_glm_simple_use(): + """Test simple use of GLM regressor.""" + from sklearn.datasets import load_diabetes + from sklearn.model_selection import train_test_split + + X, y = load_diabetes(return_X_y=True, as_frame=True) + y = pd.DataFrame(y) + X = X.iloc[:200] + y = y.iloc[:200] + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + glm_reg = GLMRegressor() + glm_reg.fit(X_train, y_train) + y_pred = glm_reg.predict(X_test) + y_pred_proba = glm_reg.predict_proba(X_test) + + assert y_pred.shape == y_test.shape + assert y_pred_proba.shape == y_test.shape + + +@pytest.mark.skipif( + not run_test_for_class(GLMRegressor), + reason="run test only if softdeps are present and incrementally (if requested)", +) +def test_glm_with_offset_exposure(): + """Test GLM with offset_var and exposure_var parameters.""" + import numpy as np + from sklearn.datasets import load_diabetes + from sklearn.model_selection import train_test_split + + X, y = load_diabetes(return_X_y=True, as_frame=True) + y = pd.DataFrame(y) + X = X.iloc[:200] + y = y.iloc[:200] + X["off"] = np.ones(X.shape[0]) * 2.1 + X["exp"] = np.arange(1, X.shape[0] + 1) + X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) + + glm_reg = GLMRegressor( + family="Normal", link="Log", offset_var=pd.Index(["off"]), exposure_var=-1 + ) + glm_reg.fit(X_train, y_train) + y_pred = glm_reg.predict(X_test) + y_pred_proba = glm_reg.predict_proba(X_test) + + assert y_pred.shape == y_test.shape + assert y_pred_proba.shape == y_test.shape From a1523ea021f465e073a287827aeb6fe51fa384e4 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Wed, 19 Jun 2024 21:02:11 +0530 Subject: [PATCH 15/19] deprecation for changing sequence of params --- skpro/regression/linear/_glm.py | 237 +++++++++++++++++++++++++------- 1 file changed, 191 insertions(+), 46 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index fc4015151..2deae912f 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -21,7 +21,7 @@ class GLMRegressor(BaseProbaRegressor): Parameters ---------- - family : str + family : str, default : "Normal" Available options are 1.Normal 2.Poisson @@ -31,10 +31,6 @@ class GLMRegressor(BaseProbaRegressor): Normal : Log, Identity, InversePower Poisson : Log, Identity, Sqrt Gamma : Log, Identity, InversePower - missing : str - Available options are 'none', 'drop' and 'raise'. If 'none', no nan - checking is done. If 'drop', any observations with nans are dropped. - If 'raise', an error is raised. Default = 'none' offset_var : pd.Index([str]) or int, default = None If ``pd.Index([str])``, then the exog or ``X`` passed while ``fit``-ting must have an additional column with column name passed through @@ -49,6 +45,10 @@ class GLMRegressor(BaseProbaRegressor): have additional column with name same as string passed through ``exposure_var`` in X with all the ``exposure_var`` values stored in the column for each row. If ``int`` it corresponding column number will be considered. + missing : str + Available options are 'none', 'drop' and 'raise'. If 'none', no nan + checking is done. If 'drop', any observations with nans are dropped. + If 'raise', an error is raised. Default = 'none' start_params : array_like (optional) Initial guess of the solution for the loglikelihood maximization. The default is family-specific and is given by the @@ -227,30 +227,93 @@ def _str_to_sm_family(self, family, link): return sm_fmly[family]() + # TODO (release 2.4.0) + # replace the existing definition of `__init__` with + # the below definition for `__init__`. + # def __init__( + # self, + # family="Normal", + # link=None, + # offset_var=None, + # exposure_var=None, + # missing="none", + # start_params=None, + # maxiter=100, + # method="IRLS", + # tol=1e-8, + # scale=None, + # cov_type="nonrobust", + # cov_kwds=None, + # use_t=None, + # full_output=True, + # disp=False, + # max_start_irls=3, + # add_constant=False, + # ): + # super().__init__() + + # self.family = family + # self.link = link + # self.offset_var = offset_var + # self.exposure_var = exposure_var + # self.missing = missing + # self.start_params = start_params + # self.maxiter = maxiter + # self.method = method + # self.tol = tol + # self.scale = scale + # self.cov_type = cov_type + # self.cov_kwds = cov_kwds + # self.use_t = use_t + # self.full_output = full_output + # self.disp = disp + # self.max_start_irls = max_start_irls + # self.add_constant = add_constant + + # self._family = self.family + # self._link = self.link + # self._offset_var = self.offset_var + # self._exposure_var = self.exposure_var + # self._missing = self.missing + # self._start_params = self.start_params + # self._maxiter = self.maxiter + # self._method = self.method + # self._tol = self.tol + # self._scale = self.scale + # self._cov_type = self.cov_type + # self._cov_kwds = self.cov_kwds + # self._use_t = self.use_t + # self._full_output = self.full_output + # self._disp = self.disp + # self._max_start_irls = self.max_start_irls + # self._add_constant = self.add_constant + def __init__( self, - missing="none", - start_params=None, - maxiter=100, - method="IRLS", - tol=1e-8, - scale=None, - cov_type="nonrobust", - cov_kwds=None, - use_t=None, - full_output=True, - disp=False, - max_start_irls=3, - add_constant=False, - family="Normal", - link=None, - offset_var=None, - exposure_var=None, + missing="4", + start_params="5", + maxiter="6", + method="7", + tol="8", + scale="9", + cov_type="10", + cov_kwds="11", + use_t="12", + full_output="13", + disp="14", + max_start_irls="15", + add_constant="16", + family="0", + link="1", + offset_var="2", + exposure_var="3", ): + # The default values of the parameters + # are replaced with the changed sequence + # of parameters ranking for each of them + # from 0 to 16(total 17 parameters). super().__init__() - if family is None: - family = "Normal" self.family = family self.link = link self.offset_var = offset_var @@ -269,6 +332,88 @@ def __init__( self.max_start_irls = max_start_irls self.add_constant = add_constant + if family == "0": + self._family = "Normal" + else: + self._family = family + if link == "1": + self._link = None + else: + self._link = link + if offset_var == "2": + self._offset_var = None + else: + self._offset_var = offset_var + if exposure_var == "3": + self._exposure_var = None + else: + self._exposure_var = exposure_var + if missing == "4": + self._missing = "none" + else: + self._missing = missing + if start_params == "5": + self._start_params = None + else: + self._start_params = start_params + if maxiter == "6": + self._maxiter = 100 + else: + self._maxiter = maxiter + if method == "7": + self._method = "IRLS" + else: + self._method = method + if tol == "8": + self._tol = 1e-8 + else: + self._tol = self.tol + if scale == "9": + self._scale = None + else: + self._scale = scale + if cov_type == "10": + self._cov_type = "nonrobust" + else: + self._cov_type = cov_type + if cov_kwds == "11": + self._cov_kwds = None + else: + self._cov_kwds = cov_kwds + if use_t == "12": + self._use_t = None + else: + self._use_t = use_t + if full_output == "13": + self._full_output = True + else: + self._full_output = full_output + if disp == "14": + self._disp = False + else: + self._disp = disp + if max_start_irls == "15": + self._max_start_irls = 3 + else: + self._max_start_irls = max_start_irls + if add_constant == "16": + self._add_constant = False + else: + self._add_constant = add_constant + + from sktime.utils.warnings import warn + + l1 = "Note: in `GLMRegressor`, the sequence of the parameters will change " + l2 = "in skpro version 2.4.0. It will be as per the order present in the" + l3 = "current docstring with the top one being the first parameter.\n" + l4 = "The defaults for the parameters will remain same and " + l5 = "there will be no changes.\n" + l6 = "Please use the `kwargs` calls instead of positional calls for the" + l7 = "parameters until the release of skpro 2.4.0 " + l8 = "as this will avoid any discrepancies." + warn_msg = l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 + warn(warn_msg) + def _fit(self, X, y): """Fit regressor to training data. @@ -297,8 +442,8 @@ def _fit(self, X, y): # remove the offset and exposure columns which # was inserted to maintain the shape - offset_var = self.offset_var - exposure_var = self.exposure_var + offset_var = self._offset_var + exposure_var = self._exposure_var if offset_var is not None: if isinstance(offset_var, int): @@ -317,31 +462,31 @@ def _fit(self, X, y): y_col = y.columns - family = self.family - link = self.link + family = self._family + link = self._link sm_family = self._str_to_sm_family(family=family, link=link) glm_estimator = GLM( endog=y, exog=X_, family=sm_family, - missing=self.missing, + missing=self._missing, ) self._estimator = glm_estimator fitted_glm_model = glm_estimator.fit( - self.start_params, - self.maxiter, - self.method, - self.tol, - self.scale, - self.cov_type, - self.cov_kwds, - self.use_t, - self.full_output, - self.disp, - self.max_start_irls, + self._start_params, + self._maxiter, + self._method, + self._tol, + self._scale, + self._cov_type, + self._cov_kwds, + self._use_t, + self._full_output, + self._disp, + self._max_start_irls, ) PARAMS_TO_FORWARD = { @@ -394,8 +539,8 @@ def _predict(self, X): ------- y : pandas DataFrame, same length as `X`, with same columns as y in fit """ - offset_var = self.offset_var - exposure_var = self.exposure_var + offset_var = self._offset_var + exposure_var = self._exposure_var offset_arr = None exposure_arr = None @@ -491,8 +636,8 @@ def _predict_proba(self, X): """ # remove the offset and exposure columns # which was inserted to maintain the shape - offset_var = self.offset_var - exposure_var = self.exposure_var + offset_var = self._offset_var + exposure_var = self._exposure_var if offset_var is not None: if isinstance(offset_var, int): @@ -516,7 +661,7 @@ def _predict_proba(self, X): y_predictions_df = self.glm_fit_.get_prediction(X_).summary_frame() # convert the returned values to skpro equivalent distribution - family = self.family + family = self._family index = X_.index columns = y_column @@ -540,7 +685,7 @@ def _prep_x(self, X): """ from statsmodels.tools import add_constant - if self.add_constant: + if self._add_constant: X_ = add_constant(X) return X_ else: From a75b0972df4b85df02062da8b8e70e63d6f12f8b Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Wed, 19 Jun 2024 21:29:13 +0530 Subject: [PATCH 16/19] removed sktime dependecy in warning --- skpro/regression/linear/_glm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 2deae912f..b2b630290 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -401,7 +401,7 @@ def __init__( else: self._add_constant = add_constant - from sktime.utils.warnings import warn + from warnings import warn l1 = "Note: in `GLMRegressor`, the sequence of the parameters will change " l2 = "in skpro version 2.4.0. It will be as per the order present in the" From 9ee6e94b3f56ec946b8598b82d8fbf992a4055c1 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 20 Jun 2024 13:39:34 +0530 Subject: [PATCH 17/19] modified requested changes --- skpro/regression/linear/_glm.py | 182 +++++++++++++++-------------- skpro/regression/tests/test_glm.py | 2 +- 2 files changed, 94 insertions(+), 90 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index b2b630290..134140842 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -21,29 +21,40 @@ class GLMRegressor(BaseProbaRegressor): Parameters ---------- - family : str, default : "Normal" - Available options are - 1.Normal - 2.Poisson - 3.Gamma - link : str - Available safe options are - Normal : Log, Identity, InversePower - Poisson : Log, Identity, Sqrt - Gamma : Log, Identity, InversePower - offset_var : pd.Index([str]) or int, default = None - If ``pd.Index([str])``, then the exog or ``X`` passed while ``fit``-ting + family : string, default : "Normal" + The family parameter denotes the type of distribution + that will be used. + Available family/distributions are + 1."Normal" + 2."Poisson" + 3."Gamma" + link : string, default : None + This parameter is used to represent the link function to be + used with the distribution. + If default is None it will internally replace with default of the + respective family. The default is the first string + against each family below. + Available safe options for the respective family are: + ``Normal`` : "Identity", "Log", "InversePower"; + ``Poisson`` : "Log", "Identity", "Sqrt"; + ``Gamma`` : "InversePower", "Log", "Identity"; + offset_var : string or int, default = None + Pass the column name as a string or column number as an int in X. + If string, then the exog or ``X`` passed while ``fit``-ting must have an additional column with column name passed through ``offset_var`` with any values against each row. When ``predict``ing have an additional column with name same as string passed through ``offset_var`` - in X with all the ``offset_var`` values stored in the column for each row. + in X with all the ``offset_var`` values for predicting + stored in the column for each row. If ``int`` it corresponding column number will be considered. - exposure_var : pd.Index([str]) or int, default = None - If ```pd.Index([str])``, then the exog or ``X`` passed while ``fit``-ting + exposure_var : string or int, default = None + Pass the column name as a string or column number as an int in X. + If string, then the exog or ``X`` passed while ``fit``-ting must have an additional column with column name passed through ``exposure_var`` with any values against each row. When ``predict``ing have additional column with name same as string passed through ``exposure_var`` - in X with all the ``exposure_var`` values stored in the column for each row. + in X with all the ``exposure_var`` values for predicting + stored in the column for each row. If ``int`` it corresponding column number will be considered. missing : str Available options are 'none', 'drop' and 'raise'. If 'none', no nan @@ -403,16 +414,16 @@ def __init__( from warnings import warn - l1 = "Note: in `GLMRegressor`, the sequence of the parameters will change " - l2 = "in skpro version 2.4.0. It will be as per the order present in the" - l3 = "current docstring with the top one being the first parameter.\n" - l4 = "The defaults for the parameters will remain same and " - l5 = "there will be no changes.\n" - l6 = "Please use the `kwargs` calls instead of positional calls for the" - l7 = "parameters until the release of skpro 2.4.0 " - l8 = "as this will avoid any discrepancies." - warn_msg = l1 + l2 + l3 + l4 + l5 + l6 + l7 + l8 - warn(warn_msg) + warn( + "Note: in `GLMRegressor`, the sequence of the parameters will change " + "in skpro version 2.4.0. It will be as per the order present in the" + "current docstring with the top one being the first parameter.\n" + "The defaults for the parameters will remain same and " + "there will be no changes.\n" + "Please use the `kwargs` calls instead of positional calls for the" + "parameters until the release of skpro 2.4.0 " + "as this will avoid any discrepancies." + ) def _fit(self, X, y): """Fit regressor to training data. @@ -445,20 +456,7 @@ def _fit(self, X, y): offset_var = self._offset_var exposure_var = self._exposure_var - if offset_var is not None: - if isinstance(offset_var, int): - offset_var = pd.Index([X.iloc[:, offset_var].name]) - if exposure_var is not None: - if isinstance(exposure_var, int): - exposure_var = pd.Index([X.iloc[:, exposure_var].name]) - if offset_var is not None and exposure_var is not None: - X = X.drop([offset_var[0], exposure_var[0]], axis=1) - elif offset_var is not None: - X = X.drop(offset_var, axis=1) - elif exposure_var is not None: - X = X.drop(exposure_var, axis=1) - - X_ = self._prep_x(X) + X_ = self._prep_x(X, offset_var, exposure_var, False) y_col = y.columns @@ -475,19 +473,21 @@ def _fit(self, X, y): self._estimator = glm_estimator - fitted_glm_model = glm_estimator.fit( - self._start_params, - self._maxiter, - self._method, - self._tol, - self._scale, - self._cov_type, - self._cov_kwds, - self._use_t, - self._full_output, - self._disp, - self._max_start_irls, - ) + glm_fit_params = { + "start_params": self._start_params, + "maxiter": self._maxiter, + "method": self._method, + "tol": self._tol, + "scale": self._scale, + "cov_type": self._cov_type, + "cov_kwds": self._cov_kwds, + "use_t": self._use_t, + "full_output": self._full_output, + "disp": self._disp, + "max_start_irls": self._max_start_irls, + } + + fitted_glm_model = glm_estimator.fit(**glm_fit_params) PARAMS_TO_FORWARD = { "df_model_": glm_estimator.df_model, @@ -544,26 +544,7 @@ def _predict(self, X): offset_arr = None exposure_arr = None - if offset_var is not None: - if isinstance(offset_var, pd.Index): - offset_arr = np.array(X[offset_var]).flatten() - elif isinstance(offset_var, int): - offset_arr = np.array(X.iloc[:, offset_var]).flatten() - offset_var = pd.Index([X.iloc[:, offset_var].name]) - if exposure_var is not None: - if isinstance(exposure_var, pd.Index): - offset_arr = np.array(X[exposure_var]).flatten() - elif isinstance(exposure_var, int): - exposure_arr = np.array(X.iloc[:, exposure_var]).flatten() - exposure_var = pd.Index([X.iloc[:, exposure_var].name]) - if offset_var is not None and exposure_var is not None: - X = X.drop([offset_var[0], exposure_var[0]], axis=1) - elif offset_var is not None: - X = X.drop(offset_var, axis=1) - elif exposure_var is not None: - X = X.drop(exposure_var, axis=1) - - X_ = self._prep_x(X) + X_, offset_arr, exposure_arr = self._prep_x(X, offset_var, exposure_var, True) index = X_.index y_column = self.y_col @@ -604,7 +585,7 @@ def _params_sm_to_skpro(self, y_predictions_df, index, columns, family): y_mean = y_predictions_df["mean"] y_sd = y_predictions_df["mean_se"] y_alpha = (y_mean / y_sd) ** 2 - y_beta = (y_alpha / y_mean).rename("beta").to_frame() + y_beta = (y_mean / (y_sd**2)).rename("beta").to_frame() y_alpha = y_alpha.rename("alpha").to_frame() params["alpha"] = y_alpha params["beta"] = y_beta @@ -639,20 +620,7 @@ def _predict_proba(self, X): offset_var = self._offset_var exposure_var = self._exposure_var - if offset_var is not None: - if isinstance(offset_var, int): - offset_var = pd.Index([X.iloc[:, offset_var].name]) - if exposure_var is not None: - if isinstance(exposure_var, int): - exposure_var = pd.Index([X.iloc[:, exposure_var].name]) - if offset_var is not None and exposure_var is not None: - X = X.drop([offset_var[0], exposure_var[0]], axis=1) - elif offset_var is not None: - X = X.drop(offset_var, axis=1) - elif exposure_var is not None: - X = X.drop(exposure_var, axis=1) - - X_ = self._prep_x(X) + X_ = self._prep_x(X, offset_var, exposure_var, False) # instead of using the conventional predict() method, we use statsmodels # get_prediction method, which returns a pandas df that contains @@ -668,10 +636,13 @@ def _predict_proba(self, X): y_pred = self._params_sm_to_skpro(y_predictions_df, index, columns, family) return y_pred - def _prep_x(self, X): + def _prep_x(self, X, offset_var, exposure_var, rtn_off_exp_arr): """ Return a copy of X with an added constant of self.add_constant = True. + If rtn_off_exp_arr is True it will also return offset and exposure + arrays along with updated X. + Parameters ---------- X : pandas DataFrame @@ -682,13 +653,46 @@ def _prep_x(self, X): X_ : pandas DataFrame A copy of the input X with an added column 'const' with is an array of len(X) of 1s + offset_arr : numpy.array + The copy of column which is meant for offsetting present in X. + exposure_arr : numpy.array + The copy of column which is meant for exposure present in X. """ from statsmodels.tools import add_constant + offset_arr = None + exposure_arr = None + if offset_var is not None: + if isinstance(offset_var, str): + offset_var = pd.Index([offset_var]) + offset_arr = np.array(X[offset_var]).flatten() + elif isinstance(offset_var, int): + offset_arr = np.array(X.iloc[:, offset_var]).flatten() + offset_var = pd.Index([X.iloc[:, offset_var].name]) + if exposure_var is not None: + if isinstance(exposure_var, str): + exposure_var = pd.Index([exposure_var]) + exposure_arr = np.array(X[exposure_var]).flatten() + elif isinstance(exposure_var, int): + exposure_arr = np.array(X.iloc[:, exposure_var]).flatten() + exposure_var = pd.Index([X.iloc[:, exposure_var].name]) + # drop the offset and exposure columns from X + columns_to_drop = [] + if offset_var is not None: + columns_to_drop.append(offset_var[0]) + if exposure_var is not None: + columns_to_drop.append(exposure_var[0]) + if columns_to_drop: + X = X.drop(columns_to_drop, axis=1) + if self._add_constant: X_ = add_constant(X) + if rtn_off_exp_arr: + return X_, offset_arr, exposure_arr return X_ else: + if rtn_off_exp_arr: + return X, offset_arr, exposure_arr return X @classmethod diff --git a/skpro/regression/tests/test_glm.py b/skpro/regression/tests/test_glm.py index 8ca263ac3..b1435dc4f 100644 --- a/skpro/regression/tests/test_glm.py +++ b/skpro/regression/tests/test_glm.py @@ -50,7 +50,7 @@ def test_glm_with_offset_exposure(): X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) glm_reg = GLMRegressor( - family="Normal", link="Log", offset_var=pd.Index(["off"]), exposure_var=-1 + family="Normal", link="Log", offset_var="off", exposure_var=-1 ) glm_reg.fit(X_train, y_train) y_pred = glm_reg.predict(X_test) From 2a1044d3f7e2882a17bc743e356ba2d572501fa3 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 20 Jun 2024 13:40:13 +0530 Subject: [PATCH 18/19] order unchanged parameters back to default. --- skpro/regression/linear/_glm.py | 91 ++++++++++----------------------- 1 file changed, 26 insertions(+), 65 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 134140842..4461ea407 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -301,19 +301,19 @@ def _str_to_sm_family(self, family, link): def __init__( self, - missing="4", - start_params="5", - maxiter="6", - method="7", - tol="8", - scale="9", - cov_type="10", - cov_kwds="11", - use_t="12", - full_output="13", - disp="14", - max_start_irls="15", - add_constant="16", + missing="none", + start_params=None, + maxiter=100, + method="IRLS", + tol=1e-8, + scale=None, + cov_type="nonrobust", + cov_kwds=None, + use_t=None, + full_output=True, + disp=False, + max_start_irls=3, + add_constant=False, family="0", link="1", offset_var="2", @@ -359,58 +359,19 @@ def __init__( self._exposure_var = None else: self._exposure_var = exposure_var - if missing == "4": - self._missing = "none" - else: - self._missing = missing - if start_params == "5": - self._start_params = None - else: - self._start_params = start_params - if maxiter == "6": - self._maxiter = 100 - else: - self._maxiter = maxiter - if method == "7": - self._method = "IRLS" - else: - self._method = method - if tol == "8": - self._tol = 1e-8 - else: - self._tol = self.tol - if scale == "9": - self._scale = None - else: - self._scale = scale - if cov_type == "10": - self._cov_type = "nonrobust" - else: - self._cov_type = cov_type - if cov_kwds == "11": - self._cov_kwds = None - else: - self._cov_kwds = cov_kwds - if use_t == "12": - self._use_t = None - else: - self._use_t = use_t - if full_output == "13": - self._full_output = True - else: - self._full_output = full_output - if disp == "14": - self._disp = False - else: - self._disp = disp - if max_start_irls == "15": - self._max_start_irls = 3 - else: - self._max_start_irls = max_start_irls - if add_constant == "16": - self._add_constant = False - else: - self._add_constant = add_constant + self._missing = self.missing + self._start_params = self.start_params + self._maxiter = self.maxiter + self._method = self.method + self._tol = self.tol + self._scale = self.scale + self._cov_type = self.cov_type + self._cov_kwds = self.cov_kwds + self._use_t = self.use_t + self._full_output = self.full_output + self._disp = self.disp + self._max_start_irls = self.max_start_irls + self._add_constant = self.add_constant from warnings import warn From 3a0a621477b3fc57cffd83851adce3a9b7a29137 Mon Sep 17 00:00:00 2001 From: ShreeshaM07 Date: Thu, 20 Jun 2024 20:58:40 +0530 Subject: [PATCH 19/19] init modified --- skpro/regression/linear/_glm.py | 32 ++++++++++---------------------- 1 file changed, 10 insertions(+), 22 deletions(-) diff --git a/skpro/regression/linear/_glm.py b/skpro/regression/linear/_glm.py index 4461ea407..7004c9851 100644 --- a/skpro/regression/linear/_glm.py +++ b/skpro/regression/linear/_glm.py @@ -314,10 +314,10 @@ def __init__( disp=False, max_start_irls=3, add_constant=False, - family="0", - link="1", - offset_var="2", - exposure_var="3", + family="Normal", + link=None, + offset_var=None, + exposure_var=None, ): # The default values of the parameters # are replaced with the changed sequence @@ -343,22 +343,10 @@ def __init__( self.max_start_irls = max_start_irls self.add_constant = add_constant - if family == "0": - self._family = "Normal" - else: - self._family = family - if link == "1": - self._link = None - else: - self._link = link - if offset_var == "2": - self._offset_var = None - else: - self._offset_var = offset_var - if exposure_var == "3": - self._exposure_var = None - else: - self._exposure_var = exposure_var + self._family = self.family + self._link = self.link + self._offset_var = self.offset_var + self._exposure_var = self.exposure_var self._missing = self.missing self._start_params = self.start_params self._maxiter = self.maxiter @@ -377,12 +365,12 @@ def __init__( warn( "Note: in `GLMRegressor`, the sequence of the parameters will change " - "in skpro version 2.4.0. It will be as per the order present in the" + "in skpro version 2.5.0. It will be as per the order present in the" "current docstring with the top one being the first parameter.\n" "The defaults for the parameters will remain same and " "there will be no changes.\n" "Please use the `kwargs` calls instead of positional calls for the" - "parameters until the release of skpro 2.4.0 " + "parameters until the release of skpro 2.5.0 " "as this will avoid any discrepancies." )