Update shared_sparsity_selection.py
Updates to improve robustness and quality of life:
- Addition of a max_norm constraint to avoid overflow/non-sparse outcomes under the nonconvex regularizer (this involves a minor change to the optimization algorithm). The theory includes this parameter, but we omitted it from the original version since it only seems to matter in extreme circumstances where the hyperparameters are poorly chosen. An "auto" option is provided for easy use, along with "none" to disable the constraint.
- Addition of an "auto" option for the mcp_alpha parameter, since this parameter may be somewhat mysterious to some users.
- Addition of an include_bias option, defaulted to True. This subtracts the mean from each group of covariates and outcomes (something we typically did outside the module), which is equivalent to adding an unregularized bias term to the regressions.
- Replacement of the "auto" option for mcp_lambda with a proportional_lambda option which, if set to True, interprets the value of mcp_lambda as a constant of proportionality applied to the previous "auto" expression. In practice, the coefficient of the "auto" option almost always needed tuning, but the analytic expression it uses was otherwise extremely helpful. A usage sketch of the new options appears after this list.
kgreenewald authored May 31, 2023
1 parent 9aea4dc commit d52d8b2
Showing 1 changed file with 62 additions and 13 deletions.
@@ -11,11 +11,10 @@

class MCPSelector:
# TODO: transpose `theta` and rename it `coef_` to align with sklearn models
def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
def __init__(self, lmda=0.01, alpha="auto", max_norm="auto", step=0.1, max_iter=1000, tol=1e-3, proportional_lambda=True, include_bias=True):
"""Constructor for MCPSelector. This class computes shared
sparsity matrix using proximal gradient descent applied with
MCP regularizer.
Args:
lmda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
The bigger the value the stronger the regularization.
@@ -27,46 +26,80 @@ def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
tol (float): Stopping criterion for MCP. If the normalized value of
proximal gradient is less than tol then the algorithm is assumed
to have converged.
proportional_lambda (bool): Interpret the mcp_lambda parameter as a constant of proportionality for the theory-driven optimal lambda rate. This makes tuning simpler across changing
numbers of samples, covariates, actions, etc.
include_bias (bool): If True, include bias terms in the regressions that are not regularized.
"""
super().__init__()
self.alpha = alpha
self.R = max_norm
self.lmda = lmda
self.lmda_proportional = proportional_lambda
self.step = step
self.max_iter = max_iter
self.tol = tol
self.epsilon_safe_division = 1e-6
self.include_bias = include_bias

def _initialize_internal_params(self, X, a, y):
treatments = list(a.unique())
n_treatments = len(treatments)
n_confounders = X.shape[1]
if self.lmda == "auto":
avg_num_samples = len(y) / n_treatments
lmda = 0.2 * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples)
avg_num_samples = len(y) / n_treatments
if self.lmda_proportional:
lmda = self.lmda * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples)
else:
lmda = self.lmda

assert self.alpha >= 0
if self.alpha == "auto":
min_var = np.Inf
for i, t in enumerate(treatments):
u = X.loc[a == t].values.T
tmp = np.min(np.std(u, axis=1))
if tmp > 0:
min_var = min(min_var, tmp)
alpha = 1 / (0.25 * min_var)
else:
alpha = self.alpha

if self.alpha == 0 or self.R == "none":
R = np.inf
else:
if self.R == "auto":
R = 1 * np.sqrt(n_treatments) / (8 * lmda)
else:
R = self.R

assert alpha >= 0
assert R > 0
assert lmda >= 0

corr_xx = np.zeros((n_confounders, n_confounders, n_treatments))
corr_xa = np.zeros((n_confounders, n_treatments))
for i, t in enumerate(treatments):
u, v = X.loc[a == t].values.T, y.loc[a == t].values
if self.include_bias:
u = u - np.mean(u, axis=1, keepdims=True) @ np.ones((1, u.shape[1]))
v = v - np.mean(v)
corr_xx[:, :, i] = (u @ u.T) / len(v)
corr_xa[:, i] = (u @ v) / len(v)
self.theta_ = np.zeros((n_confounders, n_treatments))
self.corr_mats_ = {"corr_xx": corr_xx, "corr_xa": corr_xa} # Correlation matrices
self.n_confounders_ = n_confounders
self.n_treatments_ = n_treatments
self.lmda_ = lmda
self.R_ = R
self.alpha_ = alpha

def _mcp_q_grad(self, t):

# Minimax Concave Penalty (MCP) regularizer
def _mcp_q_grad(self, t):
if self.alpha == 0:
return np.sign(t) * self.lmda_ * (-1)
else:
return (np.sign(t) * self.lmda_
* np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha)))
* np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha_)))

def _update_grad_op(self, theta):
theta_new = np.zeros_like(theta)
Expand All @@ -83,9 +116,19 @@ def _update_grad_op(self, theta):

def _update_proximal_op(self, theta):
norm_theta = np.linalg.norm(theta, axis=1, keepdims=True)
# Do Proximal operator, bearing in mind max_norm constraint
shrink = np.maximum(0, norm_theta - self.lmda_)
if np.sum(shrink) > self.R_:
# First take a large, safe step
shrink = np.maximum(0, shrink - (np.sum(shrink) - self.R_) / len(shrink))
# Then iteratively step until the constraint is satisfied
while np.sum(shrink) > self.R_:
shrink = np.maximum(0, shrink - 0.05 * self.lmda_)

theta_proximal_op = theta * ((shrink / norm_theta)
@ np.ones((1, self.n_treatments_)))


return theta_proximal_op

def compute_shared_sparsity_matrix(self, X, a, y):
@@ -114,16 +157,16 @@ class SharedSparsityConfounderSelection(_BaseConfounderSelection):
Method by Greenewald, Katz-Rogozhnikov, and Shanmugam: https://arxiv.org/abs/2011.01979
"""

def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6,
def __init__(self, mcp_lambda=0.01, mcp_alpha="auto", max_norm="auto", step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6, proportional_lambda=True, include_bias=True,
importance_getter=None, covariates=None):
"""Constructor for SharedSparsityConfounderSelection
Args:
mcp_lambda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
mcp_lambda (float): Parameter (>= 0) to control the shape of the Minimax Concave Penalty (MCP) regularizer.
The bigger the value the stronger the regularization.
"auto" will auto-select good regularization value.
mcp_alpha (float): Associated lambda parameter (>= 0) to control shape of MCP regularizer.
The smaller the value the stronger the regularization.
max_norm (float): Maximum (1,2)-norm allowed for coefficient vector, used if alpha \neq 0. Decrease if the optimization does not converge.
"auto" will auto-select a value. "none" means do not use.
step (float): Step size for proximal gradient, equivalent of learning rate.
max_iter (int): Maximum number of iterations of MCP proximal gradient.
tol (float): Stopping criterion for MCP. If the normalized value of
@@ -132,6 +175,9 @@
threshold (float): Only if the importance of a confounder exceeds
threshold for all values of treatments, then the confounder
is retained by transform() call.
proportional_lambda (bool): Interpret the mcp_lambda parameter as a constant of proportionality for the theory-driven optimal lambda rate. This makes tuning simpler across changing
numbers of samples, covariates, actions, etc.
include_bias (bool): If True, include bias terms in the regressions that are not regularized.
importance_getter: IGNORED.
covariates (list | np.ndarray): Specifying a subset of columns to perform selection on.
Columns in `X` but not in `covariates` will be included after `transform`
@@ -140,13 +186,14 @@
or anything compatible with pandas `loc` function for columns.
if `None` then all columns are participating in the selection process.
This is similar to using sklearn's `ColumnTransformer` or `make_column_selector`.
"""
super().__init__(importance_getter=importance_getter, covariates=covariates)
self.step = step
self.max_iter = max_iter
self.tol = tol
self.threshold = threshold
self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, step=step, max_iter=max_iter, tol=tol)
self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, max_norm=max_norm, step=step, max_iter=max_iter, tol=tol, proportional_lambda=proportional_lambda, include_bias=include_bias)

self.importance_getter = lambda e: e.selector_.theta_.transpose() # Shape to behave like sklearn linear_model
# self.importance_getter = "selector_.theta_"
@@ -174,3 +221,5 @@ def fit(self, X, a, y):

def _get_support_mask(self):
return self.support_
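For readers skimming the diff, here is a standalone sketch of the new norm-budget step in the proximal operator; the function name, example numbers, and the zero-safe division are illustrative, not part of the committed code:

```python
import numpy as np

def prox_group_soft_threshold_with_budget(theta, lmda, R):
    """Group soft-threshold each row of theta, then shrink further until the
    total (1,2)-norm fits within the max_norm budget R."""
    # theta has shape (n_confounders, n_treatments); each row is one group.
    norm_theta = np.linalg.norm(theta, axis=1, keepdims=True)
    shrink = np.maximum(0, norm_theta - lmda)  # per-row soft-threshold
    if np.sum(shrink) > R:
        # First a large step that cannot push any row norm below zero...
        shrink = np.maximum(0, shrink - (np.sum(shrink) - R) / len(shrink))
        # ...then small steps until the norm budget is satisfied.
        while np.sum(shrink) > R:
            shrink = np.maximum(0, shrink - 0.05 * lmda)
    # Rescale each row; rows whose norm was zero stay zero.
    scale = np.divide(shrink, norm_theta, out=np.zeros_like(shrink), where=norm_theta > 0)
    return theta * scale

theta = np.array([[0.9, -0.4], [0.1, 0.05], [1.2, 0.8]])
print(prox_group_soft_threshold_with_budget(theta, lmda=0.1, R=1.5))
```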

