Update shared_sparsity_selection.py
Updates to improve robustness and quality of life:
- Addition of a max_norm constraint to avoid overflow/non-sparse outcomes under the nonconvex regularizer (this involves a minor change to the optimization algorithm). The theory includes this parameter, but we omitted it from the original version since it only seems to matter in extreme circumstances where the hyperparameters are poorly chosen. An "auto" option is provided for easy use, along with "none" to disable the constraint.
- Addition of an "auto" option for the mcp_alpha parameter, since this parameter may be somewhat mysterious to some users.
- Addition of an include_bias option, defaulted to True. This subtracts the mean from each group of covariates and outcomes (something we typically did outside the module), which is equivalent to adding an unregularized bias term to the regressions.
- Replacement of the "auto" option for mcp_lambda with a proportional_lambda option which, if set to True, interprets the value of mcp_lambda as a constant of proportionality applied to the previous "auto" expression. In practice, the coefficient of the "auto" option almost always needed tuning, but the analytic expression it uses was otherwise extremely helpful. A usage sketch of the new options appears after this list.
kgreenewald authored May 31, 2023
1 parent 9aea4dc commit d52d8b2
Showing 1 changed file with 62 additions and 13 deletions.
@@ -11,11 +11,10 @@

class MCPSelector:
# TODO: transpose `theta` and rename it `coef_` to align with sklearn models
def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
def __init__(self, lmda=0.01, alpha="auto", max_norm="auto", step=0.1, max_iter=1000, tol=1e-3, proportional_lambda=True, include_bias=True):
"""Constructor for MCPSelector. This class computes shared
sparsity matrix using proximal gradient descent applied with
MCP regularizer.
Args:
lmda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
The bigger the value the stronger the regularization.
@@ -27,46 +26,80 @@ def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
tol (float): Stopping criterion for MCP. If the normalized value of
proximal gradient is less than tol then the algorithm is assumed
to have converged.
proportional_lambda (bool): Interpret the mcp_lambda parameter as a constant of proportionality for the theory-driven optimal lambda rate. This makes tuning simpler across changing
numbers of samples, covariates, actions, etc.
include_bias (bool): If True, include bias terms in the regressions that are not regularized.
"""
super().__init__()
self.alpha = alpha
self.R = max_norm
self.lmda = lmda
self.lmda_proportional = proportional_lambda
self.step = step
self.max_iter = max_iter
self.tol = tol
self.epsilon_safe_division = 1e-6
self.include_bias = include_bias

def _initialize_internal_params(self, X, a, y):
treatments = list(a.unique())
n_treatments = len(treatments)
n_confounders = X.shape[1]
if self.lmda == "auto":
avg_num_samples = len(y) / n_treatments
lmda = 0.2 * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples)
avg_num_samples = len(y) / n_treatments
if self.lmda_proportional:
lmda = self.lmda * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples)
else:
lmda = self.lmda

assert self.alpha >= 0
if self.alpha == "auto":
min_var = np.Inf
for i, t in enumerate(treatments):
u = X.loc[a == t].values.T
tmp = np.min(np.std(u, axis=1))
if tmp > 0:
min_var = min(min_var, tmp)
alpha = 1 / (0.25 * min_var)
else:
alpha = self.alpha

if self.alpha == 0 or self.R == "none":
R = np.inf
else:
if self.R == "auto":
R = 1 * np.sqrt(n_treatments) / (8 * lmda)
else:
R = self.R

assert alpha >= 0
assert R > 0
assert lmda >= 0

corr_xx = np.zeros((n_confounders, n_confounders, n_treatments))
corr_xa = np.zeros((n_confounders, n_treatments))
for i, t in enumerate(treatments):
u, v = X.loc[a == t].values.T, y.loc[a == t].values
if self.include_bias:
u = u - np.mean(u, axis=1, keepdims=True) @ np.ones((1, u.shape[1]))
v = v - np.mean(v)
corr_xx[:, :, i] = (u @ u.T) / len(v)
corr_xa[:, i] = (u @ v) / len(v)
self.theta_ = np.zeros((n_confounders, n_treatments))
self.corr_mats_ = {"corr_xx": corr_xx, "corr_xa": corr_xa} # Correlation matrices
self.n_confounders_ = n_confounders
self.n_treatments_ = n_treatments
self.lmda_ = lmda
self.R_ = R
self.alpha_ = alpha

def _mcp_q_grad(self, t):

# Minimax Concave Penalty (MCP) regularizer
def _mcp_q_grad(self, t):
if self.alpha == 0:
return np.sign(t) * self.lmda_ * (-1)
else:
return (np.sign(t) * self.lmda_
* np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha)))
* np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha_)))

def _update_grad_op(self, theta):
theta_new = np.zeros_like(theta)
Expand All @@ -83,9 +116,19 @@ def _update_grad_op(self, theta):

def _update_proximal_op(self, theta):
norm_theta = np.linalg.norm(theta, axis=1, keepdims=True)
# Do Proximal operator, bearing in mind max_norm constraint
shrink = np.maximum(0, norm_theta - self.lmda_)
if np.sum(shrink) > self.R_:
# First take a large, safe step
shrink = np.maximum(0, shrink - (np.sum(shrink) - self.R_) / len(shrink))
# Then iteratively step until the constraint is satisfied
while np.sum(shrink) > self.R_:
shrink = np.maximum(0, shrink - 0.05 * self.lmda_)

theta_proximal_op = theta * ((shrink / norm_theta)
@ np.ones((1, self.n_treatments_)))


return theta_proximal_op

def compute_shared_sparsity_matrix(self, X, a, y):
@@ -114,16 +157,16 @@ class SharedSparsityConfounderSelection(_BaseConfounderSelection):
Method by Greenewald, Katz-Rogozhnikov, and Shanmugam: https://arxiv.org/abs/2011.01979
"""

def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6,
def __init__(self, mcp_lambda=0.01, mcp_alpha="auto", max_norm="auto", step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6, proportional_lambda=True, include_bias=True,
importance_getter=None, covariates=None):
"""Constructor for SharedSparsityConfounderSelection
Args:
mcp_lambda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
mcp_lambda (float): Parameter (>= 0) to control the shape of the Minimax Concave Penalty (MCP) regularizer.
The bigger the value the stronger the regularization.
"auto" will auto-select good regularization value.
mcp_alpha (float): Associated lambda parameter (>= 0) to control shape of MCP regularizer.
The smaller the value the stronger the regularization.
max_norm (float): Maximum (1,2)-norm allowed for coefficient vector, used if alpha \neq 0. Decrease if the optimization does not converge.
"auto" will auto-select a value. "none" means do not use.
step (float): Step size for proximal gradient, equivalent of learning rate.
max_iter (int): Maximum number of iterations of MCP proximal gradient.
tol (float): Stopping criterion for MCP. If the normalized value of
@@ -132,6 +175,9 @@
threshold (float): Only if the importance of a confounder exceeds
threshold for all values of treatments, then the confounder
is retained by transform() call.
proportional_lambda (bool): Interpret the mcp_lambda parameter as a constant of proportionality for the theory-driven optimal lambda rate. This makes tuning simpler across changing
numbers of samples, covariates, actions, etc.
include_bias (bool): If True, include bias terms in the regressions that are not regularized.
importance_getter: IGNORED.
covariates (list | np.ndarray): Specifying a subset of columns to perform selection on.
Columns in `X` but not in `covariates` will be included after `transform`
@@ -140,13 +186,14 @@
or anything compatible with pandas `loc` function for columns.
if `None` then all columns are participating in the selection process.
This is similar to using sklearn's `ColumnTransformer` or `make_column_selector`.
"""
super().__init__(importance_getter=importance_getter, covariates=covariates)
self.step = step
self.max_iter = max_iter
self.tol = tol
self.threshold = threshold
self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, step=step, max_iter=max_iter, tol=tol)
self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, max_norm=max_norm, step=step, max_iter=max_iter, tol=tol, proportional_lambda=proportional_lambda, include_bias=include_bias)

self.importance_getter = lambda e: e.selector_.theta_.transpose() # Shape to behave like sklearn linear_model
# self.importance_getter = "selector_.theta_"
@@ -174,3 +221,5 @@ def fit(self, X, a, y):

def _get_support_mask(self):
return self.support_
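For readers skimming the diff, here is a standalone sketch of the new norm-budget step in the proximal operator; the function name, example numbers, and the zero-safe division are illustrative, not part of the committed code:

```python
import numpy as np

def prox_group_soft_threshold_with_budget(theta, lmda, R):
    """Group soft-threshold each row of theta, then shrink further until the
    total (1,2)-norm fits within the max_norm budget R."""
    # theta has shape (n_confounders, n_treatments); each row is one group.
    norm_theta = np.linalg.norm(theta, axis=1, keepdims=True)
    shrink = np.maximum(0, norm_theta - lmda)  # per-row soft-threshold
    if np.sum(shrink) > R:
        # First a large step that cannot push any row norm below zero...
        shrink = np.maximum(0, shrink - (np.sum(shrink) - R) / len(shrink))
        # ...then small steps until the norm budget is satisfied.
        while np.sum(shrink) > R:
            shrink = np.maximum(0, shrink - 0.05 * lmda)
    # Rescale each row; rows whose norm was zero stay zero.
    scale = np.divide(shrink, norm_theta, out=np.zeros_like(shrink), where=norm_theta > 0)
    return theta * scale

theta = np.array([[0.9, -0.4], [0.1, 0.05], [1.2, 0.8]])
print(prox_group_soft_threshold_with_budget(theta, lmda=0.1, R=1.5))
```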

