diff --git a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
index 4ecd7934..9ce6048e 100644
--- a/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
+++ b/causallib/contrib/shared_sparsity_selection/shared_sparsity_selection.py
@@ -11,15 +11,13 @@ class MCPSelector:
     # TODO: transpose `theta` and rename it `coef_` to align with sklearn models
-    def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
+    def __init__(self, lmda=0.01, alpha="auto", max_norm="auto", step=0.1, max_iter=1000, tol=1e-3, proportional_lambda=True, include_bias=True):
         """Constructor for MCPSelector.
         This class computes shared sparsity matrix using proximal gradient descent
         applied with MCP regularizer.
-
         Args:
-            lmda (str|float): Parameter (>= 0) to control shape of MCP regularizer.
+            lmda (float): Parameter (>= 0) to control shape of MCP regularizer.
                 The bigger the value the stronger the regularization.
-                "auto" will auto-select good regularization value.
             alpha (float): Associated lambda parameter (>= 0) to control shape of
                 MCP regularizer. The smaller the value the stronger the regularization.
             step (float): Step size for proximal gradient, equivalent of learning rate.
@@ -27,32 +25,62 @@ def __init__(self, lmda="auto", alpha=1, step=0.1, max_iter=1000, tol=1e-3):
             tol (float): Stopping criterion for MCP. If the normalized value of
                 proximal gradient is less than tol then the algorithm is assumed
                 to have converged.
+            proportional_lambda (bool): Interpret the lmda parameter as a constant of
+                proportionality for the theory-driven optimal lambda rate. Makes tuning
+                simpler across changing numbers of samples, covariates, actions, etc.
+            include_bias (bool): If True, include bias terms in the regressions that are
+                not regularized.
""" super().__init__() self.alpha = alpha + self.R = max_norm self.lmda = lmda + self.lmda_proportional = proportional_lambda self.step = step self.max_iter = max_iter self.tol = tol self.epsilon_safe_division = 1e-6 + self.include_bias = True def _initialize_internal_params(self, X, a, y): treatments = list(a.unique()) n_treatments = len(treatments) n_confounders = X.shape[1] - if self.lmda == "auto": - avg_num_samples = len(y) / n_treatments - lmda = 0.2 * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples) + avg_num_samples = len(y) / n_treatments + if self.lmda_proportional: + lmda = self.lmda * np.sqrt(n_treatments * np.log(n_confounders) / avg_num_samples) else: lmda = self.lmda - assert self.alpha >= 0 + if self.alpha == "auto": + min_var = np.inf + for i,t in enumerate(treatments): + u = X.loc[a == t].values.T + + tmp = np.min(np.std(u,axis=1)) + if tmp > 0: + min_var = min(min_var, tmp) + alpha = 1 / (.25 * min_var) + else: + alpha = self.alpha + + if self.alpha == 0 or self.R == "none": + R = np.inf + else: + if self.R == "auto": + R = 1 * np.sqrt(n_treatments) / (8 * lmda) + else: + R = self.R + + assert alpha >= 0 + assert R > 0 assert lmda >= 0 corr_xx = np.zeros((n_confounders, n_confounders, n_treatments)) corr_xa = np.zeros((n_confounders, n_treatments)) for i, t in enumerate(treatments): u, v = X.loc[a == t].values.T, y.loc[a == t].values + if self.include_bias: + u = u - np.mean(u,axis=1,keepdims=True) @ np.ones((1,u.shape[1])) + v = v - np.mean(v) corr_xx[:, :, i] = (u @ u.T) / len(v) corr_xa[:, i] = (u @ v) / len(v) self.theta_ = np.zeros((n_confounders, n_treatments)) @@ -60,13 +88,17 @@ def _initialize_internal_params(self, X, a, y): self.n_confounders_ = n_confounders self.n_treatments_ = n_treatments self.lmda_ = lmda + self.R_ = R + self.alpha_ = alpha - def _mcp_q_grad(self, t): + + #Minimax Convex Penalty (MCP) regularizer + def _mcp_q_grad(self, t): if self.alpha == 0: return np.sign(t) * self.lmda_ * (-1) else: return (np.sign(t) * self.lmda_ - * np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha))) + * np.maximum(-1, - np.abs(t) / (self.lmda_ * self.alpha_))) def _update_grad_op(self, theta): theta_new = np.zeros_like(theta) @@ -83,9 +115,19 @@ def _update_grad_op(self, theta): def _update_proximal_op(self, theta): norm_theta = np.linalg.norm(theta, axis=1, keepdims=True) + # Do Proximal operator, bearing in mind max_norm constraint shrink = np.maximum(0, norm_theta - self.lmda_) + if np.sum(shrink) > self.R_: + #first take a large safe step + shrink = np.maximum(0, shrink - (np.sum(shrink) - self.R_)/len(shrink) ) + #Then iteratively step until constraint is satisfied + while np.sum(shrink) > self.R_: + shrink = np.maximum(0, shrink - .05 * self.lmda_) + theta_proximal_op = theta * ((shrink / norm_theta) @ np.ones((1, self.n_treatments_))) + + return theta_proximal_op def compute_shared_sparsity_matrix(self, X, a, y): @@ -114,16 +156,16 @@ class SharedSparsityConfounderSelection(_BaseConfounderSelection): Method by Greenewald, Katz-Rogozhnikov, and Shanmugam: https://arxiv.org/abs/2011.01979 """ - def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6, + def __init__(self, mcp_lambda=0.01, mcp_alpha="auto", max_norm = "auto", step=0.1, max_iter=1000, tol=1e-3, threshold=1e-6,proportional_lambda=True,include_bias = True, importance_getter=None, covariates=None): """Constructor for SharedSparsityConfounderSelection - Args: - mcp_lambda (str|float): Parameter (>= 0) to control shape of MCP regularizer. 
+            mcp_lambda (float): Parameter (>= 0) to control shape of the
+                Minimax Concave Penalty (MCP) regularizer.
                 The bigger the value the stronger the regularization.
-                "auto" will auto-select good regularization value.
             mcp_alpha (float): Associated lambda parameter (>= 0) to control shape of
                 MCP regularizer. The smaller the value the stronger the regularization.
+            max_norm (str|float): Maximum (1,2)-norm allowed for the coefficient matrix,
+                used if alpha != 0. Decrease if the optimization does not converge.
+                "auto" will auto-select a value. "none" means do not use.
             step (float): Step size for proximal gradient, equivalent of learning rate.
             max_iter (int): Maximum number of iterations of MCP proximal gradient.
             tol (float): Stopping criterion for MCP. If the normalized value of
@@ -132,6 +174,9 @@ def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=
             threshold (float): Only if the importance of a confounder exceeds threshold
                 for all values of treatments, then the confounder is retained by
                 transform() call.
+            proportional_lambda (bool): Interpret the mcp_lambda parameter as a constant
+                of proportionality for the theory-driven optimal lambda rate. Makes tuning
+                simpler across changing numbers of samples, covariates, actions, etc.
+            include_bias (bool): If True, include bias terms in the regressions that are
+                not regularized.
             importance_getter: IGNORED.
             covariates (list | np.ndarray): Specifying a subset of columns to perform selection on.
                 Columns in `X` but not in `covariates` will be included after `transform`
@@ -140,13 +185,14 @@ def __init__(self, mcp_lambda="auto", mcp_alpha=1, step=0.1, max_iter=1000, tol=
                 or anything compatible with pandas `loc` function for columns.
                 if `None` then all columns are participating in the selection process.
                 This is similar to using sklearn's `ColumnTransformer` or `make_column_selector`.
+ """ super().__init__(importance_getter=importance_getter, covariates=covariates) self.step = step self.max_iter = max_iter self.tol = tol self.threshold = threshold - self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, step=step, max_iter=max_iter, tol=tol) + self.selector_ = MCPSelector(lmda=mcp_lambda, alpha=mcp_alpha, max_norm = max_norm, step=step, max_iter=max_iter, tol=tol, proportional_lambda=proportional_lambda,include_bias=include_bias) self.importance_getter = lambda e: e.selector_.theta_.transpose() # Shape to behave like sklearn linear_model # self.importance_getter = "selector_.theta_" @@ -174,3 +220,5 @@ def fit(self, X, a, y): def _get_support_mask(self): return self.support_ + + diff --git a/causallib/contrib/tests/test_shared_sparsity_selection.py b/causallib/contrib/tests/test_shared_sparsity_selection.py index 1c486310..1378fbf5 100644 --- a/causallib/contrib/tests/test_shared_sparsity_selection.py +++ b/causallib/contrib/tests/test_shared_sparsity_selection.py @@ -85,15 +85,16 @@ def test_alphas(self): def test_lambdas(self): X, a, y = self.make_xay(6, 4, n_samples=100, seed=1) - with self.subTest("Automatic (default) lambda"): - sss = SharedSparsityConfounderSelection(mcp_lambda="auto") - sss.fit(X, a, y) - expected = 0.2 * np.sqrt(2 * np.log(X.shape[1]) / (X.shape[0] / 2)) - self.assertAlmostEqual(sss.selector_.lmda_, expected) + # with self.subTest("Automatic (default) lambda"): + # sss = SharedSparsityConfounderSelection(mcp_lambda="auto") + # sss.fit(X, a, y) + # expected = 0.2 * np.sqrt(2 * np.log(X.shape[1]) / (X.shape[0] / 2)) + # self.assertAlmostEqual(sss.selector_.lmda_, expected) with self.subTest("Pre-specified lambda"): lmda = 2.1 - sss = SharedSparsityConfounderSelection(mcp_lambda=lmda) + # `proportional_lambda=False` should take lambda as is without changing it + sss = SharedSparsityConfounderSelection(mcp_lambda=lmda, proportional_lambda=False) sss.fit(X, a, y) self.assertEqual(sss.selector_.lmda_, lmda) @@ -107,6 +108,13 @@ def test_lambdas(self): strong = SharedSparsityConfounderSelection(mcp_lambda=1).fit(X, a, y).selector_.theta_ self.assertLess(np.linalg.norm(strong), np.linalg.norm(weak)) + with self.subTest("Shrinkage under n>p binary treatment setting with proportional lambda"): + lmda = 1 + # `proportional_lambda=True` should reduce lambda + sss = SharedSparsityConfounderSelection(mcp_lambda=lmda, proportional_lambda=True) + sss.fit(X, a, y) + self.assertLess(sss.selector_.lmda_, lmda) + def test_max_iter(self): X, a, y = self.make_xay(6, 4, n_samples=100, seed=1)