API Reference, Stable versioning, Dev resources and standards #222

Merged 19 commits on Dec 29, 2023.
2 changes: 1 addition & 1 deletion .pylintrc
@@ -82,7 +82,7 @@ persistent=yes

# Minimum Python version to use for version dependent checks. Will default to
# the version used to run pylint.
py-version=3.7
py-version=3.8

# Discover python modules and packages in the file system subtree.
recursive=no
11 changes: 7 additions & 4 deletions CONTRIBUTING.md
@@ -9,10 +9,13 @@ If you would like to implement a new feature or a bug, please make sure you (or

### Creating a Pull Request
1. [Fork](https://github.com/cnellington/Contextualized/fork) this repository.
2. Make your code changes locally.
3. Check the style using pylint and black following [these steps](https://github.com/cnellington/Contextualized/pull/111#issue-1323230194).
4. (Optional) Include your name in alphabetical order in [ACKNOWLEDGEMENTS.md](https://github.com/cnellington/Contextualized/blob/main/ACKNOWLEDGEMENTS.md).
5. Issue a PR to merge your changes into the `dev` branch.
2. Install locally with `pip install -e .`.
3. Install extra developer dependencies with `pip install -r dev_requirements.txt`.
4. Make your code changes locally.
5. Automatically format your code and check for style issues by running `format_style.sh`. We are working on linting the entire repo, but please make sure your code is cleared by pylint.
6. Automatically update our documentation by running `update_docs.sh`.
7. (Optional) Include your name in alphabetical order in [ACKNOWLEDGEMENTS.md](https://github.com/cnellington/Contextualized/blob/main/ACKNOWLEDGEMENTS.md).
8. Issue a PR to merge your changes into the `main` branch.


## Issues
9 changes: 6 additions & 3 deletions README.md
@@ -1,4 +1,4 @@
![Preview](contextualized_logo.png)
![Preview](docs/logo.png)
#

![License](https://img.shields.io/github/license/cnellington/contextualized.svg?style=flat-square)
@@ -10,7 +10,7 @@
<a href="https://github.com/psf/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>


A statistical machine learning toolbox for estimating models, distributions, and functions with context-specific parameters.
An easy-to-use machine learning toolbox for estimating models, distributions, and functions with context-specific parameters.

Context-specific parameters:
- Find hidden heterogeneity in data -- are all samples the same?
@@ -66,13 +66,16 @@ Feel free to add your own page(s) by sending a PR or request an improvement by c
<img src="https://contributors-img.web.app/image?repo=cnellington/contextualized" />
</a>

ContextualizedML was originally implemented by [Caleb Ellington](https://calebellington.com/) (CMU) and [Ben Lengerich](http://web.mit.edu/~blengeri/www) (MIT).
Contextualized ML was originally implemented by [Caleb Ellington](https://calebellington.com/) (CMU) and [Ben Lengerich](http://web.mit.edu/~blengeri/www) (MIT).

Many people have helped. Check out [ACKNOWLEDGEMENTS.md](https://github.com/cnellington/Contextualized/blob/main/ACKNOWLEDGEMENTS.md)!



## Related Publications and Pre-prints
- [Contextualized Machine Learning](https://arxiv.org/abs/2310.11340)
- [Contextualized Networks Reveal Heterogeneous Transcriptomic Regulation in Tumors at Sample-Specific Resolution](https://www.biorxiv.org/content/10.1101/2023.12.01.569658v1)
- [Contextualized Policy Recovery: Modeling and Interpreting Medical Decisions with Adaptive Imitation Learning](https://arxiv.org/abs/2310.07918)
- [Automated Interpretable Discovery of Heterogeneous Treatment Effectiveness: A COVID-19 Case Study](https://www.sciencedirect.com/science/article/pii/S1532046422001022)
- [NOTMAD: Estimating Bayesian Networks with Sample-Specific Structures and Parameters](http://arxiv.org/abs/2111.01104)
- [Discriminative Subtyping of Lung Cancers from Histopathology Images via Contextual Deep Learning](https://www.medrxiv.org/content/10.1101/2020.06.25.20140053v1.abstract)
5 changes: 5 additions & 0 deletions contextualized/analysis/__init__.py
@@ -12,3 +12,8 @@
plot_homogeneous_predictor_effects,
plot_heterogeneous_predictor_effects,
)
from contextualized.analysis.pvals import (
calc_homogeneous_context_effects_pvals,
calc_homogeneous_predictor_effects_pvals,
calc_heterogeneous_predictor_effects_pvals,
)
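
A minimal usage sketch of the newly re-exported p-value helpers. Their signatures are not part of this diff, so the `(model, C)` calling pattern, the toy data, and the variable names below are assumptions for illustration only:

```python
# Hedged sketch only: the exact signatures of the pvals helpers are not
# shown in this diff; the (model, C) arguments below are assumptions.
import numpy as np
from contextualized.easy import ContextualizedRegressor
from contextualized.analysis import (
    calc_homogeneous_context_effects_pvals,
    calc_homogeneous_predictor_effects_pvals,
    calc_heterogeneous_predictor_effects_pvals,
)

C = np.random.normal(size=(100, 2))  # contexts
X = np.random.normal(size=(100, 3))  # predictors
Y = np.random.normal(size=(100, 1))  # outcomes

model = ContextualizedRegressor(n_bootstraps=10)
model.fit(C, X, Y)

# Bootstrap-based significance of each estimated effect (assumed signatures).
context_pvals = calc_homogeneous_context_effects_pvals(model, C)
predictor_pvals = calc_homogeneous_predictor_effects_pvals(model, C)
heterogeneous_pvals = calc_heterogeneous_predictor_effects_pvals(model, C)
```
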
22 changes: 19 additions & 3 deletions contextualized/analysis/accuracy_split.py
@@ -1,8 +1,10 @@
"""
Utilities for post-hoc analysis of trained Contextualized models.
"""
from typing import *

import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score as roc


@@ -14,11 +16,25 @@ def get_roc(Y_true, Y_pred):
return np.nan


def print_acc_by_covars(Y_true, Y_pred, covar_df, **kwargs):
def print_acc_by_covars(
Y_true: np.ndarray, Y_pred: np.ndarray, covar_df: pd.DataFrame, **kwargs
) -> None:
"""
Prints Accuracy for different data splits with covariates.
Assume Y_true and Y_pred are np arrays.
Allows train_idx and test_idx as Boolean locators.

Args:
Y_true (np.ndarray): True labels.
Y_pred (np.ndarray): Predicted labels.
covar_df (pd.DataFrame): DataFrame of covariates.
max_classes (int, optional): Maximum number of classes to print. Defaults to 20.
covar_stds (np.ndarray, optional): Standard deviations of covariates. Defaults to None.
covar_means (np.ndarray, optional): Means of covariates. Defaults to None.
covar_encoders (List[LabelEncoder], optional): Encoders for covariates. Defaults to None.
train_idx (np.ndarray, optional): Boolean array indicating training data. Defaults to None.
test_idx (np.ndarray, optional): Boolean array indicating testing data. Defaults to None.

Returns:
None
"""
Y_true = np.squeeze(Y_true)
Y_pred = np.squeeze(Y_pred)
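
Based on the signature and keyword arguments documented above, a small hedged usage sketch; the toy labels, probabilities, and covariates below are illustrative only:

```python
# Illustrative only: print_acc_by_covars prints ROC-AUC for each covariate
# split, optionally separated into train/test via Boolean index arrays.
import numpy as np
import pandas as pd
from contextualized.analysis.accuracy_split import print_acc_by_covars

n = 200
Y_true = np.random.randint(0, 2, size=(n, 1))
Y_pred = np.random.uniform(size=(n, 1))  # predicted probabilities
covar_df = pd.DataFrame({
    "sex": np.random.choice(["F", "M"], n),
    "site": np.random.choice(["A", "B", "C"], n),
})
train_idx = np.zeros(n, dtype=bool)
train_idx[: n // 2] = True
test_idx = ~train_idx

print_acc_by_covars(
    Y_true, Y_pred, covar_df,
    train_idx=train_idx, test_idx=test_idx, max_classes=5,
)
```
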
4 changes: 3 additions & 1 deletion contextualized/analysis/bootstraps.py
@@ -1,5 +1,6 @@
# Utility functions for bootstraps


def select_good_bootstraps(sklearn_wrapper, train_errs, tol=2, **kwargs):
"""
Select bootstraps that are good for a given model.
@@ -19,5 +20,6 @@ def select_good_bootstraps(sklearn_wrapper, train_errs, tol=2, **kwargs):

train_errs_by_bootstrap = np.mean(train_errs, axis=(1, 2))
sklearn_wrapper.models = sklearn_wrapper.models[
train_errs_by_bootstrap < tol*np.min(train_errs_by_bootstrap)]
train_errs_by_bootstrap < tol * np.min(train_errs_by_bootstrap)
]
return sklearn_wrapper
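
The selection rule above keeps only the bootstraps whose mean training error is within `tol` times the best bootstrap's error. A hedged sketch of how it might be applied; the shape of `train_errs` (n_bootstraps, n_samples, n_outcomes) is inferred from the `axis=(1, 2)` reduction, and `individual_preds=True` is assumed to return per-bootstrap predictions:

```python
# Hedged sketch: prune high-error bootstraps from a fitted easy-wrapper model.
import numpy as np
from contextualized.analysis.bootstraps import select_good_bootstraps
from contextualized.easy import ContextualizedRegressor

C = np.random.normal(size=(100, 2))
X = np.random.normal(size=(100, 3))
Y = np.random.normal(size=(100, 1))

model = ContextualizedRegressor(n_bootstraps=20)
model.fit(C, X, Y)

# Per-bootstrap squared training errors, shape (n_bootstraps, n_samples, n_outcomes).
preds = model.predict(C, X, individual_preds=True)
train_errs = (preds - Y[None, ...]) ** 2

# Keep only bootstraps within 2x the best bootstrap's mean training error.
model = select_good_bootstraps(model, train_errs, tol=2)
```
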
147 changes: 106 additions & 41 deletions contextualized/analysis/effects.py
@@ -1,21 +1,29 @@
"""
Utilities for plotting effects learned by Contextualized models.
"""

from typing import *

import numpy as np
import matplotlib.pyplot as plt

from contextualized.easy.wrappers import SKLearnWrapper


def simple_plot(
x_vals,
y_vals,
x_vals: List[Union[float, int]],
y_vals: List[Union[float, int]],
**kwargs,
):
) -> None:
"""
Simple plotting of xs and ys with kwargs passed to mpl helpers.
:param x_vals:
:param y_vals:
Simple plotting of y vs x with kwargs passed to matplotlib helpers.

Args:
x_vals: x values to plot
y_vals: y values to plot
**kwargs: kwargs passed to matplotlib helpers (fill_alpha, fill_color, y_lowers, y_uppers, x_label, y_label, x_ticks, x_ticklabels, y_ticks, y_ticklabels)

Returns:
None
"""
plt.figure(figsize=kwargs.get("figsize", (8, 8)))
if "y_lowers" in kwargs and "y_uppers" in kwargs:
@@ -84,16 +92,25 @@ def plot_effect(x_vals, y_means, y_lowers=None, y_uppers=None, **kwargs):
)


def get_homogeneous_context_effects(model, C, **kwargs):
def get_homogeneous_context_effects(
model: SKLearnWrapper, C: np.ndarray, **kwargs
) -> Tuple[np.ndarray, np.ndarray]:
"""
Get the homogeneous (context-invariant) effects of context.
:param model:
:param C:

returns:
c_vis: the context values that were used to estimate the effects
effects: np array of effects, one for each context. Each homogeneous effect is a matrix of shape:
(n_bootstraps, n_context_vals, n_outcomes).
Args:
model (SKLearnWrapper): a fitted ``contextualized.easy`` model
C: the context values to use to estimate the effects
verbose (bool, optional): print progress. Default True.
individual_preds (bool, optional): whether to plot each bootstrap. Default True.
C_vis (np.ndarray, optional): Context bins used to visualize context (n_vis, n_contexts). Default None to construct anew.
n_vis (int, optional): Number of bins to use to visualize context. Default 1000.

Returns:
Tuple[np.ndarray, np.ndarray]:
c_vis: the context values that were used to estimate the effects
effects: array of effects, one for each context. Each homogeneous effect is a matrix of shape:
(n_bootstraps, n_context_vals, n_outcomes).
"""
if kwargs.get("verbose", True):
print("Estimating Homogeneous Contextual Effects.")
@@ -233,14 +250,32 @@ def plot_boolean_vars(names, y_mean, y_err, **kwargs):


def plot_homogeneous_context_effects(
model,
C,
model: SKLearnWrapper,
C: np.ndarray,
**kwargs,
):
) -> None:
"""
Plot the homogeneous (context-invariant) effects of context.
:param model:
:param C:
Plot the direct effect of context on outcomes, disregarding other features.
This context effect is homogeneous in that it is a static function of context (context-invariant).

Args:
model (SKLearnWrapper): a fitted ``contextualized.easy`` model
C: the context values to use to estimate the effects
verbose (bool, optional): print progress. Default True.
individual_preds (bool, optional): whether to plot each bootstrap. Default True.
C_vis (np.ndarray, optional): Context bins used to visualize context (n_vis, n_contexts). Default None to construct anew.
n_vis (int, optional): Number of bins to use to visualize context. Default 1000.
lower_pct (int, optional): Lower percentile for bootstraps. Default 2.5.
upper_pct (int, optional): Upper percentile for bootstraps. Default 97.5.
classification (bool, optional): Whether to exponentiate the effects. Default True.
C_encoders (List[sklearn.preprocessing.LabelEncoder], optional): encoders for each context. Default None.
C_means (np.ndarray, optional): means for each context. Default None.
C_stds (np.ndarray, optional): standard deviations for each context. Default None.
xlabel_prefix (str, optional): prefix for x label. Default "".
figname (str, optional): name of figure to save. Default None.

Returns:
None
"""
c_vis, effects = get_homogeneous_context_effects(model, C, **kwargs)
# effects.shape is (n_context, n_bootstraps, n_context_vals, n_outcomes)
@@ -283,16 +318,34 @@ def plot_homogeneous_predictor_effects(


def plot_homogeneous_predictor_effects(
model,
C,
X,
model: SKLearnWrapper,
C: np.ndarray,
X: np.ndarray,
**kwargs,
):
) -> None:
"""
Plot the homogeneous (context-invariant) effects of predictors.
:param model:
:param C:
:param X:
Plot the effect of predictors on outcomes that do not change with context (homogeneous).

Args:
model (SKLearnWrapper): a fitted ``contextualized.easy`` model
C: the context values to use to estimate the effects
X: the predictor values to use to estimate the effects
max_classes_for_discrete (int, optional): maximum number of classes to treat as discrete. Default 10.
min_effect_size (float, optional): minimum effect size to plot. Default 0.003.
ylabel (str, optional): y label for plot. Default "Influence of ".
xlabel_prefix (str, optional): prefix for x label. Default "".
X_names (List[str], optional): names of predictors. Default None.
X_encoders (List[sklearn.preprocessing.LabelEncoder], optional): encoders for each predictor. Default None.
X_means (np.ndarray, optional): means for each predictor. Default None.
X_stds (np.ndarray, optional): standard deviations for each predictor. Default None.
verbose (bool, optional): print progress. Default True.
lower_pct (int, optional): Lower percentile for bootstraps. Default 2.5.
upper_pct (int, optional): Upper percentile for bootstraps. Default 97.5.
classification (bool, optional): Whether to exponentiate the effects. Default True.
figname (str, optional): name of figure to save. Default None.

Returns:
None
"""
c_vis = np.zeros_like(C.values)
x_vis = make_grid_mat(X.values, 1000)
@@ -355,19 +408,31 @@ def plot_homogeneous_predictor_effects(

def plot_heterogeneous_predictor_effects(model, C, X, **kwargs):
"""
Plot the heterogeneous (context-dependent) effects of context.
:param model:
:param C:
:param X:
:param encoders:
:param C_means:
:param C_stds:
:param X_names:
:param ylabel: (Default value = "Influence of ")
:param min_effect_size: (Default value = 0.003)
:param n_vis: (Default value = 1000)
:param max_classes_for_discrete: (Default value = 10)

Plot how the effect of predictors on outcomes changes with context (heterogeneous).

Args:
model (SKLearnWrapper): a fitted ``contextualized.easy`` model
C: the context values to use to estimate the effects
X: the predictor values to use to estimate the effects
max_classes_for_discrete (int, optional): maximum number of classes to treat as discrete. Default 10.
min_effect_size (float, optional): minimum effect size to plot. Default 0.003.
y_prefix (str, optional): y prefix for plot. Default "Influence of ".
X_names (List[str], optional): names of predictors. Default None.
verbose (bool, optional): print progress. Default True.
individual_preds (bool, optional): whether to plot each bootstrap. Default True.
C_vis (np.ndarray, optional): Context bins used to visualize context (n_vis, n_contexts). Default None to construct anew.
n_vis (int, optional): Number of bins to use to visualize context. Default 1000.
lower_pct (int, optional): Lower percentile for bootstraps. Default 2.5.
upper_pct (int, optional): Upper percentile for bootstraps. Default 97.5.
classification (bool, optional): Whether to exponentiate the effects. Default True.
C_encoders (List[sklearn.preprocessing.LabelEncoder], optional): encoders for each context. Default None.
C_means (np.ndarray, optional): means for each context. Default None.
C_stds (np.ndarray, optional): standard deviations for each context. Default None.
xlabel_prefix (str, optional): prefix for x label. Default "".
figname (str, optional): name of figure to save. Default None.

Returns:
None
"""
c_vis = maybe_make_c_vis(C, **kwargs)
n_vis = c_vis.shape[0]
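
Taken together, the effects helpers documented above share one calling pattern: a fitted `contextualized.easy` model plus the raw context (and predictor) data. A hedged sketch follows; whether these helpers expect raw arrays or pandas DataFrames is not fully visible in this diff, so the DataFrame inputs and toy variable names below are assumptions:

```python
# Hedged sketch of the effects API documented above. Variable names and
# the DataFrame inputs are assumptions for illustration.
import numpy as np
import pandas as pd
from contextualized.easy import ContextualizedClassifier
from contextualized.analysis.effects import (
    get_homogeneous_context_effects,
    plot_homogeneous_context_effects,
    plot_homogeneous_predictor_effects,
    plot_heterogeneous_predictor_effects,
)

n = 500
C = pd.DataFrame({
    "age": np.random.uniform(20, 80, n),
    "bmi": np.random.uniform(18, 40, n),
})
X = pd.DataFrame({
    "biomarker_1": np.random.normal(size=n),
    "biomarker_2": np.random.normal(size=n),
})
Y = np.random.randint(0, 2, size=(n, 1))

model = ContextualizedClassifier(n_bootstraps=5)
model.fit(C.values, X.values, Y)

# Context values used for visualization plus per-bootstrap effect curves.
c_vis, effects = get_homogeneous_context_effects(model, C, n_vis=100)

# The plotting helpers take the fitted model and the raw C (and X) data.
plot_homogeneous_context_effects(model, C)
plot_homogeneous_predictor_effects(model, C, X)
plot_heterogeneous_predictor_effects(model, C, X, min_effect_size=0.01)
```
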
44 changes: 29 additions & 15 deletions contextualized/analysis/embeddings.py
@@ -1,26 +1,37 @@
"""
Utilities for plotting embeddings of fitted Contextualized models.
"""

from typing import *

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl

from contextualized.analysis import utils


def plot_embedding_for_all_covars(
reps, covars_df, covars_stds=None, covars_means=None, covars_encoders=None, **kwargs
):
reps: np.ndarray,
covars_df: pd.DataFrame,
covars_stds: np.ndarray = None,
covars_means: np.ndarray = None,
covars_encoders: List[Callable] = None,
**kwargs,
) -> None:
"""
Plot embeddings of representations for all covariates in a Pandas dataframe.
:param reps:
:param covars_df:
:param covars_stds: Used to project back to readable values. (Default value = None)
:param covars_means: Used to project back to readable values. (Default value = None)
:param covars_encoders: Used to project back to readable values. (Default value = None)
:param kwargs: Keyword arguments for plotting.

Args:
reps (np.ndarray): Embeddings of shape (n_samples, n_dims).
covars_df (pd.DataFrame): DataFrame of covariates.
covars_stds (np.ndarray, optional): Standard deviations of covariates. Defaults to None.
covars_means (np.ndarray, optional): Means of covariates. Defaults to None.
covars_encoders (List[LabelEncoder], optional): Encoders for covariates. Defaults to None.
kwargs: Keyword arguments for plotting.

Returns:
None
"""
for i, covar in enumerate(covars_df.columns):
my_labels = covars_df.iloc[:, i].values
@@ -49,17 +60,20 @@ def plot_embedding_for_all_covars(


def plot_lowdim_rep(
low_dim,
labels,
low_dim: np.ndarray,
labels: np.ndarray,
**kwargs,
):
"""
Plot a low-dimensional representation of a dataset.

:param low_dim:
:param labels:
:param kwargs:
Keyword arguments.
Args:
low_dim (np.ndarray): Low-dimensional representation of shape (n_samples, 2).
labels (np.ndarray): Labels of shape (n_samples,).
kwargs: Keyword arguments for plotting.

Returns:
None
"""

if len(set(labels)) < kwargs.get("max_classes_for_discrete", 10): # discrete labels
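
A hedged sketch of the embedding plots documented above. How the 2-D representation `reps` is obtained is up to the user; the PCA projection of predicted coefficients below is an illustration rather than the library's prescribed recipe, and `predict_params` returning per-sample coefficients is an assumption:

```python
# Hedged sketch: visualize a 2-D embedding of per-sample model parameters
# against covariates. The PCA-on-coefficients step is an assumption.
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from contextualized.easy import ContextualizedRegressor
from contextualized.analysis.embeddings import (
    plot_embedding_for_all_covars,
    plot_lowdim_rep,
)

n = 200
C = np.random.normal(size=(n, 4))
X = np.random.normal(size=(n, 3))
Y = np.random.normal(size=(n, 1))
covars_df = pd.DataFrame({"age": np.random.randint(20, 80, n)})

model = ContextualizedRegressor(n_bootstraps=3)
model.fit(C, X, Y)

# Per-sample regression coefficients, flattened and projected to 2-D.
betas, _ = model.predict_params(C, individual_preds=False)
reps = PCA(n_components=2).fit_transform(betas.reshape(n, -1))

plot_embedding_for_all_covars(reps, covars_df)
plot_lowdim_rep(reps, covars_df["age"].values)
```
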