From e77400c49bdd9f570e9add7441dfcb18fdc922cd Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Thu, 16 Jan 2025 21:42:48 -0500 Subject: [PATCH 01/18] Add rough draft template nb --- .../04-make-explain-predictions-TEMPLATE.py | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 notebooks/pdp/04-make-explain-predictions-TEMPLATE.py diff --git a/notebooks/pdp/04-make-explain-predictions-TEMPLATE.py b/notebooks/pdp/04-make-explain-predictions-TEMPLATE.py new file mode 100644 index 00000000..fd0c29cc --- /dev/null +++ b/notebooks/pdp/04-make-explain-predictions-TEMPLATE.py @@ -0,0 +1,325 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # SST Make and Explain Predictions: [SCHOOL] +# MAGIC +# MAGIC Fourth step in the process of transforming raw (PDP) data into actionable, data-driven insights for advisors: generate predictions and feature importances for new (unlabeled) data. +# MAGIC +# MAGIC ### References +# MAGIC +# MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process) +# MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html) +# MAGIC - [SCHOOL WEBSITE](https://example.com) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # setup + +# COMMAND ---------- + +# MAGIC %sh python --version + +# COMMAND ---------- + +# install dependencies, most of which should come through our 1st-party SST package +# %pip install "student-success-tool==0.1.0" --no-deps +# %pip install git+https://github.com/datakind/student-success-tool.git@develop --no-deps +# %pip install pandera + +# COMMAND ---------- + +# MAGIC %restart_python + +# COMMAND ---------- + +import functools as ft +import logging +import typing as t + +import mlflow +import numpy as np +import pandas as pd +import shap +import sklearn.inspection +import sklearn.metrics +from databricks.connect import DatabricksSession + +# from databricks.sdk.runtime import dbutils +# from py4j.protocol import Py4JJavaError +# from pyspark import SparkContext +# from pyspark.sql import SparkSession +from pyspark.sql.types import FloatType, StringType, StructField, StructType + +from student_success_tool.analysis.pdp import dataio +from student_success_tool.modeling import inference, utils + +# COMMAND ---------- + +logging.getLogger("root").setLevel(logging.INFO) +logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger + +try: + spark_session = DatabricksSession.builder.getOrCreate() +except Exception: + logging.warning("unable to create spark session; are you in a Databricks runtime?") + pass + +# COMMAND ---------- + +# Databricks logs every instance that uses sklearn or other modelling libraries +# to MLFlow experiments... 
which we don't want +mlflow.autolog(disable=True) +mlflow.sklearn.autolog(disable=True) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## configuration + +# COMMAND ---------- + +# TODO TODO TODO: use project config + +train_sample_size = 100 +validate_sample_size = 100 + +institution_id = "INSTITUTION_ID" +best_model_run_id = "BEST_MODEL_RUN_ID" +student_id_col = "student_guid" +target_col = "target" +split_col = "split" +sample_weight_col = "sample_weight" +pos_label = True + +model_type = "sklearn" +labeled_data_path = "CATALOG.SCHEMA.TABLE_NAME" # "TODO" +unlabeled_data_path = None + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # load model and data + +# COMMAND ---------- + + +# TODO: move this into sst package +def mlflow_load_model(model_uri: str, model_type: str): + load_model_func = ( + mlflow.sklearn.load_model + if model_type == "sklearn" + else mlflow.xgboost.load_model + if model_type == "xgboost" + else mlflow.lightgbm.load_model + if model_type == "lightgbm" + else mlflow.pyfunc.load_model + ) + model = load_model_func(f"runs:/{best_model_run_id}/model") + logging.info("mlflow '%s' model loaded from '%s'", model_type, model_uri) + return model + + +def predict_proba( + df: pd.DataFrame, *, model, pos_label: bool | str = True +) -> pd.Series: + return pd.Series( + model.predict_proba(df)[:, model.classes_.tolist().index(pos_label)] + ) + + +# COMMAND ---------- + +model = mlflow_load_model(f"runs:/{best_model_run_id}/model", model_type) +model_features = model.named_steps["column_selector"].get_params()["cols"] +logging.info( + "model uses %s features: %s", len(model_features), ", ".join(model_features) +) + +# COMMAND ---------- + +model_features = model.named_steps["column_selector"].get_params()["cols"] +print(len(model_features)) + +# COMMAND ---------- + +df_labeled = dataio.read_data_from_delta_table( + labeled_data_path, spark_session=spark_session +) +print(df_labeled.shape) +df_labeled.head() + +# COMMAND ---------- + +if unlabeled_data_path: + df_unlabeled = dataio.read_data_from_delta_table( + unlabeled_data_path, spark_session=spark_session + ) +else: + df_unlabeled = df_labeled.loc[df_labeled[split_col].eq("test"), :].drop( + columns=target_col + ) +print(df_unlabeled.shape) +df_unlabeled.head() + +# COMMAND ---------- + +pred_probs = predict_proba(df_unlabeled, model=model) +pred_probs.describe() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # initialize SHAP explainer + +# COMMAND ---------- + +df_train = df_labeled.loc[df_labeled[split_col].eq("train"), :] +# SHAP can't explain models using data with nulls +# so, impute nulls using the mode (most frequent values) +mode = df_train.mode().iloc[0] +# sample background data for SHAP Explainer +train_sample = ( + df_train.sample(n=min(train_sample_size, df_train.shape[0]), random_state=1) + .fillna(mode) + .loc[:, model_features] +) +train_sample + +# COMMAND ---------- + + +def predict_proba_v3( + X, + *, + model, + col_names: t.Optional[list[str]] = None, + pos_label: t.Optional[bool | str] = None, +) -> np.ndarray: + if col_names is None: + col_names = model.named_steps["column_selector"].get_params()["cols"] + pred_probs = model.predict_proba(pd.DataFrame(data=X, columns=col_names)) + if pos_label is not None: + return pred_probs[:, model.classes_.tolist().index(pos_label)] + else: + return pred_probs + + +def predict_proba_v2(X, *, model, pos_label: bool | str = True): + model_features = model.named_steps["column_selector"].get_params()["cols"] + pred_probs = model.predict_proba(pd.DataFrame(data=X, 
columns=model_features)) + return pred_probs[:, model.classes_.tolist().index(pos_label)] + + +# COMMAND ---------- + +# import shap +# import sklearn + +# X, y = shap.datasets.adult() +# m = sklearn.linear_model.LogisticRegression().fit(X, y) +# explainer = shap.explainers.Permutation(m.predict_proba, X) +# shap_values = explainer(X[:100]) +# shap.plots.bar(shap_values[..., 1]) + +# COMMAND ---------- + +# explainer = shap.explainers.Permutation(model.predict_proba, train_sample) +# explainer = shap.explainers.KernelExplainer(model.predict_proba, train_sample) +explainer = shap.explainers.KernelExplainer( + ft.partial( + predict_proba_v3, model=model, col_names=model_features, pos_label=pos_label + ), + train_sample, + link="identity", +) +explainer + +# COMMAND ---------- + +shap_schema = StructType( + [StructField(student_id_col, StringType(), nullable=False)] + + [StructField(col, FloatType(), nullable=False) for col in model_features] +) + +df_shap_values = ( + spark.createDataFrame(df_unlabeled.drop(columns=[split_col, sample_weight_col])) # noqa: F821 + .repartition(sc.defaultParallelism) # noqa: F821 + .mapInPandas( + ft.partial( + inference.calculate_shap_values_spark_udf, + student_id_col=student_id_col, + model_features=model_features, + explainer=explainer, + mode=mode, + ), + schema=shap_schema, + ) + .toPandas() + .set_index(student_id_col) + .reindex(df_unlabeled[student_id_col]) + .reset_index(drop=False) +) +df_shap_values + +# COMMAND ---------- + +shap.summary_plot( + df_shap_values[model_features].to_numpy(), + df_unlabeled[model_features], + class_names=model.classes_, + # show=False, ??? +) + +# COMMAND ---------- + +features_table = utils.load_features_table("assets/pdp/features_table.toml") +result = inference.select_top_features_for_display( + df_unlabeled.loc[:, model_features], + df_unlabeled[student_id_col], + pred_probs, + df_shap_values[model_features].to_numpy(), + n_features=5, + features_table=features_table, + needs_support_threshold_prob=0.5, +) +result + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## TODO: +# MAGIC +# MAGIC - save plots and results in a nice form in a place that makes sense + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## haxx + +# COMMAND ---------- + + +result = sklearn.inspection.permutation_importance( + model, + train_sample.drop(columns=target_col), + train_sample[target_col], + scoring=sklearn.metrics.make_scorer( + sklearn.metrics.log_loss, greater_is_better=False + ), + n_repeats=10, +) + +# COMMAND ---------- + +sorted_importances_idx = result.importances_mean.argsort() +importances = pd.DataFrame( + result.importances[sorted_importances_idx].T, + columns=train_sample.columns[sorted_importances_idx], +) +ax = importances.plot.box(vert=False, whis=10, figsize=(10, 10)) +ax.set_title("Permutation Importances (test set)") +ax.axvline(x=0, color="k", linestyle="--") +ax.set_xlabel("Decrease in accuracy score") +ax.figure.tight_layout() + +# COMMAND ---------- From 863a075fb225967d13488d989d5f6eedec9a136f Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Thu, 16 Jan 2025 22:11:40 -0500 Subject: [PATCH 02/18] Add load mlflow model util func --- src/student_success_tool/modeling/utils.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/student_success_tool/modeling/utils.py b/src/student_success_tool/modeling/utils.py index c626bbb6..88fe1aad 100644 --- a/src/student_success_tool/modeling/utils.py +++ b/src/student_success_tool/modeling/utils.py @@ -3,6 +3,7 @@ import typing as t from collections.abc 
import Sequence +import mlflow import numpy as np import pandas as pd import sklearn.utils @@ -105,3 +106,32 @@ def load_features_table(rel_fpath: str) -> dict[str, dict[str, str]]: LOGGER.info("loaded features table from '%s'", file_path) assert isinstance(features_table, dict) # type guard return features_table + + +def load_mlflow_model( + model_uri: str, + model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None, +) -> object: + """ + Load a (registered) MLFlow model of whichever model type from a specified URI. + + Args: + model_uri + model_type + + References: + - https://mlflow.org/docs/latest/python_api/mlflow.sklearn.html#mlflow.sklearn.load_model + - https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.load_model + """ + load_model_func = ( + mlflow.sklearn.load_model + if model_type == "sklearn" + else mlflow.xgboost.load_model + if model_type == "xgboost" + else mlflow.lightgbm.load_model + if model_type == "lightgbm" + else mlflow.pyfunc.load_model + ) + model = load_model_func(model_uri) + LOGGER.info("mlflow model loaded from '%s'", model_uri) + return model From 26dfc8e144ba4310ce654932bfc1f3aa6b6958a4 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 14:59:34 -0500 Subject: [PATCH 03/18] Add better (v2) pdp project config --- src/student_success_tool/configs/__init__.py | 1 + .../configs/schemas/pdp_v2.py | 194 ++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 src/student_success_tool/configs/schemas/pdp_v2.py diff --git a/src/student_success_tool/configs/__init__.py b/src/student_success_tool/configs/__init__.py index 1acfd3fd..88bc3e61 100644 --- a/src/student_success_tool/configs/__init__.py +++ b/src/student_success_tool/configs/__init__.py @@ -1,2 +1,3 @@ from .load import load_config from .schemas.pdp import PDPProjectConfig +from .schemas.pdp_v2 import PDPProjectConfigV2 diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py new file mode 100644 index 00000000..d2278015 --- /dev/null +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -0,0 +1,194 @@ +import typing as t + +import pydantic as pyd + +from ...analysis.pdp import constants + + +class FeaturesConfig(pyd.BaseModel): + min_passing_grade: float = pyd.Field( + default=constants.DEFAULT_MIN_PASSING_GRADE, + description="Minimum numeric grade considered by institution as 'passing'", + gt=0.0, + lt=4.0, + ) + min_num_credits_full_time: float = pyd.Field( + default=constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME, + description=( + "Minimum number of credits *attempted* per term for a student's " + "enrollment intensity to be considered 'full-time'." 
+        ),
+        gt=0.0,
+        lt=20.0,
+    )
+    course_level_pattern: str = pyd.Field(
+        default=constants.DEFAULT_COURSE_LEVEL_PATTERN,
+        description=(
+            "Regular expression pattern that extracts a course's 'level' "
+            "from a PDP course_number field"
+        ),
+    )
+    peak_covid_terms: set[tuple[str, str]] = pyd.Field(
+        default=constants.DEFAULT_PEAK_COVID_TERMS,
+        description=(
+            "Set of (academic year, academic term) pairs considered by institution "
+            "as 'peak' COVID, for use in control variables to account for pandemic effects"
+        ),
+    )
+    key_course_subject_areas: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description=(
+            "One or more course subject areas (formatted as 2-digit CIP codes) "
+            "for which custom features should be computed"
+        ),
+    )
+    key_course_ids: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description=(
+            "One or more course ids (formatted as '[COURSE_PREFIX][COURSE_NUMBER]') "
+            "for which custom features should be computed"
+        ),
+    )
+
+
+class TargetConfig(pyd.BaseModel):
+    student_criteria: dict[str, object] = pyd.Field(
+        default_factory=dict,
+        description=(
+            "Column name in modeling dataset mapped to one or more values that it must equal "
+            "in order for the corresponding student to be considered 'eligible'. "
+            "Multiple criteria are combined with a logical 'AND'."
+        ),
+    )
+    # TODO: refine target functionality and expand on this configuration
+
+
+class PreprocessingConfig(pyd.BaseModel):
+    features: FeaturesConfig
+    target: TargetConfig
+    splits: dict[t.Literal["train", "test", "validate"], float] = pyd.Field(
+        default={"train": 0.6, "test": 0.2, "validate": 0.2},
+        description=(
+            "Mapping of name to fraction of the full dataset belonging to a given 'split', "
+            "which is a randomized subset used for different parts of the modeling process"
+        ),
+    )
+    sample_class_weight: t.Optional[t.Literal["balanced"] | dict[object, int]] = (
+        pyd.Field(
+            default=None,
+            description=(
+                "Weights associated with classes in the form ``{class_label: weight}`` "
+                "or 'balanced' to automatically adjust weights inversely proportional "
+                "to class frequencies in the input data. "
+                "If null (default), then sample weights are not computed."
+            ),
+        )
+    )
+
+    @pyd.field_validator("splits", mode="after")
+    @classmethod
+    def check_split_fractions(cls, value: dict) -> dict:
+        if (sum_fracs := sum(value.values())) != 1.0:
+            raise pyd.ValidationError(
+                f"split fractions must sum up to 1.0, but input sums up to {sum_fracs}"
+            )
+        return value
+
+
+class TrainingConfig(pyd.BaseModel):
+    """
+    References:
+    - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify
+    """
+
+    student_group_cols: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description=(
+            "One or more column names in dataset containing student 'groups' "
+            "to use for model bias assessment, but NOT as model features"
+        ),
+    )
+    exclude_cols: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description="One or more column names in dataset to exclude from training.",
+    )
+    time_col: t.Optional[str] = pyd.Field(
+        default=None,
+        description=(
+            "Column name in dataset used to split train/test/validate sets chronologically, "
+            "as an alternative to the randomized assignment in ``split_col``."
+ ), + ) + exclude_frameworks: t.Optional[list[str]] = pyd.Field( + default=None, + description="List of algorithm frameworks that AutoML excludes from training.", + ) + primary_metric: str = pyd.Field( + default="log_loss", + description="Metric used to evaluate and rank model performance.", + ) + timeout_minutes: t.Optional[int] = pyd.Field( + default=None, + description="Maximum time to wait for AutoML trials to complete.", + ) + + +class InferenceConfig(pyd.BaseModel): + num_top_features: int = pyd.Field(default=5) + + +class DatasetConfig(pyd.BaseModel): + table_path: t.Optional[str] = pyd.Field( + ..., + description=( + "Path to a table in Unity Catalog where dataset is stored, " + "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'" + ), + ) + file_path: t.Optional[str] = pyd.Field( + ..., + description="Full, absolute path to dataset on disk, e.g. a Databricks Volume", + ) + # TODO: if/when we allow different file formats, add this parameter ... + # file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None) + + +class DatasetsConfig(pyd.BaseModel): + raw: DatasetConfig + preprocessed: t.Optional[DatasetConfig] + predictions: t.Optional[DatasetConfig] + + +class TrainedModelConfig(pyd.BaseModel): + experiment_id: str + run_id: str + model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None + min_prob_pos_label: t.Optional[float] = 0.5 + + +class PDPProjectConfigV2(pyd.BaseModel): + """Configuration (v2) schema for PDP SST projects.""" + + institution_id: str + institution_name: str + + # shared dataset parameters + student_id_col: str = "student_guid" + target_col: str = "target" + split_col: str = "split" + sample_weight_col: t.Optional[str] = None + pos_label: t.Optional[int | bool | str] = True + pred_col: str = "pred" + pred_prob_col: str = "pred_prob" + # other shared parameters + random_state: t.Optional[int] = None + + labeled_dataset: DatasetsConfig + trained_model: t.Optional[TrainedModelConfig] = None + + preprocessing: t.Optional[PreprocessingConfig] = None + training: t.Optional[TrainingConfig] = None + inference: t.Optional[InferenceConfig] = None + + # NOTE: this is for *pydantic* model -- not ML model -- configuration + model_config = pyd.ConfigDict(extra="ignore", strict=True) From a2d007cc8507dd02d4f5d4f0794a42d37ba8bc10 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 15:21:05 -0500 Subject: [PATCH 04/18] Shuffle modeling config around --- .../configs/schemas/pdp_v2.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index d2278015..9ab05734 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -95,19 +95,25 @@ def check_split_fractions(cls, value: dict) -> dict: return value +class FeatureSelectionConfig(pyd.BaseModel): + """ + See Also: + - :func:`modeling.feature_selection.select_features()` + """ + + non_feature_cols: t.Optional[list[str]] = None + force_include_cols: t.Optional[list[str]] = None + incomplete_threshold: float = 0.5 + low_variance_threshold: float = 0.0 + collinear_threshold: t.Optional[float] = 10.0 + + class TrainingConfig(pyd.BaseModel): """ References: - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify """ - student_group_cols: t.Optional[list[str]] = pyd.Field( - default=None, - description=( - "One or more column names in 
dataset containing student 'groups' " - "to use for model bias assessment, but NOT as model features" - ), - ) exclude_cols: t.Optional[list[str]] = pyd.Field( default=None, description="One or more column names in dataset to exclude from training.", @@ -133,6 +139,11 @@ class TrainingConfig(pyd.BaseModel): ) +class ModelingConfig(pyd.BaseModel): + feature_selection: t.Optional[FeatureSelectionConfig] = None + training: TrainingConfig + + class InferenceConfig(pyd.BaseModel): num_top_features: int = pyd.Field(default=5) @@ -177,6 +188,13 @@ class PDPProjectConfigV2(pyd.BaseModel): target_col: str = "target" split_col: str = "split" sample_weight_col: t.Optional[str] = None + student_group_cols: t.Optional[list[str]] = pyd.Field( + default=None, + description=( + "One or more column names in datasets containing student 'groups' " + "to use for model bias assessment, but NOT as model features" + ), + ) pos_label: t.Optional[int | bool | str] = True pred_col: str = "pred" pred_prob_col: str = "pred_prob" @@ -187,7 +205,7 @@ class PDPProjectConfigV2(pyd.BaseModel): trained_model: t.Optional[TrainedModelConfig] = None preprocessing: t.Optional[PreprocessingConfig] = None - training: t.Optional[TrainingConfig] = None + modeling: t.Optional[ModelingConfig] = None inference: t.Optional[InferenceConfig] = None # NOTE: this is for *pydantic* model -- not ML model -- configuration From 4a5391a281a2ad186a5ee3881b08d61448900fb5 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 16:13:35 -0500 Subject: [PATCH 05/18] Refine v2 project config --- .../configs/schemas/pdp_v2.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 9ab05734..d3e8ebc2 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -146,28 +146,35 @@ class ModelingConfig(pyd.BaseModel): class InferenceConfig(pyd.BaseModel): num_top_features: int = pyd.Field(default=5) + # TODO: extend this configuration, maybe? class DatasetConfig(pyd.BaseModel): table_path: t.Optional[str] = pyd.Field( - ..., + default=None, description=( "Path to a table in Unity Catalog where dataset is stored, " "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'" ), ) file_path: t.Optional[str] = pyd.Field( - ..., + default=None, description="Full, absolute path to dataset on disk, e.g. a Databricks Volume", ) # TODO: if/when we allow different file formats, add this parameter ... 
# file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None) + @pyd.model_validator(mode="after") + def check_some_nonnull_inputs(self): + if self.table_path is None and self.file_path is None: + raise pyd.ValidationError("table_path and/or file_path must be non-null") + return self + class DatasetsConfig(pyd.BaseModel): raw: DatasetConfig - preprocessed: t.Optional[DatasetConfig] - predictions: t.Optional[DatasetConfig] + preprocessed: t.Optional[DatasetConfig] = None + predictions: t.Optional[DatasetConfig] = None class TrainedModelConfig(pyd.BaseModel): @@ -176,6 +183,11 @@ class TrainedModelConfig(pyd.BaseModel): model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None min_prob_pos_label: t.Optional[float] = 0.5 + @pyd.computed_field # type: ignore[misc] + @property + def mlflow_model_uri(self) -> str: + return f"runs:/{self.run_id}/model" + class PDPProjectConfigV2(pyd.BaseModel): """Configuration (v2) schema for PDP SST projects.""" @@ -192,16 +204,16 @@ class PDPProjectConfigV2(pyd.BaseModel): default=None, description=( "One or more column names in datasets containing student 'groups' " - "to use for model bias assessment, but NOT as model features" + "to use for model bias assessment, but *not* as model features" ), ) - pos_label: t.Optional[int | bool | str] = True pred_col: str = "pred" pred_prob_col: str = "pred_prob" + pos_label: t.Optional[int | bool | str] = True # other shared parameters random_state: t.Optional[int] = None - labeled_dataset: DatasetsConfig + labeled_dataset: t.Optional[DatasetsConfig] = None trained_model: t.Optional[TrainedModelConfig] = None preprocessing: t.Optional[PreprocessingConfig] = None From 00dbc1422e9c3ef8a6e7508b1a80948599cf7619 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 16:43:54 -0500 Subject: [PATCH 06/18] Add proj config v2 template --- notebooks/pdp/config-v2-TEMPLATE.toml | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 notebooks/pdp/config-v2-TEMPLATE.toml diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml new file mode 100644 index 00000000..fa4ddfb5 --- /dev/null +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -0,0 +1,52 @@ +institution_id = "INST_ID" +institution_name = "INST_NAME" + +student_id_col = "student_guid" +target_col = "target" +split_col = "split" +sample_weight_col = "sample_weight" +student_group_cols = ["student_age", "race", "ethnicity", "gender", "first_gen"] +pred_col = "pred" +pred_prob_col = "pred_prob" +pos_label = true +random_state = 12345 + +[labeled_dataset.raw] +file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME.csv" + +[preprocessing] +splits = { train = 0.6, test = 0.2, validate = 0.2 } +sample_class_weight = "balanced" + +[preprocessing.features] +min_passing_grade = 1.0 +min_num_credits_full_time = 12 +course_level_pattern = 'asdf' +key_course_subject_areas = ["24", "51"] +key_course_ids = ["ENGL101", "MATH101"] + +[preprocessing.target.student_criteria] +enrollment_type = "FIRST-TIME" +credential_type_sought_year_1 = "Bachelor's Degree" + +[labeled_dataset.preprocessed] +table_path = "CATALOG.SCHEMA.TABLE_NAME" + +[modeling.feature_selection] +incomplete_threshold = 0.5 +low_variance_threshold = 0.0 +collinear_threshold = 10.0 + +[modeling.training] +# exclude_frameworks = ["xgboost", "lightgbm"] +primary_metric = "log_loss" +timeout_minutes = 10 + +[trained_model] +experiment_id = "EXPERIMENT_ID" +run_id = "RUN_ID" +# model_type = 
"sklearn" +min_prob_pos_label = 0.5 + +[inference] +num_top_features = 5 From 7b08eb730c11c011f7758333d6c9640b9eb22d81 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sun, 19 Jan 2025 14:08:21 -0500 Subject: [PATCH 07/18] Add default student group cols these will *probably* be correct for all pdp schools --- src/student_success_tool/configs/schemas/pdp_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index d3e8ebc2..987cc36c 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -201,7 +201,7 @@ class PDPProjectConfigV2(pyd.BaseModel): split_col: str = "split" sample_weight_col: t.Optional[str] = None student_group_cols: t.Optional[list[str]] = pyd.Field( - default=None, + default=["student_age", "race", "ethnicity", "gender", "first_gen"], description=( "One or more column names in datasets containing student 'groups' " "to use for model bias assessment, but *not* as model features" From e46df479d9be94facfc5eb39eaad674fbf402411 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sun, 19 Jan 2025 14:27:32 -0500 Subject: [PATCH 08/18] Split raw dataset cfg into course+cohort --- notebooks/pdp/config-v2-TEMPLATE.toml | 7 +++++-- src/student_success_tool/configs/schemas/pdp_v2.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml index fa4ddfb5..0601f781 100644 --- a/notebooks/pdp/config-v2-TEMPLATE.toml +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -11,8 +11,11 @@ pred_prob_col = "pred_prob" pos_label = true random_state = 12345 -[labeled_dataset.raw] -file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME.csv" +[labeled_dataset.raw_course] +file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" + +[labeled_dataset.raw_cohort] +file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" [preprocessing] splits = { train = 0.6, test = 0.2, validate = 0.2 } diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 987cc36c..7ab639ad 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -172,7 +172,8 @@ def check_some_nonnull_inputs(self): class DatasetsConfig(pyd.BaseModel): - raw: DatasetConfig + raw_course: DatasetConfig + raw_cohort: DatasetConfig preprocessed: t.Optional[DatasetConfig] = None predictions: t.Optional[DatasetConfig] = None From 343083f38dd86bc1c29099c3d5a58b9e02a45097 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sun, 19 Jan 2025 15:50:10 -0500 Subject: [PATCH 09/18] fix: Allow nullable raw cohort column values amazing that nobody has caught this issue yet --- src/student_success_tool/analysis/pdp/schemas/raw_cohort.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py b/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py index 518731d5..6b4c3455 100644 --- a/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py +++ b/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py @@ -74,7 +74,9 @@ class RawPDPCohortDataSchema(pda.DataFrameModel): dtype_kwargs={"categories": ["FIRST-TIME", "RE-ADMIT", "TRANSFER-IN"]}, ) # NOTE: categories set in a 
parser, which forces "UK" / "UNKNOWN" values to null - enrollment_intensity_first_term: pt.Series[pd.CategoricalDtype] = pda.Field() + enrollment_intensity_first_term: pt.Series[pd.CategoricalDtype] = pda.Field( + nullable=True + ) # NOTE: categories set in a parser, which forces "UK" values to null math_placement: pt.Series[pd.CategoricalDtype] = pda.Field(nullable=True) # NOTE: categories set in a parser, which forces "UK" values to null From 9f1dd5cc9e498abb25b14e0d712ba34a31e122e6 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sun, 19 Jan 2025 21:14:12 +0000 Subject: [PATCH 10/18] Update 01-template nb w/ configs and fixes --- .../pdp/01-data-assessment-eda-TEMPLATE.py | 119 +++++++++++------- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index fe1ec743..f2c299dc 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -23,8 +23,13 @@ # COMMAND ---------- -# install dependencies, most of which should come through our 1st-party SST package -# %pip install git+https://github.com/datakind/student-success-tool.git@develop +# install dependencies, most/all of which should come through our 1st-party SST package +# NOTE: it's okay to use 'develop' or a feature branch while developing this nb +# but when it's finished, it's best to pin to a specific version of the package +# %pip install "student-success-tool == 0.1.0" +# %pip install "git+https://github.com/datakind/student-success-tool.git@develop" + +%pip install "git+https://github.com/datakind/student-success-tool.git@pdp-add-inference-nb-template" # COMMAND ---------- @@ -45,6 +50,7 @@ from databricks.sdk.runtime import dbutils from student_success_tool.analysis import pdp +from student_success_tool import configs # COMMAND ---------- @@ -52,7 +58,7 @@ logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger try: - spark_session = DatabricksSession.builder.getOrCreate() + spark = DatabricksSession.builder.getOrCreate() except Exception: logging.warning("unable to create spark session; are you in a Databricks runtime?") pass @@ -60,7 +66,7 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## `student-success-intervention` hacks +# MAGIC ## import school-specific code # COMMAND ---------- @@ -69,33 +75,21 @@ # COMMAND ---------- -# HACK: insert our 1st-party (school-specific) code into PATH +# insert our 1st-party (school-specific) code into PATH if "../" not in sys.path: sys.path.insert(1, "../") -# TODO: specify school's subpackage +# TODO: specify school's subpackage here from analysis import * # noqa: F403 # COMMAND ---------- -# MAGIC %md -# MAGIC ## unity catalog config - -# COMMAND ---------- - -catalog = "sst_dev" - -# configure where data is to be read from / written to -inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog -read_schema = f"{inst_name}_bronze" -write_schema = f"{inst_name}_silver" - -path_volume = os.path.join( - "/Volumes", catalog, read_schema, f"{inst_name}_bronze_file_volume" -) -path_table = f"{catalog}.{read_schema}" -print(f"{path_table=}") -print(f"{path_volume=}") +# project configuration should be stored in a config file in TOML format +# it'll start out with just basic info: institution_id, institution_name +# but as each step of the pipeline gets built, more parameters will be moved +# from hard-coded notebook variables to shareable, persistent config fields +cfg = 
configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2) +cfg # COMMAND ---------- @@ -109,14 +103,17 @@ # COMMAND ---------- -# TODO: fill in school's name; may not be same as in the schemas above -fpath_course = os.path.join(path_volume, "SCHOOL_COURSE_AR_DEID_DTTM.csv") +try: + raw_course_file_path = cfg.labeled_dataset.raw_course.file_path +except AttributeError: + # TODO: fill in the actual path to school's raw course file + raw_course_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COURSE_AR_DEID_DTTM.csv" # COMMAND ---------- # read without any schema validation, so we can look at the data "raw" df_course_raw = pdp.dataio.read_raw_pdp_course_data_from_file( - fpath_course, schema=None, dttm_format="%Y%m%d.0" + raw_course_file_path, schema=None, dttm_format="%Y%m%d.0" ) print(f"rows x cols = {df_course_raw.shape}") df_course_raw.head() @@ -127,6 +124,10 @@ # COMMAND ---------- +df_course_raw["course_begin_date"].describe() + +# COMMAND ---------- + # MAGIC %md # MAGIC Quick checks: # MAGIC - [ ] data exists where it should @@ -137,7 +138,7 @@ # try to read data while validating with the "base" PDP schema df_course = pdp.dataio.read_raw_pdp_course_data_from_file( - fpath_course, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0" + raw_course_file_path, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0" ) df_course @@ -199,7 +200,7 @@ # MAGIC ``` # MAGIC # MAGIC At this point, `df_course` should be a properly validated and parsed data frame, ready for exploratory data analysis. - +# MAGIC # COMMAND ---------- @@ -208,14 +209,18 @@ # COMMAND ---------- - -# TODO: fill in school's name; may not be same as in the schemas above -fpath_cohort = os.path.join(path_volume, "SCHOOL_COHORT_AR_DEID_DTTM.csv") +try: + raw_cohort_file_path = cfg.labeled_dataset.raw_cohort.file_path +except AttributeError: + # TODO: fill in the actual path to school's raw course file + raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv" # COMMAND ---------- # read without any schema validation, so we can look at the data "raw" -df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file(fpath_cohort, schema=None) +df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file( + raw_cohort_file_path, schema=None +) print(f"rows x cols = {df_cohort_raw.shape}") df_cohort_raw.head() @@ -223,7 +228,7 @@ # try to read data while validating with the "base" PDP schema df_cohort = pdp.dataio.read_raw_pdp_cohort_data_from_file( - fpath_cohort, schema=pdp.schemas.base.RawPDPCohortDataSchema + raw_cohort_file_path, schema=pdp.schemas.RawPDPCohortDataSchema ) df_cohort @@ -242,22 +247,31 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## save validated data +# MAGIC ## HEY, STOP HERE! + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Before continuing on to EDA, now's a great time to do a couple things: +# MAGIC +# MAGIC - Copy any school-specific raw dataset schemas into a `schemas.py` file in the current working directory +# MAGIC - Copy any school-specific preprocessing functions needed to coerce the raw data into a standardized form into a `dataio.py` file in the current working directory +# MAGIC - **Optional:** If you want easy access to outputs from every (sub-)step of the data transformation pipeline, save the validated datasets into this school's "silver" schema in Unity Catalog. 
# COMMAND ---------- pdp.dataio.write_data_to_delta_table( df_course, - f"{catalog}.{write_schema}.course_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.course_dataset_validated", + spark_session=spark, ) # COMMAND ---------- pdp.dataio.write_data_to_delta_table( df_cohort, - f"{catalog}.{write_schema}.cohort_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.cohort_dataset_validated", + spark_session=spark, ) # COMMAND ---------- @@ -269,7 +283,7 @@ # MAGIC %md # MAGIC %md -# MAGIC ## read validated data +# MAGIC ## read validated data? # MAGIC # MAGIC (so you don't have to execute the validation process more than once) @@ -278,8 +292,8 @@ # use base or school-specific schema, as needed df_course = pdp.schemas.RawPDPCourseDataSchema( pdp.dataio.read_data_from_delta_table( - f"{catalog}.{write_schema}.course_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.course_dataset_validated", + spark_session=spark, ) ) df_course.shape @@ -288,8 +302,8 @@ df_cohort = pdp.schemas.RawCohortDataSchema( pdp.dataio.read_data_from_delta_table( - f"{catalog}.{write_schema}.cohort_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.cohort_dataset_validated", + spark_session=spark, ) ) df_cohort.shape @@ -307,8 +321,11 @@ # COMMAND ---------- # specific follow-ups, for example +# df_course["academic_year"].value_counts(normalize=True, dropna=False) +# df_course["academic_term"].value_counts(normalize=True, dropna=False) # df_course["grade"].value_counts(normalize=True, dropna=False) # df_course["delivery_method"].value_counts(normalize=True, dropna=False) +# df_course["course_name"].value_counts(normalize=True, dropna=False).head(10) # COMMAND ---------- @@ -317,8 +334,8 @@ # COMMAND ---------- # specific follow-ups, for example -# df_course["cohort"].value_counts(normalize=True, dropna=False) -# df_course["enrollment_type"].value_counts(normalize=True, dropna=False) +# df_cohort["cohort"].value_counts(normalize=True, dropna=False) +# df_cohort["enrollment_type"].value_counts(normalize=True, dropna=False) # COMMAND ---------- @@ -509,6 +526,10 @@ # COMMAND ---------- +df_pre_cohort["enrollment_type"].value_counts() + +# COMMAND ---------- + # MAGIC %md # MAGIC ### filter invalid rows(?) @@ -769,7 +790,9 @@ # COMMAND ---------- # MAGIC %md -# MAGIC - [ ] Add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention) -# MAGIC - ... 
+# MAGIC - [ ] If you haven't already, add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention) +# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `labeled_dataset.raw_course` and `labeled_dataset.raw_cohort` blocks # COMMAND ---------- + + From aefe31946b9a4c74aeefe778a625e92533f513d6 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sun, 19 Jan 2025 21:15:28 +0000 Subject: [PATCH 11/18] style: Hide pip magics from linter --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index f2c299dc..4ee19cca 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -29,8 +29,6 @@ # %pip install "student-success-tool == 0.1.0" # %pip install "git+https://github.com/datakind/student-success-tool.git@develop" -%pip install "git+https://github.com/datakind/student-success-tool.git@pdp-add-inference-nb-template" - # COMMAND ---------- # MAGIC %restart_python From 17eddc59272d241f2237070788d6b3762b6f5a1c Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sun, 19 Jan 2025 21:16:36 +0000 Subject: [PATCH 12/18] style: istg databricks there must be a better way --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 1 - 1 file changed, 1 deletion(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 4ee19cca..0fbe8a45 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -36,7 +36,6 @@ # COMMAND ---------- import logging -import os import sys import matplotlib.pyplot as plt From 6c67960a77bf556dc37b240b98c7252871a4af31 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Mon, 20 Jan 2025 11:13:53 -0500 Subject: [PATCH 13/18] Tweak cfg structure for datasets --- notebooks/pdp/config-v2-TEMPLATE.toml | 4 ++-- .../configs/schemas/pdp_v2.py | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml index 0601f781..7fc768d7 100644 --- a/notebooks/pdp/config-v2-TEMPLATE.toml +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -11,10 +11,10 @@ pred_prob_col = "pred_prob" pos_label = true random_state = 12345 -[labeled_dataset.raw_course] +[datasets.labeled.raw_course] file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" -[labeled_dataset.raw_cohort] +[datasets.labeled.raw_cohort] file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" [preprocessing] diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 7ab639ad..418a058c 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -149,7 +149,7 @@ class InferenceConfig(pyd.BaseModel): # TODO: extend this configuration, maybe? 
-class DatasetConfig(pyd.BaseModel): +class DatasetIOConfig(pyd.BaseModel): table_path: t.Optional[str] = pyd.Field( default=None, description=( @@ -171,11 +171,16 @@ def check_some_nonnull_inputs(self): return self +class DatasetConfig(pyd.BaseModel): + raw_course: DatasetIOConfig + raw_cohort: DatasetIOConfig + preprocessed: t.Optional[DatasetIOConfig] = None + predictions: t.Optional[DatasetIOConfig] = None + + class DatasetsConfig(pyd.BaseModel): - raw_course: DatasetConfig - raw_cohort: DatasetConfig - preprocessed: t.Optional[DatasetConfig] = None - predictions: t.Optional[DatasetConfig] = None + labeled: DatasetConfig + unlabeled: t.Optional[DatasetConfig] = None class TrainedModelConfig(pyd.BaseModel): @@ -214,7 +219,7 @@ class PDPProjectConfigV2(pyd.BaseModel): # other shared parameters random_state: t.Optional[int] = None - labeled_dataset: t.Optional[DatasetsConfig] = None + datasets: t.Optional[DatasetsConfig] = None trained_model: t.Optional[TrainedModelConfig] = None preprocessing: t.Optional[PreprocessingConfig] = None From e787a2e6a184be218f7365c09c9ce44f0446f1aa Mon Sep 17 00:00:00 2001 From: bdewilde Date: Mon, 20 Jan 2025 16:16:46 +0000 Subject: [PATCH 14/18] Update 01-template nb w/ new config structure --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 0fbe8a45..74accd80 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -101,7 +101,7 @@ # COMMAND ---------- try: - raw_course_file_path = cfg.labeled_dataset.raw_course.file_path + raw_course_file_path = cfg.datasets.labeled.raw_course.file_path except AttributeError: # TODO: fill in the actual path to school's raw course file raw_course_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COURSE_AR_DEID_DTTM.csv" @@ -207,7 +207,7 @@ # COMMAND ---------- try: - raw_cohort_file_path = cfg.labeled_dataset.raw_cohort.file_path + raw_cohort_file_path = cfg.datasets.labeled.raw_cohort.file_path except AttributeError: # TODO: fill in the actual path to school's raw course file raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv" @@ -282,7 +282,7 @@ # MAGIC %md # MAGIC ## read validated data? 
# MAGIC -# MAGIC (so you don't have to execute the validation process more than once) +# MAGIC (optional, so you don't have to execute the validation process more than once) # COMMAND ---------- @@ -788,7 +788,8 @@ # MAGIC %md # MAGIC - [ ] If you haven't already, add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention) -# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `labeled_dataset.raw_course` and `labeled_dataset.raw_cohort` blocks +# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `datasets.labeled.raw_course` and `datasets.labeled.raw_cohort` blocks +# MAGIC - [ ] Submit a PR including this notebook and any school-specific files added in order to run it # COMMAND ---------- From bbb98714267a3a22a763bc5cf7e4235cae551af3 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Mon, 20 Jan 2025 16:42:45 +0000 Subject: [PATCH 15/18] Add examples for df course valid usage --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 74accd80..1a021de0 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -592,6 +592,7 @@ ax = sb.histplot( df_course.sort_values(by="academic_year"), + # df_course_valid.sort_values(by="academic_year"), y="academic_year", hue="academic_term", multiple="stack", @@ -663,6 +664,7 @@ ax = sb.histplot( pd.merge( df_course.groupby("student_guid") + # df_course_valid.groupby("student_guid") .size() .rename("num_courses_enrolled") .reset_index(drop=False), @@ -685,6 +687,9 @@ df_course.groupby("student_guid").agg( {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"} ), + # df_course_valid.groupby("student_guid").agg( + # {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"} + # ), x="number_of_credits_attempted", y="number_of_credits_earned", kind="hex", From 521a19975100c5e7b7e00b52ff4e4257c9f00fb3 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Tue, 21 Jan 2025 19:54:56 -0500 Subject: [PATCH 16/18] hack: Allow arbitrary target params in cfg --- notebooks/pdp/config-v2-TEMPLATE.toml | 4 ++++ src/student_success_tool/configs/schemas/pdp_v2.py | 1 + 2 files changed, 5 insertions(+) diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml index 7fc768d7..f74c2a25 100644 --- a/notebooks/pdp/config-v2-TEMPLATE.toml +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -28,6 +28,10 @@ course_level_pattern = 'asdf' key_course_subject_areas = ["24", "51"] key_course_ids = ["ENGL101", "MATH101"] +[preprocessing.target.params] +min_num_credits_checkin = 30.0 +min_num_credits_target = 60.0 + [preprocessing.target.student_criteria] enrollment_type = "FIRST-TIME" credential_type_sought_year_1 = "Bachelor's Degree" diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 418a058c..45b21aad 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -61,6 +61,7 @@ class TargetConfig(pyd.BaseModel): ), ) # TODO: refine target functionality and expand on this configuration + params: dict[str, object] = pyd.Field(default_factory=dict) class 
PreprocessingConfig(pyd.BaseModel): From 3492f3d3c3540e616e2992e1b81464b916b2cf94 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Wed, 22 Jan 2025 00:55:52 +0000 Subject: [PATCH 17/18] fix: Force logging in template nb --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 1a021de0..fdabbd39 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -51,7 +51,7 @@ # COMMAND ---------- -logging.basicConfig(level=logging.INFO) +logging.basicConfig(level=logging.INFO, force=True) logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger try: @@ -209,7 +209,7 @@ try: raw_cohort_file_path = cfg.datasets.labeled.raw_cohort.file_path except AttributeError: - # TODO: fill in the actual path to school's raw course file + # TODO: fill in the actual path to school's raw cohort file raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv" # COMMAND ---------- From d1006b13deb53794a05f5aaad921e331d82fce99 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sat, 25 Jan 2025 16:08:25 +0000 Subject: [PATCH 18/18] hack: save this work but revisit later --- .../02-prepare-modeling-dataset-TEMPLATE.py | 49 ++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py b/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py index 95fa5805..ac3a0f0d 100644 --- a/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py +++ b/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py @@ -22,8 +22,11 @@ # COMMAND ---------- -# install dependencies, most of which should come through our 1st-party SST package -# %pip install git+https://github.com/datakind/student-success-tool.git@develop +# install dependencies, most/all of which should come through our 1st-party SST package +# NOTE: it's okay to use 'develop' or a feature branch while developing this nb +# but when it's finished, it's best to pin to a specific version of the package +# %pip install "student-success-tool == 0.1.0" +# %pip install "git+https://github.com/datakind/student-success-tool.git@develop" # COMMAND ---------- @@ -56,7 +59,7 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## `student-success-intervention` hacks +# MAGIC ## import school-specific code # COMMAND ---------- @@ -65,7 +68,7 @@ # COMMAND ---------- -# HACK: insert our 1st-party (school-specific) code into PATH +# insert our 1st-party (school-specific) code into PATH if "../" not in sys.path: sys.path.insert(1, "../") @@ -74,24 +77,12 @@ # COMMAND ---------- -# MAGIC %md -# MAGIC ## project config - -# COMMAND ---------- - -# TODO: create a config file in TOML format to school directory -config = configs.load_config("./config.toml", schema=configs.PDPProjectConfig) -config - -# COMMAND ---------- - -catalog = "sst_dev" - -# configure where data is to be read from / written to -inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog -schema = f"{inst_name}_silver" -catalog_schema = f"{catalog}.{schema}" -print(f"{catalog_schema=}") +# project configuration should be stored in a config file in TOML format +# it'll start out with just basic info: institution_id, institution_name +# but as each step of the pipeline gets built, more parameters will be moved +# from hard-coded notebook variables to 
shareable, persistent config fields +cfg = configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2) +cfg # COMMAND ---------- @@ -130,6 +121,20 @@ # COMMAND ---------- +try: + feature_params = cfg.preprocessing.features.model_dump() +except AttributeError: + feature_params = { + "min_passing_grade": pdp.constants.DEFAULT_MIN_PASSING_GRADE, + "min_num_credits_full_time": pdp.constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME, + "course_level_pattern": pdp.constants.DEFAULT_COURSE_LEVEL_PATTERN, + "peak_covid_terms": pdp.constants.DEFAULT_PEAK_COVID_TERMS, + "key_course_subject_areas": None, + "key_course_ids": None, + } + +# COMMAND ---------- + dict(config.prepare_modeling_dataset) # COMMAND ----------