From e77400c49bdd9f570e9add7441dfcb18fdc922cd Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Thu, 16 Jan 2025 21:42:48 -0500 Subject: [PATCH 01/18] Add rough draft template nb --- .../04-make-explain-predictions-TEMPLATE.py | 325 ++++++++++++++++++ 1 file changed, 325 insertions(+) create mode 100644 notebooks/pdp/04-make-explain-predictions-TEMPLATE.py diff --git a/notebooks/pdp/04-make-explain-predictions-TEMPLATE.py b/notebooks/pdp/04-make-explain-predictions-TEMPLATE.py new file mode 100644 index 00000000..fd0c29cc --- /dev/null +++ b/notebooks/pdp/04-make-explain-predictions-TEMPLATE.py @@ -0,0 +1,325 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # SST Make and Explain Predictions: [SCHOOL] +# MAGIC +# MAGIC Fourth step in the process of transforming raw (PDP) data into actionable, data-driven insights for advisors: generate predictions and feature importances for new (unlabeled) data. +# MAGIC +# MAGIC ### References +# MAGIC +# MAGIC - [Data science product components (Confluence doc)](https://datakind.atlassian.net/wiki/spaces/TT/pages/237862913/Data+science+product+components+the+modeling+process) +# MAGIC - [Databricks runtimes release notes](https://docs.databricks.com/en/release-notes/runtime/index.html) +# MAGIC - [SCHOOL WEBSITE](https://example.com) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # setup + +# COMMAND ---------- + +# MAGIC %sh python --version + +# COMMAND ---------- + +# install dependencies, most of which should come through our 1st-party SST package +# %pip install "student-success-tool==0.1.0" --no-deps +# %pip install git+https://github.com/datakind/student-success-tool.git@develop --no-deps +# %pip install pandera + +# COMMAND ---------- + +# MAGIC %restart_python + +# COMMAND ---------- + +import functools as ft +import logging +import typing as t + +import mlflow +import numpy as np +import pandas as pd +import shap +import sklearn.inspection +import sklearn.metrics +from databricks.connect import DatabricksSession + +# from databricks.sdk.runtime import dbutils +# from py4j.protocol import Py4JJavaError +# from pyspark import SparkContext +# from pyspark.sql import SparkSession +from pyspark.sql.types import FloatType, StringType, StructField, StructType + +from student_success_tool.analysis.pdp import dataio +from student_success_tool.modeling import inference, utils + +# COMMAND ---------- + +logging.getLogger("root").setLevel(logging.INFO) +logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger + +try: + spark_session = DatabricksSession.builder.getOrCreate() +except Exception: + logging.warning("unable to create spark session; are you in a Databricks runtime?") + pass + +# COMMAND ---------- + +# Databricks logs every instance that uses sklearn or other modelling libraries +# to MLFlow experiments... 
which we don't want +mlflow.autolog(disable=True) +mlflow.sklearn.autolog(disable=True) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## configuration + +# COMMAND ---------- + +# TODO TODO TODO: use project config + +train_sample_size = 100 +validate_sample_size = 100 + +institution_id = "INSTITUTION_ID" +best_model_run_id = "BEST_MODEL_RUN_ID" +student_id_col = "student_guid" +target_col = "target" +split_col = "split" +sample_weight_col = "sample_weight" +pos_label = True + +model_type = "sklearn" +labeled_data_path = "CATALOG.SCHEMA.TABLE_NAME" # "TODO" +unlabeled_data_path = None + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # load model and data + +# COMMAND ---------- + + +# TODO: move this into sst package +def mlflow_load_model(model_uri: str, model_type: str): + load_model_func = ( + mlflow.sklearn.load_model + if model_type == "sklearn" + else mlflow.xgboost.load_model + if model_type == "xgboost" + else mlflow.lightgbm.load_model + if model_type == "lightgbm" + else mlflow.pyfunc.load_model + ) + model = load_model_func(f"runs:/{best_model_run_id}/model") + logging.info("mlflow '%s' model loaded from '%s'", model_type, model_uri) + return model + + +def predict_proba( + df: pd.DataFrame, *, model, pos_label: bool | str = True +) -> pd.Series: + return pd.Series( + model.predict_proba(df)[:, model.classes_.tolist().index(pos_label)] + ) + + +# COMMAND ---------- + +model = mlflow_load_model(f"runs:/{best_model_run_id}/model", model_type) +model_features = model.named_steps["column_selector"].get_params()["cols"] +logging.info( + "model uses %s features: %s", len(model_features), ", ".join(model_features) +) + +# COMMAND ---------- + +model_features = model.named_steps["column_selector"].get_params()["cols"] +print(len(model_features)) + +# COMMAND ---------- + +df_labeled = dataio.read_data_from_delta_table( + labeled_data_path, spark_session=spark_session +) +print(df_labeled.shape) +df_labeled.head() + +# COMMAND ---------- + +if unlabeled_data_path: + df_unlabeled = dataio.read_data_from_delta_table( + unlabeled_data_path, spark_session=spark_session + ) +else: + df_unlabeled = df_labeled.loc[df_labeled[split_col].eq("test"), :].drop( + columns=target_col + ) +print(df_unlabeled.shape) +df_unlabeled.head() + +# COMMAND ---------- + +pred_probs = predict_proba(df_unlabeled, model=model) +pred_probs.describe() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # initialize SHAP explainer + +# COMMAND ---------- + +df_train = df_labeled.loc[df_labeled[split_col].eq("train"), :] +# SHAP can't explain models using data with nulls +# so, impute nulls using the mode (most frequent values) +mode = df_train.mode().iloc[0] +# sample background data for SHAP Explainer +train_sample = ( + df_train.sample(n=min(train_sample_size, df_train.shape[0]), random_state=1) + .fillna(mode) + .loc[:, model_features] +) +train_sample + +# COMMAND ---------- + + +def predict_proba_v3( + X, + *, + model, + col_names: t.Optional[list[str]] = None, + pos_label: t.Optional[bool | str] = None, +) -> np.ndarray: + if col_names is None: + col_names = model.named_steps["column_selector"].get_params()["cols"] + pred_probs = model.predict_proba(pd.DataFrame(data=X, columns=col_names)) + if pos_label is not None: + return pred_probs[:, model.classes_.tolist().index(pos_label)] + else: + return pred_probs + + +def predict_proba_v2(X, *, model, pos_label: bool | str = True): + model_features = model.named_steps["column_selector"].get_params()["cols"] + pred_probs = model.predict_proba(pd.DataFrame(data=X, 
columns=model_features)) + return pred_probs[:, model.classes_.tolist().index(pos_label)] + + +# COMMAND ---------- + +# import shap +# import sklearn + +# X, y = shap.datasets.adult() +# m = sklearn.linear_model.LogisticRegression().fit(X, y) +# explainer = shap.explainers.Permutation(m.predict_proba, X) +# shap_values = explainer(X[:100]) +# shap.plots.bar(shap_values[..., 1]) + +# COMMAND ---------- + +# explainer = shap.explainers.Permutation(model.predict_proba, train_sample) +# explainer = shap.explainers.KernelExplainer(model.predict_proba, train_sample) +explainer = shap.explainers.KernelExplainer( + ft.partial( + predict_proba_v3, model=model, col_names=model_features, pos_label=pos_label + ), + train_sample, + link="identity", +) +explainer + +# COMMAND ---------- + +shap_schema = StructType( + [StructField(student_id_col, StringType(), nullable=False)] + + [StructField(col, FloatType(), nullable=False) for col in model_features] +) + +df_shap_values = ( + spark.createDataFrame(df_unlabeled.drop(columns=[split_col, sample_weight_col])) # noqa: F821 + .repartition(sc.defaultParallelism) # noqa: F821 + .mapInPandas( + ft.partial( + inference.calculate_shap_values_spark_udf, + student_id_col=student_id_col, + model_features=model_features, + explainer=explainer, + mode=mode, + ), + schema=shap_schema, + ) + .toPandas() + .set_index(student_id_col) + .reindex(df_unlabeled[student_id_col]) + .reset_index(drop=False) +) +df_shap_values + +# COMMAND ---------- + +shap.summary_plot( + df_shap_values[model_features].to_numpy(), + df_unlabeled[model_features], + class_names=model.classes_, + # show=False, ??? +) + +# COMMAND ---------- + +features_table = utils.load_features_table("assets/pdp/features_table.toml") +result = inference.select_top_features_for_display( + df_unlabeled.loc[:, model_features], + df_unlabeled[student_id_col], + pred_probs, + df_shap_values[model_features].to_numpy(), + n_features=5, + features_table=features_table, + needs_support_threshold_prob=0.5, +) +result + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## TODO: +# MAGIC +# MAGIC - save plots and results in a nice form in a place that makes sense + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## haxx + +# COMMAND ---------- + + +result = sklearn.inspection.permutation_importance( + model, + train_sample.drop(columns=target_col), + train_sample[target_col], + scoring=sklearn.metrics.make_scorer( + sklearn.metrics.log_loss, greater_is_better=False + ), + n_repeats=10, +) + +# COMMAND ---------- + +sorted_importances_idx = result.importances_mean.argsort() +importances = pd.DataFrame( + result.importances[sorted_importances_idx].T, + columns=train_sample.columns[sorted_importances_idx], +) +ax = importances.plot.box(vert=False, whis=10, figsize=(10, 10)) +ax.set_title("Permutation Importances (test set)") +ax.axvline(x=0, color="k", linestyle="--") +ax.set_xlabel("Decrease in accuracy score") +ax.figure.tight_layout() + +# COMMAND ---------- From 863a075fb225967d13488d989d5f6eedec9a136f Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Thu, 16 Jan 2025 22:11:40 -0500 Subject: [PATCH 02/18] Add load mlflow model util func --- src/student_success_tool/modeling/utils.py | 30 ++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/src/student_success_tool/modeling/utils.py b/src/student_success_tool/modeling/utils.py index c626bbb6..88fe1aad 100644 --- a/src/student_success_tool/modeling/utils.py +++ b/src/student_success_tool/modeling/utils.py @@ -3,6 +3,7 @@ import typing as t from collections.abc 
import Sequence +import mlflow import numpy as np import pandas as pd import sklearn.utils @@ -105,3 +106,32 @@ def load_features_table(rel_fpath: str) -> dict[str, dict[str, str]]: LOGGER.info("loaded features table from '%s'", file_path) assert isinstance(features_table, dict) # type guard return features_table + + +def load_mlflow_model( + model_uri: str, + model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None, +) -> object: + """ + Load a (registered) MLFlow model of whichever model type from a specified URI. + + Args: + model_uri + model_type + + References: + - https://mlflow.org/docs/latest/python_api/mlflow.sklearn.html#mlflow.sklearn.load_model + - https://mlflow.org/docs/latest/python_api/mlflow.pyfunc.html#mlflow.pyfunc.load_model + """ + load_model_func = ( + mlflow.sklearn.load_model + if model_type == "sklearn" + else mlflow.xgboost.load_model + if model_type == "xgboost" + else mlflow.lightgbm.load_model + if model_type == "lightgbm" + else mlflow.pyfunc.load_model + ) + model = load_model_func(model_uri) + LOGGER.info("mlflow model loaded from '%s'", model_uri) + return model From 26dfc8e144ba4310ce654932bfc1f3aa6b6958a4 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 14:59:34 -0500 Subject: [PATCH 03/18] Add better (v2) pdp project config --- src/student_success_tool/configs/__init__.py | 1 + .../configs/schemas/pdp_v2.py | 194 ++++++++++++++++++ 2 files changed, 195 insertions(+) create mode 100644 src/student_success_tool/configs/schemas/pdp_v2.py diff --git a/src/student_success_tool/configs/__init__.py b/src/student_success_tool/configs/__init__.py index 1acfd3fd..88bc3e61 100644 --- a/src/student_success_tool/configs/__init__.py +++ b/src/student_success_tool/configs/__init__.py @@ -1,2 +1,3 @@ from .load import load_config from .schemas.pdp import PDPProjectConfig +from .schemas.pdp_v2 import PDPProjectConfigV2 diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py new file mode 100644 index 00000000..d2278015 --- /dev/null +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -0,0 +1,194 @@ +import typing as t + +import pydantic as pyd + +from ...analysis.pdp import constants + + +class FeaturesConfig(pyd.BaseModel): + min_passing_grade: float = pyd.Field( + default=constants.DEFAULT_MIN_PASSING_GRADE, + description="Minimum numeric grade considered by institution as 'passing'", + gt=0.0, + lt=4.0, + ) + min_num_credits_full_time: float = pyd.Field( + default=constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME, + description=( + "Minimum number of credits *attempted* per term for a student's " + "enrollment intensity to be considered 'full-time'." 
+        ),
+        gt=0.0,
+        lt=20.0,
+    )
+    course_level_pattern: str = pyd.Field(
+        default=constants.DEFAULT_COURSE_LEVEL_PATTERN,
+        description=(
+            "Regular expression pattern that extracts a course's 'level' "
+            "from a PDP course_number field"
+        ),
+    )
+    peak_covid_terms: set[tuple[str, str]] = pyd.Field(
+        default=constants.DEFAULT_PEAK_COVID_TERMS,
+        description=(
+            "Set of (academic year, academic term) pairs considered by institution "
+            "as 'peak' COVID, for use in control variables to account for pandemic effects"
+        ),
+    )
+    key_course_subject_areas: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description=(
+            "One or more course subject areas (formatted as 2-digit CIP codes) "
+            "for which custom features should be computed"
+        ),
+    )
+    key_course_ids: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description=(
+            "One or more course ids (formatted as '[COURSE_PREFIX][COURSE_NUMBER]') "
+            "for which custom features should be computed"
+        ),
+    )
+
+
+class TargetConfig(pyd.BaseModel):
+    student_criteria: dict[str, object] = pyd.Field(
+        default_factory=dict,
+        description=(
+            "Column name in modeling dataset mapped to one or more values that it must equal "
+            "in order for the corresponding student to be considered 'eligible'. "
+            "Multiple criteria are combined with a logical 'AND'."
+        ),
+    )
+    # TODO: refine target functionality and expand on this configuration
+
+
+class PreprocessingConfig(pyd.BaseModel):
+    features: FeaturesConfig
+    target: TargetConfig
+    splits: dict[t.Literal["train", "test", "validate"], float] = pyd.Field(
+        default={"train": 0.6, "test": 0.2, "validate": 0.2},
+        description=(
+            "Mapping of name to fraction of the full dataset belonging to a given 'split', "
+            "which is a randomized subset used for different parts of the modeling process"
+        ),
+    )
+    sample_class_weight: t.Optional[t.Literal["balanced"] | dict[object, int]] = (
+        pyd.Field(
+            default=None,
+            description=(
+                "Weights associated with classes in the form ``{class_label: weight}`` "
+                "or 'balanced' to automatically adjust weights inversely proportional "
+                "to class frequencies in the input data. "
+                "If null (default), then sample weights are not computed."
+            ),
+        )
+    )
+
+    @pyd.field_validator("splits", mode="after")
+    @classmethod
+    def check_split_fractions(cls, value: dict) -> dict:
+        if (sum_fracs := sum(value.values())) != 1.0:
+            raise pyd.ValidationError(
+                f"split fractions must sum up to 1.0, but input sums up to {sum_fracs}"
+            )
+        return value
+
+
+class TrainingConfig(pyd.BaseModel):
+    """
+    References:
+    - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify
+    """
+
+    student_group_cols: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description=(
+            "One or more column names in dataset containing student 'groups' "
+            "to use for model bias assessment, but NOT as model features"
+        ),
+    )
+    exclude_cols: t.Optional[list[str]] = pyd.Field(
+        default=None,
+        description="One or more column names in dataset to exclude from training.",
+    )
+    time_col: t.Optional[str] = pyd.Field(
+        default=None,
+        description=(
+            "Column name in dataset used to split train/test/validate sets chronologically, "
+            "as an alternative to the randomized assignment in ``split_col``."
+ ), + ) + exclude_frameworks: t.Optional[list[str]] = pyd.Field( + default=None, + description="List of algorithm frameworks that AutoML excludes from training.", + ) + primary_metric: str = pyd.Field( + default="log_loss", + description="Metric used to evaluate and rank model performance.", + ) + timeout_minutes: t.Optional[int] = pyd.Field( + default=None, + description="Maximum time to wait for AutoML trials to complete.", + ) + + +class InferenceConfig(pyd.BaseModel): + num_top_features: int = pyd.Field(default=5) + + +class DatasetConfig(pyd.BaseModel): + table_path: t.Optional[str] = pyd.Field( + ..., + description=( + "Path to a table in Unity Catalog where dataset is stored, " + "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'" + ), + ) + file_path: t.Optional[str] = pyd.Field( + ..., + description="Full, absolute path to dataset on disk, e.g. a Databricks Volume", + ) + # TODO: if/when we allow different file formats, add this parameter ... + # file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None) + + +class DatasetsConfig(pyd.BaseModel): + raw: DatasetConfig + preprocessed: t.Optional[DatasetConfig] + predictions: t.Optional[DatasetConfig] + + +class TrainedModelConfig(pyd.BaseModel): + experiment_id: str + run_id: str + model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None + min_prob_pos_label: t.Optional[float] = 0.5 + + +class PDPProjectConfigV2(pyd.BaseModel): + """Configuration (v2) schema for PDP SST projects.""" + + institution_id: str + institution_name: str + + # shared dataset parameters + student_id_col: str = "student_guid" + target_col: str = "target" + split_col: str = "split" + sample_weight_col: t.Optional[str] = None + pos_label: t.Optional[int | bool | str] = True + pred_col: str = "pred" + pred_prob_col: str = "pred_prob" + # other shared parameters + random_state: t.Optional[int] = None + + labeled_dataset: DatasetsConfig + trained_model: t.Optional[TrainedModelConfig] = None + + preprocessing: t.Optional[PreprocessingConfig] = None + training: t.Optional[TrainingConfig] = None + inference: t.Optional[InferenceConfig] = None + + # NOTE: this is for *pydantic* model -- not ML model -- configuration + model_config = pyd.ConfigDict(extra="ignore", strict=True) From a2d007cc8507dd02d4f5d4f0794a42d37ba8bc10 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 15:21:05 -0500 Subject: [PATCH 04/18] Shuffle modeling config around --- .../configs/schemas/pdp_v2.py | 34 ++++++++++++++----- 1 file changed, 26 insertions(+), 8 deletions(-) diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index d2278015..9ab05734 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -95,19 +95,25 @@ def check_split_fractions(cls, value: dict) -> dict: return value +class FeatureSelectionConfig(pyd.BaseModel): + """ + See Also: + - :func:`modeling.feature_selection.select_features()` + """ + + non_feature_cols: t.Optional[list[str]] = None + force_include_cols: t.Optional[list[str]] = None + incomplete_threshold: float = 0.5 + low_variance_threshold: float = 0.0 + collinear_threshold: t.Optional[float] = 10.0 + + class TrainingConfig(pyd.BaseModel): """ References: - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify """ - student_group_cols: t.Optional[list[str]] = pyd.Field( - default=None, - description=( - "One or more column names in 
dataset containing student 'groups' " - "to use for model bias assessment, but NOT as model features" - ), - ) exclude_cols: t.Optional[list[str]] = pyd.Field( default=None, description="One or more column names in dataset to exclude from training.", @@ -133,6 +139,11 @@ class TrainingConfig(pyd.BaseModel): ) +class ModelingConfig(pyd.BaseModel): + feature_selection: t.Optional[FeatureSelectionConfig] = None + training: TrainingConfig + + class InferenceConfig(pyd.BaseModel): num_top_features: int = pyd.Field(default=5) @@ -177,6 +188,13 @@ class PDPProjectConfigV2(pyd.BaseModel): target_col: str = "target" split_col: str = "split" sample_weight_col: t.Optional[str] = None + student_group_cols: t.Optional[list[str]] = pyd.Field( + default=None, + description=( + "One or more column names in datasets containing student 'groups' " + "to use for model bias assessment, but NOT as model features" + ), + ) pos_label: t.Optional[int | bool | str] = True pred_col: str = "pred" pred_prob_col: str = "pred_prob" @@ -187,7 +205,7 @@ class PDPProjectConfigV2(pyd.BaseModel): trained_model: t.Optional[TrainedModelConfig] = None preprocessing: t.Optional[PreprocessingConfig] = None - training: t.Optional[TrainingConfig] = None + modeling: t.Optional[ModelingConfig] = None inference: t.Optional[InferenceConfig] = None # NOTE: this is for *pydantic* model -- not ML model -- configuration From 4a5391a281a2ad186a5ee3881b08d61448900fb5 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 16:13:35 -0500 Subject: [PATCH 05/18] Refine v2 project config --- .../configs/schemas/pdp_v2.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 9ab05734..d3e8ebc2 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -146,28 +146,35 @@ class ModelingConfig(pyd.BaseModel): class InferenceConfig(pyd.BaseModel): num_top_features: int = pyd.Field(default=5) + # TODO: extend this configuration, maybe? class DatasetConfig(pyd.BaseModel): table_path: t.Optional[str] = pyd.Field( - ..., + default=None, description=( "Path to a table in Unity Catalog where dataset is stored, " "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'" ), ) file_path: t.Optional[str] = pyd.Field( - ..., + default=None, description="Full, absolute path to dataset on disk, e.g. a Databricks Volume", ) # TODO: if/when we allow different file formats, add this parameter ... 
# file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None) + @pyd.model_validator(mode="after") + def check_some_nonnull_inputs(self): + if self.table_path is None and self.file_path is None: + raise pyd.ValidationError("table_path and/or file_path must be non-null") + return self + class DatasetsConfig(pyd.BaseModel): raw: DatasetConfig - preprocessed: t.Optional[DatasetConfig] - predictions: t.Optional[DatasetConfig] + preprocessed: t.Optional[DatasetConfig] = None + predictions: t.Optional[DatasetConfig] = None class TrainedModelConfig(pyd.BaseModel): @@ -176,6 +183,11 @@ class TrainedModelConfig(pyd.BaseModel): model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None min_prob_pos_label: t.Optional[float] = 0.5 + @pyd.computed_field # type: ignore[misc] + @property + def mlflow_model_uri(self) -> str: + return f"runs:/{self.run_id}/model" + class PDPProjectConfigV2(pyd.BaseModel): """Configuration (v2) schema for PDP SST projects.""" @@ -192,16 +204,16 @@ class PDPProjectConfigV2(pyd.BaseModel): default=None, description=( "One or more column names in datasets containing student 'groups' " - "to use for model bias assessment, but NOT as model features" + "to use for model bias assessment, but *not* as model features" ), ) - pos_label: t.Optional[int | bool | str] = True pred_col: str = "pred" pred_prob_col: str = "pred_prob" + pos_label: t.Optional[int | bool | str] = True # other shared parameters random_state: t.Optional[int] = None - labeled_dataset: DatasetsConfig + labeled_dataset: t.Optional[DatasetsConfig] = None trained_model: t.Optional[TrainedModelConfig] = None preprocessing: t.Optional[PreprocessingConfig] = None From 00dbc1422e9c3ef8a6e7508b1a80948599cf7619 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sat, 18 Jan 2025 16:43:54 -0500 Subject: [PATCH 06/18] Add proj config v2 template --- notebooks/pdp/config-v2-TEMPLATE.toml | 52 +++++++++++++++++++++++++++ 1 file changed, 52 insertions(+) create mode 100644 notebooks/pdp/config-v2-TEMPLATE.toml diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml new file mode 100644 index 00000000..fa4ddfb5 --- /dev/null +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -0,0 +1,52 @@ +institution_id = "INST_ID" +institution_name = "INST_NAME" + +student_id_col = "student_guid" +target_col = "target" +split_col = "split" +sample_weight_col = "sample_weight" +student_group_cols = ["student_age", "race", "ethnicity", "gender", "first_gen"] +pred_col = "pred" +pred_prob_col = "pred_prob" +pos_label = true +random_state = 12345 + +[labeled_dataset.raw] +file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME.csv" + +[preprocessing] +splits = { train = 0.6, test = 0.2, validate = 0.2 } +sample_class_weight = "balanced" + +[preprocessing.features] +min_passing_grade = 1.0 +min_num_credits_full_time = 12 +course_level_pattern = 'asdf' +key_course_subject_areas = ["24", "51"] +key_course_ids = ["ENGL101", "MATH101"] + +[preprocessing.target.student_criteria] +enrollment_type = "FIRST-TIME" +credential_type_sought_year_1 = "Bachelor's Degree" + +[labeled_dataset.preprocessed] +table_path = "CATALOG.SCHEMA.TABLE_NAME" + +[modeling.feature_selection] +incomplete_threshold = 0.5 +low_variance_threshold = 0.0 +collinear_threshold = 10.0 + +[modeling.training] +# exclude_frameworks = ["xgboost", "lightgbm"] +primary_metric = "log_loss" +timeout_minutes = 10 + +[trained_model] +experiment_id = "EXPERIMENT_ID" +run_id = "RUN_ID" +# model_type = 
"sklearn" +min_prob_pos_label = 0.5 + +[inference] +num_top_features = 5 From 7b08eb730c11c011f7758333d6c9640b9eb22d81 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sun, 19 Jan 2025 14:08:21 -0500 Subject: [PATCH 07/18] Add default student group cols these will *probably* be correct for all pdp schools --- src/student_success_tool/configs/schemas/pdp_v2.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index d3e8ebc2..987cc36c 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -201,7 +201,7 @@ class PDPProjectConfigV2(pyd.BaseModel): split_col: str = "split" sample_weight_col: t.Optional[str] = None student_group_cols: t.Optional[list[str]] = pyd.Field( - default=None, + default=["student_age", "race", "ethnicity", "gender", "first_gen"], description=( "One or more column names in datasets containing student 'groups' " "to use for model bias assessment, but *not* as model features" From e46df479d9be94facfc5eb39eaad674fbf402411 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sun, 19 Jan 2025 14:27:32 -0500 Subject: [PATCH 08/18] Split raw dataset cfg into course+cohort --- notebooks/pdp/config-v2-TEMPLATE.toml | 7 +++++-- src/student_success_tool/configs/schemas/pdp_v2.py | 3 ++- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml index fa4ddfb5..0601f781 100644 --- a/notebooks/pdp/config-v2-TEMPLATE.toml +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -11,8 +11,11 @@ pred_prob_col = "pred_prob" pos_label = true random_state = 12345 -[labeled_dataset.raw] -file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME.csv" +[labeled_dataset.raw_course] +file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" + +[labeled_dataset.raw_cohort] +file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" [preprocessing] splits = { train = 0.6, test = 0.2, validate = 0.2 } diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 987cc36c..7ab639ad 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -172,7 +172,8 @@ def check_some_nonnull_inputs(self): class DatasetsConfig(pyd.BaseModel): - raw: DatasetConfig + raw_course: DatasetConfig + raw_cohort: DatasetConfig preprocessed: t.Optional[DatasetConfig] = None predictions: t.Optional[DatasetConfig] = None From 343083f38dd86bc1c29099c3d5a58b9e02a45097 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Sun, 19 Jan 2025 15:50:10 -0500 Subject: [PATCH 09/18] fix: Allow nullable raw cohort column values amazing that nobody has caught this issue yet --- src/student_success_tool/analysis/pdp/schemas/raw_cohort.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py b/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py index 518731d5..6b4c3455 100644 --- a/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py +++ b/src/student_success_tool/analysis/pdp/schemas/raw_cohort.py @@ -74,7 +74,9 @@ class RawPDPCohortDataSchema(pda.DataFrameModel): dtype_kwargs={"categories": ["FIRST-TIME", "RE-ADMIT", "TRANSFER-IN"]}, ) # NOTE: categories set in a 
parser, which forces "UK" / "UNKNOWN" values to null - enrollment_intensity_first_term: pt.Series[pd.CategoricalDtype] = pda.Field() + enrollment_intensity_first_term: pt.Series[pd.CategoricalDtype] = pda.Field( + nullable=True + ) # NOTE: categories set in a parser, which forces "UK" values to null math_placement: pt.Series[pd.CategoricalDtype] = pda.Field(nullable=True) # NOTE: categories set in a parser, which forces "UK" values to null From 9f1dd5cc9e498abb25b14e0d712ba34a31e122e6 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sun, 19 Jan 2025 21:14:12 +0000 Subject: [PATCH 10/18] Update 01-template nb w/ configs and fixes --- .../pdp/01-data-assessment-eda-TEMPLATE.py | 119 +++++++++++------- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index fe1ec743..f2c299dc 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -23,8 +23,13 @@ # COMMAND ---------- -# install dependencies, most of which should come through our 1st-party SST package -# %pip install git+https://github.com/datakind/student-success-tool.git@develop +# install dependencies, most/all of which should come through our 1st-party SST package +# NOTE: it's okay to use 'develop' or a feature branch while developing this nb +# but when it's finished, it's best to pin to a specific version of the package +# %pip install "student-success-tool == 0.1.0" +# %pip install "git+https://github.com/datakind/student-success-tool.git@develop" + +%pip install "git+https://github.com/datakind/student-success-tool.git@pdp-add-inference-nb-template" # COMMAND ---------- @@ -45,6 +50,7 @@ from databricks.sdk.runtime import dbutils from student_success_tool.analysis import pdp +from student_success_tool import configs # COMMAND ---------- @@ -52,7 +58,7 @@ logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger try: - spark_session = DatabricksSession.builder.getOrCreate() + spark = DatabricksSession.builder.getOrCreate() except Exception: logging.warning("unable to create spark session; are you in a Databricks runtime?") pass @@ -60,7 +66,7 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## `student-success-intervention` hacks +# MAGIC ## import school-specific code # COMMAND ---------- @@ -69,33 +75,21 @@ # COMMAND ---------- -# HACK: insert our 1st-party (school-specific) code into PATH +# insert our 1st-party (school-specific) code into PATH if "../" not in sys.path: sys.path.insert(1, "../") -# TODO: specify school's subpackage +# TODO: specify school's subpackage here from analysis import * # noqa: F403 # COMMAND ---------- -# MAGIC %md -# MAGIC ## unity catalog config - -# COMMAND ---------- - -catalog = "sst_dev" - -# configure where data is to be read from / written to -inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog -read_schema = f"{inst_name}_bronze" -write_schema = f"{inst_name}_silver" - -path_volume = os.path.join( - "/Volumes", catalog, read_schema, f"{inst_name}_bronze_file_volume" -) -path_table = f"{catalog}.{read_schema}" -print(f"{path_table=}") -print(f"{path_volume=}") +# project configuration should be stored in a config file in TOML format +# it'll start out with just basic info: institution_id, institution_name +# but as each step of the pipeline gets built, more parameters will be moved +# from hard-coded notebook variables to shareable, persistent config fields +cfg = 
configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2) +cfg # COMMAND ---------- @@ -109,14 +103,17 @@ # COMMAND ---------- -# TODO: fill in school's name; may not be same as in the schemas above -fpath_course = os.path.join(path_volume, "SCHOOL_COURSE_AR_DEID_DTTM.csv") +try: + raw_course_file_path = cfg.labeled_dataset.raw_course.file_path +except AttributeError: + # TODO: fill in the actual path to school's raw course file + raw_course_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COURSE_AR_DEID_DTTM.csv" # COMMAND ---------- # read without any schema validation, so we can look at the data "raw" df_course_raw = pdp.dataio.read_raw_pdp_course_data_from_file( - fpath_course, schema=None, dttm_format="%Y%m%d.0" + raw_course_file_path, schema=None, dttm_format="%Y%m%d.0" ) print(f"rows x cols = {df_course_raw.shape}") df_course_raw.head() @@ -127,6 +124,10 @@ # COMMAND ---------- +df_course_raw["course_begin_date"].describe() + +# COMMAND ---------- + # MAGIC %md # MAGIC Quick checks: # MAGIC - [ ] data exists where it should @@ -137,7 +138,7 @@ # try to read data while validating with the "base" PDP schema df_course = pdp.dataio.read_raw_pdp_course_data_from_file( - fpath_course, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0" + raw_course_file_path, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0" ) df_course @@ -199,7 +200,7 @@ # MAGIC ``` # MAGIC # MAGIC At this point, `df_course` should be a properly validated and parsed data frame, ready for exploratory data analysis. - +# MAGIC # COMMAND ---------- @@ -208,14 +209,18 @@ # COMMAND ---------- - -# TODO: fill in school's name; may not be same as in the schemas above -fpath_cohort = os.path.join(path_volume, "SCHOOL_COHORT_AR_DEID_DTTM.csv") +try: + raw_cohort_file_path = cfg.labeled_dataset.raw_cohort.file_path +except AttributeError: + # TODO: fill in the actual path to school's raw course file + raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv" # COMMAND ---------- # read without any schema validation, so we can look at the data "raw" -df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file(fpath_cohort, schema=None) +df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file( + raw_cohort_file_path, schema=None +) print(f"rows x cols = {df_cohort_raw.shape}") df_cohort_raw.head() @@ -223,7 +228,7 @@ # try to read data while validating with the "base" PDP schema df_cohort = pdp.dataio.read_raw_pdp_cohort_data_from_file( - fpath_cohort, schema=pdp.schemas.base.RawPDPCohortDataSchema + raw_cohort_file_path, schema=pdp.schemas.RawPDPCohortDataSchema ) df_cohort @@ -242,22 +247,31 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## save validated data +# MAGIC ## HEY, STOP HERE! + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Before continuing on to EDA, now's a great time to do a couple things: +# MAGIC +# MAGIC - Copy any school-specific raw dataset schemas into a `schemas.py` file in the current working directory +# MAGIC - Copy any school-specific preprocessing functions needed to coerce the raw data into a standardized form into a `dataio.py` file in the current working directory +# MAGIC - **Optional:** If you want easy access to outputs from every (sub-)step of the data transformation pipeline, save the validated datasets into this school's "silver" schema in Unity Catalog. 
# COMMAND ---------- pdp.dataio.write_data_to_delta_table( df_course, - f"{catalog}.{write_schema}.course_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.course_dataset_validated", + spark_session=spark, ) # COMMAND ---------- pdp.dataio.write_data_to_delta_table( df_cohort, - f"{catalog}.{write_schema}.cohort_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.cohort_dataset_validated", + spark_session=spark, ) # COMMAND ---------- @@ -269,7 +283,7 @@ # MAGIC %md # MAGIC %md -# MAGIC ## read validated data +# MAGIC ## read validated data? # MAGIC # MAGIC (so you don't have to execute the validation process more than once) @@ -278,8 +292,8 @@ # use base or school-specific schema, as needed df_course = pdp.schemas.RawPDPCourseDataSchema( pdp.dataio.read_data_from_delta_table( - f"{catalog}.{write_schema}.course_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.course_dataset_validated", + spark_session=spark, ) ) df_course.shape @@ -288,8 +302,8 @@ df_cohort = pdp.schemas.RawCohortDataSchema( pdp.dataio.read_data_from_delta_table( - f"{catalog}.{write_schema}.cohort_dataset_validated", - spark_session=spark_session, + "CATALOG.INST_NAME_silver.cohort_dataset_validated", + spark_session=spark, ) ) df_cohort.shape @@ -307,8 +321,11 @@ # COMMAND ---------- # specific follow-ups, for example +# df_course["academic_year"].value_counts(normalize=True, dropna=False) +# df_course["academic_term"].value_counts(normalize=True, dropna=False) # df_course["grade"].value_counts(normalize=True, dropna=False) # df_course["delivery_method"].value_counts(normalize=True, dropna=False) +# df_course["course_name"].value_counts(normalize=True, dropna=False).head(10) # COMMAND ---------- @@ -317,8 +334,8 @@ # COMMAND ---------- # specific follow-ups, for example -# df_course["cohort"].value_counts(normalize=True, dropna=False) -# df_course["enrollment_type"].value_counts(normalize=True, dropna=False) +# df_cohort["cohort"].value_counts(normalize=True, dropna=False) +# df_cohort["enrollment_type"].value_counts(normalize=True, dropna=False) # COMMAND ---------- @@ -509,6 +526,10 @@ # COMMAND ---------- +df_pre_cohort["enrollment_type"].value_counts() + +# COMMAND ---------- + # MAGIC %md # MAGIC ### filter invalid rows(?) @@ -769,7 +790,9 @@ # COMMAND ---------- # MAGIC %md -# MAGIC - [ ] Add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention) -# MAGIC - ... 
+# MAGIC - [ ] If you haven't already, add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention) +# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `labeled_dataset.raw_course` and `labeled_dataset.raw_cohort` blocks # COMMAND ---------- + + From aefe31946b9a4c74aeefe778a625e92533f513d6 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sun, 19 Jan 2025 21:15:28 +0000 Subject: [PATCH 11/18] style: Hide pip magics from linter --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index f2c299dc..4ee19cca 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -29,8 +29,6 @@ # %pip install "student-success-tool == 0.1.0" # %pip install "git+https://github.com/datakind/student-success-tool.git@develop" -%pip install "git+https://github.com/datakind/student-success-tool.git@pdp-add-inference-nb-template" - # COMMAND ---------- # MAGIC %restart_python From 17eddc59272d241f2237070788d6b3762b6f5a1c Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sun, 19 Jan 2025 21:16:36 +0000 Subject: [PATCH 12/18] style: istg databricks there must be a better way --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 1 - 1 file changed, 1 deletion(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 4ee19cca..0fbe8a45 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -36,7 +36,6 @@ # COMMAND ---------- import logging -import os import sys import matplotlib.pyplot as plt From 6c67960a77bf556dc37b240b98c7252871a4af31 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Mon, 20 Jan 2025 11:13:53 -0500 Subject: [PATCH 13/18] Tweak cfg structure for datasets --- notebooks/pdp/config-v2-TEMPLATE.toml | 4 ++-- .../configs/schemas/pdp_v2.py | 17 +++++++++++------ 2 files changed, 13 insertions(+), 8 deletions(-) diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml index 0601f781..7fc768d7 100644 --- a/notebooks/pdp/config-v2-TEMPLATE.toml +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -11,10 +11,10 @@ pred_prob_col = "pred_prob" pos_label = true random_state = 12345 -[labeled_dataset.raw_course] +[datasets.labeled.raw_course] file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" -[labeled_dataset.raw_cohort] +[datasets.labeled.raw_cohort] file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" [preprocessing] diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 7ab639ad..418a058c 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -149,7 +149,7 @@ class InferenceConfig(pyd.BaseModel): # TODO: extend this configuration, maybe? 
-class DatasetConfig(pyd.BaseModel): +class DatasetIOConfig(pyd.BaseModel): table_path: t.Optional[str] = pyd.Field( default=None, description=( @@ -171,11 +171,16 @@ def check_some_nonnull_inputs(self): return self +class DatasetConfig(pyd.BaseModel): + raw_course: DatasetIOConfig + raw_cohort: DatasetIOConfig + preprocessed: t.Optional[DatasetIOConfig] = None + predictions: t.Optional[DatasetIOConfig] = None + + class DatasetsConfig(pyd.BaseModel): - raw_course: DatasetConfig - raw_cohort: DatasetConfig - preprocessed: t.Optional[DatasetConfig] = None - predictions: t.Optional[DatasetConfig] = None + labeled: DatasetConfig + unlabeled: t.Optional[DatasetConfig] = None class TrainedModelConfig(pyd.BaseModel): @@ -214,7 +219,7 @@ class PDPProjectConfigV2(pyd.BaseModel): # other shared parameters random_state: t.Optional[int] = None - labeled_dataset: t.Optional[DatasetsConfig] = None + datasets: t.Optional[DatasetsConfig] = None trained_model: t.Optional[TrainedModelConfig] = None preprocessing: t.Optional[PreprocessingConfig] = None From e787a2e6a184be218f7365c09c9ce44f0446f1aa Mon Sep 17 00:00:00 2001 From: bdewilde Date: Mon, 20 Jan 2025 16:16:46 +0000 Subject: [PATCH 14/18] Update 01-template nb w/ new config structure --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 0fbe8a45..74accd80 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -101,7 +101,7 @@ # COMMAND ---------- try: - raw_course_file_path = cfg.labeled_dataset.raw_course.file_path + raw_course_file_path = cfg.datasets.labeled.raw_course.file_path except AttributeError: # TODO: fill in the actual path to school's raw course file raw_course_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COURSE_AR_DEID_DTTM.csv" @@ -207,7 +207,7 @@ # COMMAND ---------- try: - raw_cohort_file_path = cfg.labeled_dataset.raw_cohort.file_path + raw_cohort_file_path = cfg.datasets.labeled.raw_cohort.file_path except AttributeError: # TODO: fill in the actual path to school's raw course file raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv" @@ -282,7 +282,7 @@ # MAGIC %md # MAGIC ## read validated data? 
# MAGIC -# MAGIC (so you don't have to execute the validation process more than once) +# MAGIC (optional, so you don't have to execute the validation process more than once) # COMMAND ---------- @@ -788,7 +788,8 @@ # MAGIC %md # MAGIC - [ ] If you haven't already, add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention) -# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `labeled_dataset.raw_course` and `labeled_dataset.raw_cohort` blocks +# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `datasets.labeled.raw_course` and `datasets.labeled.raw_cohort` blocks +# MAGIC - [ ] Submit a PR including this notebook and any school-specific files added in order to run it # COMMAND ---------- From bbb98714267a3a22a763bc5cf7e4235cae551af3 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Mon, 20 Jan 2025 16:42:45 +0000 Subject: [PATCH 15/18] Add examples for df course valid usage --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 74accd80..1a021de0 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -592,6 +592,7 @@ ax = sb.histplot( df_course.sort_values(by="academic_year"), + # df_course_valid.sort_values(by="academic_year"), y="academic_year", hue="academic_term", multiple="stack", @@ -663,6 +664,7 @@ ax = sb.histplot( pd.merge( df_course.groupby("student_guid") + # df_course_valid.groupby("student_guid") .size() .rename("num_courses_enrolled") .reset_index(drop=False), @@ -685,6 +687,9 @@ df_course.groupby("student_guid").agg( {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"} ), + # df_course_valid.groupby("student_guid").agg( + # {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"} + # ), x="number_of_credits_attempted", y="number_of_credits_earned", kind="hex", From 521a19975100c5e7b7e00b52ff4e4257c9f00fb3 Mon Sep 17 00:00:00 2001 From: Burton DeWilde Date: Tue, 21 Jan 2025 19:54:56 -0500 Subject: [PATCH 16/18] hack: Allow arbitrary target params in cfg --- notebooks/pdp/config-v2-TEMPLATE.toml | 4 ++++ src/student_success_tool/configs/schemas/pdp_v2.py | 1 + 2 files changed, 5 insertions(+) diff --git a/notebooks/pdp/config-v2-TEMPLATE.toml b/notebooks/pdp/config-v2-TEMPLATE.toml index 7fc768d7..f74c2a25 100644 --- a/notebooks/pdp/config-v2-TEMPLATE.toml +++ b/notebooks/pdp/config-v2-TEMPLATE.toml @@ -28,6 +28,10 @@ course_level_pattern = 'asdf' key_course_subject_areas = ["24", "51"] key_course_ids = ["ENGL101", "MATH101"] +[preprocessing.target.params] +min_num_credits_checkin = 30.0 +min_num_credits_target = 60.0 + [preprocessing.target.student_criteria] enrollment_type = "FIRST-TIME" credential_type_sought_year_1 = "Bachelor's Degree" diff --git a/src/student_success_tool/configs/schemas/pdp_v2.py b/src/student_success_tool/configs/schemas/pdp_v2.py index 418a058c..45b21aad 100644 --- a/src/student_success_tool/configs/schemas/pdp_v2.py +++ b/src/student_success_tool/configs/schemas/pdp_v2.py @@ -61,6 +61,7 @@ class TargetConfig(pyd.BaseModel): ), ) # TODO: refine target functionality and expand on this configuration + params: dict[str, object] = pyd.Field(default_factory=dict) class 
PreprocessingConfig(pyd.BaseModel): From 3492f3d3c3540e616e2992e1b81464b916b2cf94 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Wed, 22 Jan 2025 00:55:52 +0000 Subject: [PATCH 17/18] fix: Force logging in template nb --- notebooks/pdp/01-data-assessment-eda-TEMPLATE.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py index 1a021de0..fdabbd39 100644 --- a/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py +++ b/notebooks/pdp/01-data-assessment-eda-TEMPLATE.py @@ -51,7 +51,7 @@ # COMMAND ---------- -logging.basicConfig(level=logging.INFO) +logging.basicConfig(level=logging.INFO, force=True) logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger try: @@ -209,7 +209,7 @@ try: raw_cohort_file_path = cfg.datasets.labeled.raw_cohort.file_path except AttributeError: - # TODO: fill in the actual path to school's raw course file + # TODO: fill in the actual path to school's raw cohort file raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv" # COMMAND ---------- From d1006b13deb53794a05f5aaad921e331d82fce99 Mon Sep 17 00:00:00 2001 From: bdewilde Date: Sat, 25 Jan 2025 16:08:25 +0000 Subject: [PATCH 18/18] hack: save this work but revisit later --- .../02-prepare-modeling-dataset-TEMPLATE.py | 49 ++++++++++--------- 1 file changed, 27 insertions(+), 22 deletions(-) diff --git a/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py b/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py index 95fa5805..ac3a0f0d 100644 --- a/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py +++ b/notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py @@ -22,8 +22,11 @@ # COMMAND ---------- -# install dependencies, most of which should come through our 1st-party SST package -# %pip install git+https://github.com/datakind/student-success-tool.git@develop +# install dependencies, most/all of which should come through our 1st-party SST package +# NOTE: it's okay to use 'develop' or a feature branch while developing this nb +# but when it's finished, it's best to pin to a specific version of the package +# %pip install "student-success-tool == 0.1.0" +# %pip install "git+https://github.com/datakind/student-success-tool.git@develop" # COMMAND ---------- @@ -56,7 +59,7 @@ # COMMAND ---------- # MAGIC %md -# MAGIC ## `student-success-intervention` hacks +# MAGIC ## import school-specific code # COMMAND ---------- @@ -65,7 +68,7 @@ # COMMAND ---------- -# HACK: insert our 1st-party (school-specific) code into PATH +# insert our 1st-party (school-specific) code into PATH if "../" not in sys.path: sys.path.insert(1, "../") @@ -74,24 +77,12 @@ # COMMAND ---------- -# MAGIC %md -# MAGIC ## project config - -# COMMAND ---------- - -# TODO: create a config file in TOML format to school directory -config = configs.load_config("./config.toml", schema=configs.PDPProjectConfig) -config - -# COMMAND ---------- - -catalog = "sst_dev" - -# configure where data is to be read from / written to -inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog -schema = f"{inst_name}_silver" -catalog_schema = f"{catalog}.{schema}" -print(f"{catalog_schema=}") +# project configuration should be stored in a config file in TOML format +# it'll start out with just basic info: institution_id, institution_name +# but as each step of the pipeline gets built, more parameters will be moved +# from hard-coded notebook variables to 
shareable, persistent config fields +cfg = configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2) +cfg # COMMAND ---------- @@ -130,6 +121,20 @@ # COMMAND ---------- +try: + feature_params = cfg.preprocessing.features.model_dump() +except AttributeError: + feature_params = { + "min_passing_grade": pdp.constants.DEFAULT_MIN_PASSING_GRADE, + "min_num_credits_full_time": pdp.constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME, + "course_level_pattern": pdp.constants.DEFAULT_COURSE_LEVEL_PATTERN, + "peak_covid_terms": pdp.constants.DEFAULT_PEAK_COVID_TERMS, + "key_course_subject_areas": None, + "key_course_ids": None, + } + +# COMMAND ---------- + dict(config.prepare_modeling_dataset) # COMMAND ----------