Improve pdp project config structure + functionality #56

Merged · 3 commits · Jan 23, 2025
Changes from all commits
52 changes: 52 additions & 0 deletions notebooks/pdp/config-v2-TEMPLATE.toml
@@ -0,0 +1,52 @@
institution_id = "INST_ID"
institution_name = "INST NAME"

student_id_col = "student_guid"
target_col = "target"
split_col = "split"
sample_weight_col = "sample_weight"
student_group_cols = ["student_age", "race", "ethnicity", "gender", "first_gen"]
pred_col = "pred"
pred_prob_col = "pred_prob"
pos_label = true
random_state = 12345
Contributor: Do lines 4-12 need to go under modeling.training? Or are they meant to be used for more than just modeling.training?

Member Author: Yup! These get used in both preprocessing and model training.


[datasets.labeled]
raw_course = { file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" }
raw_cohort = { file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" }
preprocessed = { table_path = "CATALOG.SCHEMA.TABLE_NAME" }

[preprocessing]
splits = { train = 0.6, test = 0.2, validate = 0.2 }
sample_class_weight = "balanced"

[preprocessing.features]
min_passing_grade = 1.0
min_num_credits_full_time = 12
# NOTE: single quotes *required* here; it's TOML syntax for literal strings
course_level_pattern = 'asdf'
key_course_subject_areas = ["24", "51"]
key_course_ids = ["ENGL101", "MATH101"]

[preprocessing.target]
params = { min_num_credits_checkin = 30.0, min_num_credits_target = 60.0 }
student_criteria = { enrollment_type = "FIRST-TIME", credential_type_sought_year_1 = "Bachelor's Degree" }

[modeling.feature_selection]
incomplete_threshold = 0.5
low_variance_threshold = 0.0
collinear_threshold = 10.0

[modeling.training]
# exclude_frameworks = ["xgboost", "lightgbm"]
primary_metric = "log_loss"
timeout_minutes = 10

[trained_model]
experiment_id = "EXPERIMENT_ID"
run_id = "RUN_ID"
# model_type = "sklearn"
min_prob_pos_label = 0.5

[inference]
num_top_features = 5
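A filled-in copy of this template can be validated against the new v2 schema. The package also exposes load_config (see the __init__.py change below), but its signature isn't shown in this diff, so here is a minimal sketch using pydantic directly; it assumes Python 3.11+ for tomllib:

import tomllib

from student_success_tool.configs import PDPProjectConfigV2

# parse the TOML template into a plain dict, then validate it against the
# v2 pydantic schema; unknown keys are ignored, per extra="ignore" below
with open("notebooks/pdp/config-v2-TEMPLATE.toml", "rb") as f:
    raw_cfg = tomllib.load(f)

cfg = PDPProjectConfigV2.model_validate(raw_cfg)
print(cfg.institution_id, cfg.preprocessing.splits)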
1 change: 1 addition & 0 deletions src/student_success_tool/configs/__init__.py
@@ -1,2 +1,3 @@
from .load import load_config
from .schemas.pdp import PDPProjectConfig
from .schemas.pdp_v2 import PDPProjectConfigV2
249 changes: 249 additions & 0 deletions src/student_success_tool/configs/schemas/pdp_v2.py
@@ -0,0 +1,249 @@
import re
import typing as t

import pydantic as pyd

from ...analysis.pdp import constants


class FeaturesConfig(pyd.BaseModel):
    min_passing_grade: float = pyd.Field(
        default=constants.DEFAULT_MIN_PASSING_GRADE,
        description="Minimum numeric grade considered by institution as 'passing'",
        gt=0.0,
        lt=4.0,
    )
    min_num_credits_full_time: float = pyd.Field(
        default=constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME,
        description=(
            "Minimum number of credits *attempted* per term for a student's "
            "enrollment intensity to be considered 'full-time'."
        ),
        gt=0.0,
        lt=20.0,
    )
    course_level_pattern: str = pyd.Field(
        default=constants.DEFAULT_COURSE_LEVEL_PATTERN,
        description=(
            "Regular expression pattern that extracts a course's 'level' "
            "from a PDP course_number field"
        ),
    )
    peak_covid_terms: set[tuple[str, str]] = pyd.Field(
        default=constants.DEFAULT_PEAK_COVID_TERMS,
        description=(
            "Set of (academic year, academic term) pairs considered by institution "
            "as 'peak' COVID, for use in control variables to account for pandemic effects"
        ),
    )
    key_course_subject_areas: t.Optional[list[str]] = pyd.Field(
        default=None,
        description=(
            "One or more course subject areas (formatted as 2-digit CIP codes) "
            "for which custom features should be computed"
        ),
    )
    key_course_ids: t.Optional[list[str]] = pyd.Field(
        default=None,
        description=(
            "One or more course ids (formatted as '[COURSE_PREFIX][COURSE_NUMBER]') "
            "for which custom features should be computed"
        ),
    )


class TargetConfig(pyd.BaseModel):
    student_criteria: dict[str, object] = pyd.Field(
        default_factory=dict,
        description=(
            "Column name in modeling dataset mapped to one or more values that it must equal "
            "in order for the corresponding student to be considered 'eligible'. "
            "Multiple criteria are combined with a logical 'AND'."
        ),
    )
    # TODO: refine target functionality and expand on this configuration
    params: dict[str, object] = pyd.Field(default_factory=dict)
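To make the 'AND' semantics concrete, here's a hypothetical sketch of how student_criteria could filter a modeling dataset with pandas; the dataframe and helper logic are illustrative, not this package's actual preprocessing code:

import pandas as pd

# hypothetical modeling data; columns mirror the template's student_criteria
df = pd.DataFrame(
    {
        "student_guid": ["a", "b", "c"],
        "enrollment_type": ["FIRST-TIME", "TRANSFER-IN", "FIRST-TIME"],
        "credential_type_sought_year_1": [
            "Bachelor's Degree",
            "Bachelor's Degree",
            "Associate's Degree",
        ],
    }
)
criteria = {
    "enrollment_type": "FIRST-TIME",
    "credential_type_sought_year_1": "Bachelor's Degree",
}

# each criterion may be a single value or a list of values; all must hold (logical AND)
mask = pd.Series(True, index=df.index)
for col, value in criteria.items():
    allowed = value if isinstance(value, list) else [value]
    mask &= df[col].isin(allowed)

eligible = df.loc[mask]  # only student "a" meets both criteria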


class PreprocessingConfig(pyd.BaseModel):
    features: FeaturesConfig
    target: TargetConfig
    splits: dict[t.Literal["train", "test", "validate"], float] = pyd.Field(
        default={"train": 0.6, "test": 0.2, "validate": 0.2},
        description=(
            "Mapping of name to fraction of the full dataset belonging to a given 'split', "
            "which is a randomized subset used for different parts of the modeling process"
        ),
    )
    sample_class_weight: t.Optional[t.Literal["balanced"] | dict[object, int]] = (
        pyd.Field(
            default=None,
            description=(
                "Weights associated with classes in the form ``{class_label: weight}`` "
                "or 'balanced' to automatically adjust weights inversely proportional "
                "to class frequencies in the input data. "
                "If null (default), then sample weights are not computed."
            ),
        )
    )

    @pyd.field_validator("splits", mode="after")
    @classmethod
    def check_split_fractions(cls, value: dict) -> dict:
        if (sum_fracs := sum(value.values())) != 1.0:
            # raise ValueError, not pyd.ValidationError: pydantic wraps ValueErrors
            # raised in validators; ValidationError can't be built from a bare string
            raise ValueError(
                f"split fractions must sum up to 1.0, but input sums up to {sum_fracs}"
            )
        return value
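A usage note on the validator above, with hypothetical values: any splits mapping whose fractions don't sum to exactly 1.0 is rejected at validation time.

import pydantic

try:
    PreprocessingConfig(
        features=FeaturesConfig(),
        target=TargetConfig(),
        splits={"train": 0.5, "test": 0.25, "validate": 0.2},  # sums to 0.95
    )
except pydantic.ValidationError as exc:
    print(exc)  # split fractions must sum up to 1.0, but input sums up to 0.95

Since the check uses exact float equality, fraction triples that only approximately sum to 1.0 due to rounding would also be rejected; math.isclose(sum_fracs, 1.0) would be a more forgiving alternative.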


class FeatureSelectionConfig(pyd.BaseModel):
    """
    See Also:
        - :func:`modeling.feature_selection.select_features()`
    """

    non_feature_cols: t.Optional[list[str]] = None
    force_include_cols: t.Optional[list[str]] = None
    incomplete_threshold: float = 0.5
    low_variance_threshold: float = 0.0
    collinear_threshold: t.Optional[float] = 10.0


class TrainingConfig(pyd.BaseModel):
    """
    References:
        - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify
    """

    exclude_cols: t.Optional[list[str]] = pyd.Field(
        default=None,
        description="One or more column names in dataset to exclude from training.",
    )
    time_col: t.Optional[str] = pyd.Field(
Contributor: This is interesting, using a chronology column. How is this typically used?

Member Author: AFAIK we haven't used this configuration in our models, but we could. It's supported by AutoML (see the reference link), so it's included for completeness.

        default=None,
        description=(
            "Column name in dataset used to split train/test/validate sets chronologically, "
            "as an alternative to the randomized assignment in ``split_col``."
        ),
    )
    exclude_frameworks: t.Optional[list[str]] = pyd.Field(
        default=None,
        description="List of algorithm frameworks that AutoML excludes from training.",
    )
    primary_metric: str = pyd.Field(
        default="log_loss",
        description="Metric used to evaluate and rank model performance.",
    )
    timeout_minutes: t.Optional[int] = pyd.Field(
        default=None,
        description="Maximum time to wait for AutoML trials to complete.",
    )
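Per the AutoML reference linked in the docstring, these fields appear to line up one-to-one with arguments of databricks.automl.classify(). The call below is a hedged illustration of that mapping, not this repo's training code; the import only works on Databricks ML runtimes, and the toy dataframe is hypothetical:

from databricks import automl  # available on Databricks ML runtimes only
import pandas as pd

train_df = pd.DataFrame({"x": [0, 1, 0, 1], "target": [0, 1, 0, 1]})  # toy data
training_cfg = TrainingConfig(primary_metric="log_loss", timeout_minutes=10)

summary = automl.classify(
    dataset=train_df,
    target_col="target",
    primary_metric=training_cfg.primary_metric,
    timeout_minutes=training_cfg.timeout_minutes,
    exclude_cols=training_cfg.exclude_cols,
    exclude_frameworks=training_cfg.exclude_frameworks,
    time_col=training_cfg.time_col,
)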


class ModelingConfig(pyd.BaseModel):
    feature_selection: t.Optional[FeatureSelectionConfig] = None
    training: TrainingConfig


class InferenceConfig(pyd.BaseModel):
    num_top_features: int = pyd.Field(default=5)
Contributor: We may also want to add support_threshold here as an optional parameter.

Member Author: This is included with the trained model itself; see min_prob_pos_label.

    # TODO: extend this configuration, maybe?


class DatasetIOConfig(pyd.BaseModel):
    table_path: t.Optional[str] = pyd.Field(
        default=None,
        description=(
            "Path to a table in Unity Catalog where dataset is stored, "
            "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'"
        ),
    )
    file_path: t.Optional[str] = pyd.Field(
        default=None,
        description="Full, absolute path to dataset on disk, e.g. a Databricks Volume",
    )
    # TODO: if/when we allow different file formats, add this parameter ...
    # file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None)

    @pyd.model_validator(mode="after")
    def check_some_nonnull_inputs(self):
        if self.table_path is None and self.file_path is None:
            raise ValueError("table_path and/or file_path must be non-null")
        return self
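A quick behavioral note on the model validator above, with hypothetical paths: at least one of the two locations must be given (and both may be).

import pydantic

DatasetIOConfig(table_path="CATALOG.SCHEMA.TABLE_NAME")  # ok
DatasetIOConfig(file_path="/Volumes/CATALOG/SCHEMA/volume/data.csv")  # ok
try:
    DatasetIOConfig()  # neither path given
except pydantic.ValidationError as exc:
    print(exc)  # table_path and/or file_path must be non-null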


class DatasetConfig(pyd.BaseModel):
    raw_course: DatasetIOConfig
    raw_cohort: DatasetIOConfig
    preprocessed: t.Optional[DatasetIOConfig] = None
Contributor: Would preprocessed = training dataset?

Member Author: In the case of the labeled dataset, yes. For unlabeled data, it would be the dataset given to the model as input in order to produce predictions. (Does that have a name?)

    predictions: t.Optional[DatasetIOConfig] = None
    # TODO: do we want advisor-facing output data separate from "raw" predictions?
    # finalized: t.Optional[DatasetIOConfig] = None


class TrainedModelConfig(pyd.BaseModel):
    experiment_id: str
    run_id: str
    model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None
    min_prob_pos_label: t.Optional[float] = 0.5

    @pyd.computed_field  # type: ignore[misc]
    @property
    def mlflow_model_uri(self) -> str:
        return f"runs:/{self.run_id}/model"
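The computed mlflow_model_uri follows MLflow's standard runs:/<run_id>/model format, so it can be fed straight to an MLflow loader. A sketch, assuming the run logged an sklearn classifier under the "model" artifact path; the IDs and the features dataframe are hypothetical:

import mlflow

trained = TrainedModelConfig(experiment_id="EXPERIMENT_ID", run_id="RUN_ID")
print(trained.mlflow_model_uri)  # runs:/RUN_ID/model

model = mlflow.sklearn.load_model(trained.mlflow_model_uri)
# threshold positive-class probabilities at min_prob_pos_label
probs = model.predict_proba(features_df)[:, 1]  # features_df: hypothetical input
preds = probs >= trained.min_prob_pos_label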


class PDPProjectConfigV2(pyd.BaseModel):
    """Configuration (v2) schema for PDP SST projects."""

    institution_id: str = pyd.Field(
        ...,
        description=(
            "Unique (ASCII-only) identifier for institution; used in naming things "
            "such as source directories, catalog schemas, keys in shared configs, etc."
        ),
    )
    institution_name: str = pyd.Field(
        ...,
        description=(
            "Readable 'display' name for institution, distinct from the 'id'; "
            "probably just the school's 'official', public name"
        ),
    )

    # shared dataset parameters
    student_id_col: str = "student_guid"
    target_col: str = "target"
    split_col: str = "split"
    sample_weight_col: t.Optional[str] = None
    student_group_cols: t.Optional[list[str]] = pyd.Field(
        default=["student_age", "race", "ethnicity", "gender", "first_gen"],
        description=(
            "One or more column names in datasets containing student 'groups' "
            "to use for model bias assessment, but *not* as model features"
        ),
    )
    pred_col: str = "pred"
    pred_prob_col: str = "pred_prob"
    pos_label: t.Optional[int | bool | str] = True
    # other shared parameters
    random_state: t.Optional[int] = None

    # key artifacts produced by project pipeline
    datasets: t.Optional[dict[str, DatasetConfig]] = None
    trained_model: t.Optional[TrainedModelConfig] = None
    # key steps in project pipeline
    preprocessing: t.Optional[PreprocessingConfig] = None
    modeling: t.Optional[ModelingConfig] = None
    inference: t.Optional[InferenceConfig] = None

    # NOTE: this is for *pydantic* model -- not ML model -- configuration
    model_config = pyd.ConfigDict(extra="ignore", strict=True)

    @pyd.field_validator("institution_id", mode="after")
    @classmethod
    def check_institution_id_isascii(cls, value: str) -> str:
        if not re.search(r"^\w+$", value, flags=re.ASCII):
            raise ValueError(f"institution_id='{value}' is not ASCII-only")
        return value
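Because the check combines \w with re.ASCII, institution IDs are limited to [A-Za-z0-9_]. A small demonstration with made-up IDs:

import re

for candidate in ["midwest_cc_01", "midwest-cc", "collège"]:
    print(candidate, bool(re.search(r"^\w+$", candidate, flags=re.ASCII)))
# midwest_cc_01 True; midwest-cc False (hyphen); collège False (non-ASCII letter)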