Add better (v2) pdp project config
bdewilde committed Jan 18, 2025
1 parent 863a075 commit b07569c
Showing 2 changed files with 195 additions and 1 deletion.
2 changes: 1 addition & 1 deletion src/student_success_tool/configs/__init__.py
@@ -1,2 +1,2 @@
 from .load import load_config
-from .schemas.pdp import PDPProjectConfig
+from .schemas.pdp import PDPProjectConfig, PDPProjectConfigV2
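
With this change, the v2 schema is exported alongside the existing v1 schema, so downstream code can opt in incrementally. A minimal sketch of the resulting import surface (illustrative only, assuming the package is importable as student_success_tool):

from student_success_tool.configs import (
    load_config,
    PDPProjectConfig,  # existing v1 schema, unchanged
    PDPProjectConfigV2,  # v2 schema added in this commit
)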
194 changes: 194 additions & 0 deletions src/student_success_tool/configs/schemas/pdp_v2.py
@@ -0,0 +1,194 @@
import typing as t

import pydantic as pyd

from ...analysis.pdp import constants


class FeaturesConfig(pyd.BaseModel):
min_passing_grade: float = pyd.Field(
default=constants.DEFAULT_MIN_PASSING_GRADE,
description="Minimum numeric grade considered by institution as 'passing'",
gt=0.0,
lt=4.0,
)
min_num_credits_full_time: float = pyd.Field(
default=constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME,
description=(
"Minimum number of credits *attempted* per term for a student's "
"enrollment intensity to be considered 'full-time'."
),
gt=0.0,
lt=20.0,
)
course_level_pattern: str = pyd.Field(
default=constants.DEFAULT_COURSE_LEVEL_PATTERN,
description=(
"Regular expression patttern that extracts a course's 'level' "
"from a PDP course_number field"
),
)
peak_covid_terms: set[tuple[str, str]] = pyd.Field(
default=constants.DEFAULT_PEAK_COVID_TERMS,
description=(
"Set of (academic year, academic term) pairs considered by institution "
"as 'peak' COVID, for use in control variables to account for pandemic effects"
),
)
key_course_subject_areas: t.Optional[list[str]] = pyd.Field(
default=None,
description=(
"One or more course subject areas (formatted as 2-digit CIP codes) "
"for which custom features should be computed"
),
)
key_course_ids: t.Optional[list[str]] = pyd.Field(
default=None,
description=(
"One or more course ids (formatted as '[COURSE_PREFIX][COURSE_NUMBER]') "
"for which custom features should be computed"
),
)


class TargetConfig(pyd.BaseModel):
student_criteria: dict[str, object] = pyd.Field(
default_factory=dict,
description=(
"Column name in modeling dataset mapped to one or more values that it must equal "
"in order for the corresponding student to be considered 'eligible'. "
"Multiple criteria are combined with a logical 'AND'."
),
)
# TODO: refine target functionality and expand on this configuration


class PreprocessingConfig(pyd.BaseModel):
features: FeaturesConfig
target: TargetConfig
splits: dict[t.Literal["train", "test", "validate"], float] = pyd.Field(
default={"train": 0.6, "test": 0.2, "validate": 0.2},
description=(
"Mapping of name to fraction of the full datset belonging to a given 'split', "
"which is a randomized subset used for different parts of the modeling process"
),
)
sample_class_weight: t.Optional[t.Literal["balanced"] | dict[object, int]] = (
pyd.Field(
default=None,
description=(
"Weights associated with classes in the form ``{class_label: weight}`` "
"or 'balanced' to automatically adjust weights inversely proportional "
"to class frequencies in the input data. "
"If null (default), then sample weights are not computed."
),
)
)

@pyd.field_validator("splits", mode="after")
@classmethod
def check_split_fractions(cls, value: dict) -> dict:
        if (sum_fracs := sum(value.values())) != 1.0:
            # raise a plain ValueError here; pydantic wraps it into a
            # ValidationError with field context during model validation
            raise ValueError(
                f"split fractions must sum up to 1.0, but input sums up to {sum_fracs}"
            )
return value


class TrainingConfig(pyd.BaseModel):
"""
References:
- https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify
"""

student_group_cols: t.Optional[list[str]] = pyd.Field(
default=None,
description=(
"One or more column names in dataset containing student 'groups' "
"to use for model bias assessment, but NOT as model features"
),
)
exclude_cols: t.Optional[list[str]] = pyd.Field(
default=None,
description="One or more column names in dataset to exclude from training.",
)
time_col: t.Optional[str] = pyd.Field(
default=None,
description=(
"Column name in dataset used to split train/test/validate sets chronologically, "
"as an alternative to the randomized assignment in ``split_col`` ."
),
)
exclude_frameworks: t.Optional[list[str]] = pyd.Field(
default=None,
description="List of algorithm frameworks that AutoML excludes from training.",
)
primary_metric: str = pyd.Field(
default="log_loss",
description="Metric used to evaluate and rank model performance.",
)
timeout_minutes: t.Optional[int] = pyd.Field(
default=None,
description="Maximum time to wait for AutoML trials to complete.",
)


class InferenceConfig(pyd.BaseModel):
num_top_features: int = pyd.Field(default=5)


class DatasetConfig(pyd.BaseModel):
    table_path: t.Optional[str] = pyd.Field(
        default=None,
description=(
"Path to a table in Unity Catalog where dataset is stored, "
"including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'"
),
)
    file_path: t.Optional[str] = pyd.Field(
        default=None,
description="Full, absolute path to dataset on disk, e.g. a Databricks Volume",
)
# TODO: if/when we allow different file formats, add this parameter ...
# file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None)


class DatasetsConfig(pyd.BaseModel):
raw: DatasetConfig
    preprocessed: t.Optional[DatasetConfig] = None
    predictions: t.Optional[DatasetConfig] = None


class TrainedModelConfig(pyd.BaseModel):
experiment_id: str
run_id: str
model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None
min_prob_pos_label: t.Optional[float] = 0.5


class PDPProjectConfigV2(pyd.BaseModel):
"""Configuration (v2) schema for PDP SST projects."""

institution_id: str
institution_name: str

# shared dataset parameters
student_id_col: str = "student_guid"
target_col: str = "target"
split_col: str = "split"
sample_weight_col: t.Optional[str] = None
pos_label: t.Optional[int | bool | str] = True
pred_col: str = "pred"
pred_prob_col: str = "pred_prob"
# other shared parameters
random_state: t.Optional[int] = None

labeled_dataset: DatasetsConfig
trained_model: t.Optional[TrainedModelConfig] = None

preprocessing: t.Optional[PreprocessingConfig] = None
training: t.Optional[TrainingConfig] = None
    inference: t.Optional[InferenceConfig] = None

# NOTE: this is for *pydantic* model -- not ML model -- configuration
model_config = pyd.ConfigDict(extra="ignore", strict=True)
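
A minimal usage sketch (not part of the commit) showing how the v2 config might be constructed directly in Python; the institution and dataset values are illustrative assumptions, and nested configs are passed as model instances:

from student_success_tool.configs import PDPProjectConfigV2
from student_success_tool.configs.schemas.pdp_v2 import (
    DatasetConfig,
    DatasetsConfig,
)

cfg = PDPProjectConfigV2(
    institution_id="inst_123",  # hypothetical institution id
    institution_name="Example Community College",  # hypothetical name
    labeled_dataset=DatasetsConfig(
        raw=DatasetConfig(
            table_path="catalog.schema.labeled_dataset",  # hypothetical UC table
            file_path=None,
        ),
    ),
)
assert cfg.student_id_col == "student_guid"  # shared dataset defaults apply

Invalid values fail fast at construction time; for example, split fractions that don't sum to 1.0 trip the ``check_split_fractions`` validator:

import pydantic

from student_success_tool.configs.schemas.pdp_v2 import (
    FeaturesConfig,
    PreprocessingConfig,
    TargetConfig,
)

try:
    PreprocessingConfig(
        features=FeaturesConfig(),  # all feature params fall back to defaults
        target=TargetConfig(),
        splits={"train": 0.5, "test": 0.5, "validate": 0.5},
    )
except pydantic.ValidationError as exc:
    print(exc)  # split fractions must sum up to 1.0, but input sums up to 1.5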
