Improve pdp project config structure + functionality #56

Merged · 3 commits · Jan 23, 2025
Changes from all commits
52 changes: 52 additions & 0 deletions notebooks/pdp/config-v2-TEMPLATE.toml
@@ -0,0 +1,52 @@
institution_id = "INST_ID"
institution_name = "INST NAME"

student_id_col = "student_guid"
target_col = "target"
split_col = "split"
sample_weight_col = "sample_weight"
student_group_cols = ["student_age", "race", "ethnicity", "gender", "first_gen"]
pred_col = "pred"
pred_prob_col = "pred_prob"
pos_label = true
random_state = 12345
Contributor: Do lines 4-12 need to go under modeling.training? Or are they meant to be used for more than just modeling.training?

Member Author: Yup! These get used in both preprocessing and model training.


[datasets.labeled]
raw_course = { file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" }
raw_cohort = { file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" }
preprocessed = { table_path = "CATALOG.SCHEMA.TABLE_NAME" }

[preprocessing]
splits = { train = 0.6, test = 0.2, validate = 0.2 }
sample_class_weight = "balanced"

[preprocessing.features]
min_passing_grade = 1.0
min_num_credits_full_time = 12
# NOTE: single quotes *required* here; it's TOML syntax for literal strings
course_level_pattern = 'asdf'
key_course_subject_areas = ["24", "51"]
key_course_ids = ["ENGL101", "MATH101"]

[preprocessing.target]
params = { min_num_credits_checkin = 30.0, min_num_credits_target = 60.0 }
student_criteria = { enrollment_type = "FIRST-TIME", credential_type_sought_year_1 = "Bachelor's Degree" }

[modeling.feature_selection]
incomplete_threshold = 0.5
low_variance_threshold = 0.0
collinear_threshold = 10.0

[modeling.training]
# exclude_frameworks = ["xgboost", "lightgbm"]
primary_metric = "log_loss"
timeout_minutes = 10

[trained_model]
experiment_id = "EXPERIMENT_ID"
run_id = "RUN_ID"
# model_type = "sklearn"
min_prob_pos_label = 0.5

[inference]
num_top_features = 5
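A filled-in copy of this template can be validated against the new v2 schema. The package also exposes load_config (see the __init__.py change below), but its signature isn't shown in this diff, so here is a minimal sketch using pydantic directly; it assumes Python 3.11+ for tomllib:

import tomllib

from student_success_tool.configs import PDPProjectConfigV2

# parse the TOML template into a plain dict, then validate it against the
# v2 pydantic schema; unknown keys are ignored, per extra="ignore" below
with open("notebooks/pdp/config-v2-TEMPLATE.toml", "rb") as f:
    raw_cfg = tomllib.load(f)

cfg = PDPProjectConfigV2.model_validate(raw_cfg)
print(cfg.institution_id, cfg.preprocessing.splits)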
1 change: 1 addition & 0 deletions src/student_success_tool/configs/__init__.py
@@ -1,2 +1,3 @@
from .load import load_config
from .schemas.pdp import PDPProjectConfig
from .schemas.pdp_v2 import PDPProjectConfigV2
249 changes: 249 additions & 0 deletions src/student_success_tool/configs/schemas/pdp_v2.py
@@ -0,0 +1,249 @@
import re
import typing as t

import pydantic as pyd

from ...analysis.pdp import constants


class FeaturesConfig(pyd.BaseModel):
    min_passing_grade: float = pyd.Field(
        default=constants.DEFAULT_MIN_PASSING_GRADE,
        description="Minimum numeric grade considered by institution as 'passing'",
        gt=0.0,
        lt=4.0,
    )
    min_num_credits_full_time: float = pyd.Field(
        default=constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME,
        description=(
            "Minimum number of credits *attempted* per term for a student's "
            "enrollment intensity to be considered 'full-time'."
        ),
        gt=0.0,
        lt=20.0,
    )
    course_level_pattern: str = pyd.Field(
        default=constants.DEFAULT_COURSE_LEVEL_PATTERN,
        description=(
            "Regular expression pattern that extracts a course's 'level' "
            "from a PDP course_number field"
        ),
    )
    peak_covid_terms: set[tuple[str, str]] = pyd.Field(
        default=constants.DEFAULT_PEAK_COVID_TERMS,
        description=(
            "Set of (academic year, academic term) pairs considered by institution "
            "as 'peak' COVID, for use in control variables to account for pandemic effects"
        ),
    )
    key_course_subject_areas: t.Optional[list[str]] = pyd.Field(
        default=None,
        description=(
            "One or more course subject areas (formatted as 2-digit CIP codes) "
            "for which custom features should be computed"
        ),
    )
    key_course_ids: t.Optional[list[str]] = pyd.Field(
        default=None,
        description=(
            "One or more course ids (formatted as '[COURSE_PREFIX][COURSE_NUMBER]') "
            "for which custom features should be computed"
        ),
    )


class TargetConfig(pyd.BaseModel):
    student_criteria: dict[str, object] = pyd.Field(
        default_factory=dict,
        description=(
            "Column name in modeling dataset mapped to one or more values that it must equal "
            "in order for the corresponding student to be considered 'eligible'. "
            "Multiple criteria are combined with a logical 'AND'."
        ),
    )
    # TODO: refine target functionality and expand on this configuration
    params: dict[str, object] = pyd.Field(default_factory=dict)
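To make the 'AND' semantics concrete, here's a hypothetical sketch of how student_criteria could filter a modeling dataset with pandas; the dataframe and helper logic are illustrative, not this package's actual preprocessing code:

import pandas as pd

# hypothetical modeling data; columns mirror the template's student_criteria
df = pd.DataFrame(
    {
        "student_guid": ["a", "b", "c"],
        "enrollment_type": ["FIRST-TIME", "TRANSFER-IN", "FIRST-TIME"],
        "credential_type_sought_year_1": [
            "Bachelor's Degree",
            "Bachelor's Degree",
            "Associate's Degree",
        ],
    }
)
criteria = {
    "enrollment_type": "FIRST-TIME",
    "credential_type_sought_year_1": "Bachelor's Degree",
}

# each criterion may be a single value or a list of values; all must hold (logical AND)
mask = pd.Series(True, index=df.index)
for col, value in criteria.items():
    allowed = value if isinstance(value, list) else [value]
    mask &= df[col].isin(allowed)

eligible = df.loc[mask]  # only student "a" meets both criteria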


class PreprocessingConfig(pyd.BaseModel):
    features: FeaturesConfig
    target: TargetConfig
    splits: dict[t.Literal["train", "test", "validate"], float] = pyd.Field(
        default={"train": 0.6, "test": 0.2, "validate": 0.2},
        description=(
            "Mapping of name to fraction of the full dataset belonging to a given 'split', "
            "which is a randomized subset used for different parts of the modeling process"
        ),
    )
    sample_class_weight: t.Optional[t.Literal["balanced"] | dict[object, int]] = (
        pyd.Field(
            default=None,
            description=(
                "Weights associated with classes in the form ``{class_label: weight}`` "
                "or 'balanced' to automatically adjust weights inversely proportional "
                "to class frequencies in the input data. "
                "If null (default), then sample weights are not computed."
            ),
        )
    )

    @pyd.field_validator("splits", mode="after")
    @classmethod
    def check_split_fractions(cls, value: dict) -> dict:
        if (sum_fracs := sum(value.values())) != 1.0:
            # raise ValueError, not pyd.ValidationError: pydantic wraps ValueErrors
            # raised in validators; ValidationError can't be built from a bare string
            raise ValueError(
                f"split fractions must sum up to 1.0, but input sums up to {sum_fracs}"
            )
        return value
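A usage note on the validator above, with hypothetical values: any splits mapping whose fractions don't sum to exactly 1.0 is rejected at validation time.

import pydantic

try:
    PreprocessingConfig(
        features=FeaturesConfig(),
        target=TargetConfig(),
        splits={"train": 0.5, "test": 0.25, "validate": 0.2},  # sums to 0.95
    )
except pydantic.ValidationError as exc:
    print(exc)  # split fractions must sum up to 1.0, but input sums up to 0.95

Since the check uses exact float equality, fraction triples that only approximately sum to 1.0 due to rounding would also be rejected; math.isclose(sum_fracs, 1.0) would be a more forgiving alternative.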


class FeatureSelectionConfig(pyd.BaseModel):
    """
    See Also:
        - :func:`modeling.feature_selection.select_features()`
    """

    non_feature_cols: t.Optional[list[str]] = None
    force_include_cols: t.Optional[list[str]] = None
    incomplete_threshold: float = 0.5
    low_variance_threshold: float = 0.0
    collinear_threshold: t.Optional[float] = 10.0


class TrainingConfig(pyd.BaseModel):
    """
    References:
        - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify
    """

    exclude_cols: t.Optional[list[str]] = pyd.Field(
        default=None,
        description="One or more column names in dataset to exclude from training.",
    )
    time_col: t.Optional[str] = pyd.Field(
Contributor: This is interesting, using a chronology column. How is this typically used?

Member Author: AFAIK we haven't used this configuration in our models, but we could. It's supported by AutoML (see the reference link), so it's included for completeness.

        default=None,
        description=(
            "Column name in dataset used to split train/test/validate sets chronologically, "
            "as an alternative to the randomized assignment in ``split_col``."
        ),
    )
    exclude_frameworks: t.Optional[list[str]] = pyd.Field(
        default=None,
        description="List of algorithm frameworks that AutoML excludes from training.",
    )
    primary_metric: str = pyd.Field(
        default="log_loss",
        description="Metric used to evaluate and rank model performance.",
    )
    timeout_minutes: t.Optional[int] = pyd.Field(
        default=None,
        description="Maximum time to wait for AutoML trials to complete.",
    )
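Per the AutoML reference linked in the docstring, these fields appear to line up one-to-one with arguments of databricks.automl.classify(). The call below is a hedged illustration of that mapping, not this repo's training code; the import only works on Databricks ML runtimes, and the toy dataframe is hypothetical:

from databricks import automl  # available on Databricks ML runtimes only
import pandas as pd

train_df = pd.DataFrame({"x": [0, 1, 0, 1], "target": [0, 1, 0, 1]})  # toy data
training_cfg = TrainingConfig(primary_metric="log_loss", timeout_minutes=10)

summary = automl.classify(
    dataset=train_df,
    target_col="target",
    primary_metric=training_cfg.primary_metric,
    timeout_minutes=training_cfg.timeout_minutes,
    exclude_cols=training_cfg.exclude_cols,
    exclude_frameworks=training_cfg.exclude_frameworks,
    time_col=training_cfg.time_col,
)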


class ModelingConfig(pyd.BaseModel):
    feature_selection: t.Optional[FeatureSelectionConfig] = None
    training: TrainingConfig


class InferenceConfig(pyd.BaseModel):
    num_top_features: int = pyd.Field(default=5)
Contributor: We may also want to add support_threshold here as an optional parameter.

Member Author: This is included with the trained model itself; see min_prob_pos_label.

    # TODO: extend this configuration, maybe?


class DatasetIOConfig(pyd.BaseModel):
    table_path: t.Optional[str] = pyd.Field(
        default=None,
        description=(
            "Path to a table in Unity Catalog where dataset is stored, "
            "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'"
        ),
    )
    file_path: t.Optional[str] = pyd.Field(
        default=None,
        description="Full, absolute path to dataset on disk, e.g. a Databricks Volume",
    )
    # TODO: if/when we allow different file formats, add this parameter ...
    # file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None)

    @pyd.model_validator(mode="after")
    def check_some_nonnull_inputs(self):
        if self.table_path is None and self.file_path is None:
            raise ValueError("table_path and/or file_path must be non-null")
        return self
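A quick behavioral note on the model validator above, with hypothetical paths: at least one of the two locations must be given (and both may be).

import pydantic

DatasetIOConfig(table_path="CATALOG.SCHEMA.TABLE_NAME")  # ok
DatasetIOConfig(file_path="/Volumes/CATALOG/SCHEMA/volume/data.csv")  # ok
try:
    DatasetIOConfig()  # neither path given
except pydantic.ValidationError as exc:
    print(exc)  # table_path and/or file_path must be non-null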


class DatasetConfig(pyd.BaseModel):
    raw_course: DatasetIOConfig
    raw_cohort: DatasetIOConfig
    preprocessed: t.Optional[DatasetIOConfig] = None
Contributor: Would preprocessed = training dataset?

Member Author: In the case of the labeled dataset, yes. For unlabeled data, it would be the dataset given to the model as input in order to produce predictions. (Does that have a name?)

    predictions: t.Optional[DatasetIOConfig] = None
    # TODO: do we want advisor-facing output data separate from "raw" predictions?
    # finalized: t.Optional[DatasetIOConfig] = None


class TrainedModelConfig(pyd.BaseModel):
    experiment_id: str
    run_id: str
    model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None
    min_prob_pos_label: t.Optional[float] = 0.5

    @pyd.computed_field  # type: ignore[misc]
    @property
    def mlflow_model_uri(self) -> str:
        return f"runs:/{self.run_id}/model"
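The computed mlflow_model_uri follows MLflow's standard runs:/<run_id>/model format, so it can be fed straight to an MLflow loader. A sketch, assuming the run logged an sklearn classifier under the "model" artifact path; the IDs and the features dataframe are hypothetical:

import mlflow

trained = TrainedModelConfig(experiment_id="EXPERIMENT_ID", run_id="RUN_ID")
print(trained.mlflow_model_uri)  # runs:/RUN_ID/model

model = mlflow.sklearn.load_model(trained.mlflow_model_uri)
# threshold positive-class probabilities at min_prob_pos_label
probs = model.predict_proba(features_df)[:, 1]  # features_df: hypothetical input
preds = probs >= trained.min_prob_pos_label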


class PDPProjectConfigV2(pyd.BaseModel):
    """Configuration (v2) schema for PDP SST projects."""

    institution_id: str = pyd.Field(
        ...,
        description=(
            "Unique (ASCII-only) identifier for institution; used in naming things "
            "such as source directories, catalog schemas, keys in shared configs, etc."
        ),
    )
    institution_name: str = pyd.Field(
        ...,
        description=(
            "Readable 'display' name for institution, distinct from the 'id'; "
            "probably just the school's 'official', public name"
        ),
    )

    # shared dataset parameters
    student_id_col: str = "student_guid"
    target_col: str = "target"
    split_col: str = "split"
    sample_weight_col: t.Optional[str] = None
    student_group_cols: t.Optional[list[str]] = pyd.Field(
        default=["student_age", "race", "ethnicity", "gender", "first_gen"],
        description=(
            "One or more column names in datasets containing student 'groups' "
            "to use for model bias assessment, but *not* as model features"
        ),
    )
    pred_col: str = "pred"
    pred_prob_col: str = "pred_prob"
    pos_label: t.Optional[int | bool | str] = True
    # other shared parameters
    random_state: t.Optional[int] = None

    # key artifacts produced by project pipeline
    datasets: t.Optional[dict[str, DatasetConfig]] = None
    trained_model: t.Optional[TrainedModelConfig] = None
    # key steps in project pipeline
    preprocessing: t.Optional[PreprocessingConfig] = None
    modeling: t.Optional[ModelingConfig] = None
    inference: t.Optional[InferenceConfig] = None

    # NOTE: this is for *pydantic* model -- not ML model -- configuration
    model_config = pyd.ConfigDict(extra="ignore", strict=True)

    @pyd.field_validator("institution_id", mode="after")
    @classmethod
    def check_institution_id_isascii(cls, value: str) -> str:
        if not re.search(r"^\w+$", value, flags=re.ASCII):
            raise ValueError(f"institution_id='{value}' is not ASCII-only")
        return value
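Because the check combines \w with re.ASCII, institution IDs are limited to [A-Za-z0-9_]. A small demonstration with made-up IDs:

import re

for candidate in ["midwest_cc_01", "midwest-cc", "collège"]:
    print(candidate, bool(re.search(r"^\w+$", candidate, flags=re.ASCII)))
# midwest_cc_01 True; midwest-cc False (hyphen); collège False (non-ASCII letter)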