Improve pdp project config structure + functionality #56
@@ -0,0 +1,52 @@
institution_id = "INST_ID"
institution_name = "INST NAME"

student_id_col = "student_guid"
target_col = "target"
split_col = "split"
sample_weight_col = "sample_weight"
student_group_cols = ["student_age", "race", "ethnicity", "gender", "first_gen"]
pred_col = "pred"
pred_prob_col = "pred_prob"
pos_label = true
random_state = 12345

[datasets.labeled]
raw_course = { file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COURSE.csv" }
raw_cohort = { file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/FILE_NAME_COHORT.csv" }
preprocessed = { table_path = "CATALOG.SCHEMA.TABLE_NAME" }

[preprocessing]
splits = { train = 0.6, test = 0.2, validate = 0.2 }
sample_class_weight = "balanced"

[preprocessing.features]
min_passing_grade = 1.0
min_num_credits_full_time = 12
# NOTE: single quotes *required* here; it's TOML syntax for literal strings
course_level_pattern = 'asdf'
key_course_subject_areas = ["24", "51"]
key_course_ids = ["ENGL101", "MATH101"]

[preprocessing.target]
params = { min_num_credits_checkin = 30.0, min_num_credits_target = 60.0 }
student_criteria = { enrollment_type = "FIRST-TIME", credential_type_sought_year_1 = "Bachelor's Degree" }

[modeling.feature_selection]
incomplete_threshold = 0.5
low_variance_threshold = 0.0
collinear_threshold = 10.0

[modeling.training]
# exclude_frameworks = ["xgboost", "lightgbm"]
primary_metric = "log_loss"
timeout_minutes = 10

[trained_model]
experiment_id = "EXPERIMENT_ID"
run_id = "RUN_ID"
# model_type = "sklearn"
min_prob_pos_label = 0.5

[inference]
num_top_features = 5
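
For orientation, here is a minimal sketch of how a template like this could be loaded and validated against the new schema introduced below. The file name and import path are assumptions; the repo's own load_config helper may handle this differently.

import tomllib  # stdlib in Python 3.11+

import pydantic

# assumed import path for the schema added in this PR
from student_success_tool.configs import PDPProjectConfigV2

with open("template.toml", "rb") as f:  # hypothetical file name
    raw = tomllib.load(f)

try:
    cfg = PDPProjectConfigV2.model_validate(raw)
except pydantic.ValidationError as exc:
    print(exc)  # e.g. split fractions that don't sum to 1.0 are rejected
else:
    print(cfg.institution_id, cfg.preprocessing.splits)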
@@ -1,2 +1,3 @@
from .load import load_config
from .schemas.pdp import PDPProjectConfig
from .schemas.pdp_v2 import PDPProjectConfigV2
@@ -0,0 +1,249 @@
import re
import typing as t

import pydantic as pyd

from ...analysis.pdp import constants


class FeaturesConfig(pyd.BaseModel):
    min_passing_grade: float = pyd.Field(
        default=constants.DEFAULT_MIN_PASSING_GRADE,
        description="Minimum numeric grade considered by institution as 'passing'",
        gt=0.0,
        lt=4.0,
    )
    min_num_credits_full_time: float = pyd.Field(
        default=constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME,
        description=(
            "Minimum number of credits *attempted* per term for a student's "
            "enrollment intensity to be considered 'full-time'."
        ),
        gt=0.0,
        lt=20.0,
    )
    course_level_pattern: str = pyd.Field(
        default=constants.DEFAULT_COURSE_LEVEL_PATTERN,
        description=(
            "Regular expression pattern that extracts a course's 'level' "
            "from a PDP course_number field"
        ),
    )
    peak_covid_terms: set[tuple[str, str]] = pyd.Field(
        default=constants.DEFAULT_PEAK_COVID_TERMS,
        description=(
            "Set of (academic year, academic term) pairs considered by institution "
            "as 'peak' COVID, for use in control variables to account for pandemic effects"
        ),
    )
    key_course_subject_areas: t.Optional[list[str]] = pyd.Field(
        default=None,
        description=(
            "One or more course subject areas (formatted as 2-digit CIP codes) "
            "for which custom features should be computed"
        ),
    )
    key_course_ids: t.Optional[list[str]] = pyd.Field(
        default=None,
        description=(
            "One or more course ids (formatted as '[COURSE_PREFIX][COURSE_NUMBER]') "
            "for which custom features should be computed"
        ),
    )
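
The gt/lt bounds above mean a malformed config fails fast at validation time; a quick illustrative check (not code from this PR):

import pydantic

try:
    FeaturesConfig(min_passing_grade=5.0)  # out of the declared (0.0, 4.0) range
except pydantic.ValidationError as exc:
    print(exc.errors()[0]["msg"])  # e.g. "Input should be less than 4"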


class TargetConfig(pyd.BaseModel):
    student_criteria: dict[str, object] = pyd.Field(
        default_factory=dict,
        description=(
            "Column name in modeling dataset mapped to one or more values that it must equal "
            "in order for the corresponding student to be considered 'eligible'. "
            "Multiple criteria are combined with a logical 'AND'."
        ),
    )
    # TODO: refine target functionality and expand on this configuration
    params: dict[str, object] = pyd.Field(default_factory=dict)
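
To make the 'AND' semantics concrete, here is a hypothetical sketch of how student_criteria might be applied to a modeling dataset; select_eligible_students is illustrative, not a function from this PR.

import pandas as pd

def select_eligible_students(df: pd.DataFrame, student_criteria: dict) -> pd.DataFrame:
    # each criterion maps a column name to one value (or several allowed values);
    # a row must satisfy every criterion (logical AND) to count as 'eligible'
    mask = pd.Series(True, index=df.index)
    for col, allowed in student_criteria.items():
        allowed = allowed if isinstance(allowed, (list, tuple, set)) else [allowed]
        mask &= df[col].isin(allowed)
    return df.loc[mask]

# df: the modeling dataset (assumed to exist)
eligible = select_eligible_students(
    df,
    {"enrollment_type": "FIRST-TIME", "credential_type_sought_year_1": "Bachelor's Degree"},
)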


class PreprocessingConfig(pyd.BaseModel):
    features: FeaturesConfig
    target: TargetConfig
    splits: dict[t.Literal["train", "test", "validate"], float] = pyd.Field(
        default={"train": 0.6, "test": 0.2, "validate": 0.2},
        description=(
            "Mapping of name to fraction of the full dataset belonging to a given 'split', "
            "which is a randomized subset used for different parts of the modeling process"
        ),
    )
    sample_class_weight: t.Optional[t.Literal["balanced"] | dict[object, int]] = (
        pyd.Field(
            default=None,
            description=(
                "Weights associated with classes in the form ``{class_label: weight}`` "
                "or 'balanced' to automatically adjust weights inversely proportional "
                "to class frequencies in the input data. "
                "If null (default), then sample weights are not computed."
            ),
        )
    )

    @pyd.field_validator("splits", mode="after")
    @classmethod
    def check_split_fractions(cls, value: dict) -> dict:
        if (sum_fracs := sum(value.values())) != 1.0:
            # NOTE: raise ValueError here and let pydantic wrap it in a
            # ValidationError; pyd.ValidationError can't be raised directly
            # with a plain message
            raise ValueError(
                f"split fractions must sum up to 1.0, but input sums up to {sum_fracs}"
            )
        return value
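
sample_class_weight mirrors scikit-learn's class_weight convention, so one plausible implementation (an assumption, not code from this PR) leans on sklearn's compute_sample_weight:

import pandas as pd
from sklearn.utils.class_weight import compute_sample_weight

df = pd.DataFrame({"target": [True, True, True, False]})
# "balanced" => n_samples / (n_classes * class_count), assigned per row
df["sample_weight"] = compute_sample_weight("balanced", df["target"])
print(df["sample_weight"].round(3).tolist())  # [0.667, 0.667, 0.667, 2.0]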


class FeatureSelectionConfig(pyd.BaseModel):
    """
    See Also:
        - :func:`modeling.feature_selection.select_features()`
    """

    non_feature_cols: t.Optional[list[str]] = None
    force_include_cols: t.Optional[list[str]] = None
    incomplete_threshold: float = 0.5
    low_variance_threshold: float = 0.0
    collinear_threshold: t.Optional[float] = 10.0


class TrainingConfig(pyd.BaseModel):
    """
    References:
        - https://docs.databricks.com/en/machine-learning/automl/automl-api-reference.html#classify
    """

    exclude_cols: t.Optional[list[str]] = pyd.Field(
        default=None,
        description="One or more column names in dataset to exclude from training.",
    )
    time_col: t.Optional[str] = pyd.Field(
        default=None,
        description=(
            "Column name in dataset used to split train/test/validate sets chronologically, "
            "as an alternative to the randomized assignment in ``split_col``."
        ),
    )

[Review comment on time_col] This is interesting using a chronology column. How is this typically used?
[Reply] AFAIK we haven't used this configuration in our models, but we could. It's supported by AutoML (see the reference link), so included for completeness.

    exclude_frameworks: t.Optional[list[str]] = pyd.Field(
        default=None,
        description="List of algorithm frameworks that AutoML excludes from training.",
    )
    primary_metric: str = pyd.Field(
        default="log_loss",
        description="Metric used to evaluate and rank model performance.",
    )
    timeout_minutes: t.Optional[int] = pyd.Field(
        default=None,
        description="Maximum time to wait for AutoML trials to complete.",
    )
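
These fields line up with keyword arguments of databricks.automl.classify() (see the reference link in the docstring); how they're wired together below is an assumption for illustration, with train_df and cfg as hypothetical stand-ins.

from databricks import automl

training_cfg = cfg.modeling.training  # cfg: a validated PDPProjectConfigV2 (assumed)
summary = automl.classify(
    dataset=train_df,  # train_df: prepared modeling dataset (assumed)
    target_col=cfg.target_col,
    pos_label=cfg.pos_label,
    time_col=training_cfg.time_col,
    exclude_cols=training_cfg.exclude_cols or [],
    exclude_frameworks=training_cfg.exclude_frameworks,
    primary_metric=training_cfg.primary_metric,
    timeout_minutes=training_cfg.timeout_minutes,
)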


class ModelingConfig(pyd.BaseModel):
    feature_selection: t.Optional[FeatureSelectionConfig] = None
    training: TrainingConfig


class InferenceConfig(pyd.BaseModel):
    num_top_features: int = pyd.Field(default=5)
    # TODO: extend this configuration, maybe?

[Review comment on num_top_features] We may also want to add support_threshold here as an optional parameter.
[Reply] This is included with the trained model itself, see


class DatasetIOConfig(pyd.BaseModel):
    table_path: t.Optional[str] = pyd.Field(
        default=None,
        description=(
            "Path to a table in Unity Catalog where dataset is stored, "
            "including the full three-level namespace: 'CATALOG.SCHEMA.TABLE'"
        ),
    )
    file_path: t.Optional[str] = pyd.Field(
        default=None,
        description="Full, absolute path to dataset on disk, e.g. a Databricks Volume",
    )
    # TODO: if/when we allow different file formats, add this parameter ...
    # file_format: t.Optional[t.Literal["csv", "parquet"]] = pyd.Field(default=None)

    @pyd.model_validator(mode="after")
    def check_some_nonnull_inputs(self):
        if self.table_path is None and self.file_path is None:
            raise ValueError("table_path and/or file_path must be non-null")
        return self
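
The validator above guarantees downstream IO always has something to read; a hypothetical dispatch (read_dataset is illustrative, not from this PR) might look like:

def read_dataset(spark, cfg: DatasetIOConfig):
    # prefer the Unity Catalog table when both paths are configured
    if cfg.table_path is not None:
        return spark.read.table(cfg.table_path)
    # per the TODO above, only CSV files are assumed for now
    return spark.read.format("csv").option("header", "true").load(cfg.file_path)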


class DatasetConfig(pyd.BaseModel):
    raw_course: DatasetIOConfig
    raw_cohort: DatasetIOConfig
    preprocessed: t.Optional[DatasetIOConfig] = None
    predictions: t.Optional[DatasetIOConfig] = None
    # TODO: do we want advisor-facing output data separate from "raw" predictions?
    # finalized: t.Optional[DatasetIOConfig] = None

[Review comment on preprocessed] would preprocessed = training dataset?
[Reply] In the case of the labeled dataset, yes. For unlabeled data, it would be the dataset to be given to the model as input in order to produce predictions. (Does that have a name?)


class TrainedModelConfig(pyd.BaseModel):
    experiment_id: str
    run_id: str
    model_type: t.Optional[t.Literal["sklearn", "xgboost", "lightgbm"]] = None
    min_prob_pos_label: t.Optional[float] = 0.5

    @pyd.computed_field  # type: ignore[misc]
    @property
    def mlflow_model_uri(self) -> str:
        return f"runs:/{self.run_id}/model"
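
The computed URI plugs directly into MLflow's loaders; a sketch of inference, under the assumption that the model's predict output is the positive-class probability:

import mlflow

model = mlflow.pyfunc.load_model(trained_model_cfg.mlflow_model_uri)
pred_probs = model.predict(features_df)  # features_df: inference inputs (assumed)
# threshold probabilities into hard labels using min_prob_pos_label
pred_labels = pred_probs >= trained_model_cfg.min_prob_pos_label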


class PDPProjectConfigV2(pyd.BaseModel):
    """Configuration (v2) schema for PDP SST projects."""

    institution_id: str = pyd.Field(
        ...,
        description=(
            "Unique (ASCII-only) identifier for institution; used in naming things "
            "such as source directories, catalog schemas, keys in shared configs, etc."
        ),
    )
    institution_name: str = pyd.Field(
        ...,
        description=(
            "Readable 'display' name for institution, distinct from the 'id'; "
            "probably just the school's 'official', public name"
        ),
    )

    # shared dataset parameters
    student_id_col: str = "student_guid"
    target_col: str = "target"
    split_col: str = "split"
    sample_weight_col: t.Optional[str] = None
    student_group_cols: t.Optional[list[str]] = pyd.Field(
        default=["student_age", "race", "ethnicity", "gender", "first_gen"],
        description=(
            "One or more column names in datasets containing student 'groups' "
            "to use for model bias assessment, but *not* as model features"
        ),
    )
    pred_col: str = "pred"
    pred_prob_col: str = "pred_prob"
    pos_label: t.Optional[int | bool | str] = True
    # other shared parameters
    random_state: t.Optional[int] = None

    # key artifacts produced by project pipeline
    datasets: t.Optional[dict[str, DatasetConfig]] = None
    trained_model: t.Optional[TrainedModelConfig] = None
    # key steps in project pipeline
    preprocessing: t.Optional[PreprocessingConfig] = None
    modeling: t.Optional[ModelingConfig] = None
    inference: t.Optional[InferenceConfig] = None

    # NOTE: this is for *pydantic* model -- not ML model -- configuration
    model_config = pyd.ConfigDict(extra="ignore", strict=True)

    @pyd.field_validator("institution_id", mode="after")
    @classmethod
    def check_institution_id_isascii(cls, value: str) -> str:
        if not re.search(r"^\w+$", value, flags=re.ASCII):
            raise ValueError(f"institution_id='{value}' is not ASCII-only")
        return value
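
Since every pipeline section is optional, a config can start minimal and grow as artifacts are produced; for instance:

cfg = PDPProjectConfigV2(institution_id="inst_id", institution_name="INST NAME")
print(cfg.datasets, cfg.trained_model)  # None None -- filled in as the pipeline runs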
[Review comment on the config template] Do lines 4-12 need to go under modeling.training? Or are they meant to be used for more than just modeling.training?
[Reply] Yup! These get used in both preprocessing and model training.