[pdp] Add inference nb template #55

Closed
wants to merge 19 commits
128 changes: 77 additions & 51 deletions notebooks/pdp/01-data-assessment-eda-TEMPLATE.py
@@ -23,8 +23,11 @@

# COMMAND ----------

# install dependencies, most of which should come through our 1st-party SST package
# %pip install git+https://github.com/datakind/student-success-tool.git@develop
# install dependencies, most/all of which should come through our 1st-party SST package
# NOTE: it's okay to use 'develop' or a feature branch while developing this nb
# but when it's finished, it's best to pin to a specific version of the package
# %pip install "student-success-tool == 0.1.0"
# %pip install "git+https://github.com/datakind/student-success-tool.git@develop"

# COMMAND ----------

@@ -33,7 +36,6 @@
# COMMAND ----------

import logging
import os
import sys

import matplotlib.pyplot as plt
@@ -45,22 +47,23 @@
from databricks.sdk.runtime import dbutils

from student_success_tool.analysis import pdp
from student_success_tool import configs

# COMMAND ----------

logging.basicConfig(level=logging.INFO)
logging.basicConfig(level=logging.INFO, force=True)
logging.getLogger("py4j").setLevel(logging.WARNING) # ignore databricks logger

try:
    spark_session = DatabricksSession.builder.getOrCreate()
    spark = DatabricksSession.builder.getOrCreate()
except Exception:
    logging.warning("unable to create spark session; are you in a Databricks runtime?")
    pass

# COMMAND ----------

# MAGIC %md
# MAGIC ## `student-success-intervention` hacks
# MAGIC ## import school-specific code

# COMMAND ----------

@@ -69,33 +72,21 @@

# COMMAND ----------

# HACK: insert our 1st-party (school-specific) code into PATH
# insert our 1st-party (school-specific) code into PATH
if "../" not in sys.path:
sys.path.insert(1, "../")

# TODO: specify school's subpackage
# TODO: specify school's subpackage here
from analysis import * # noqa: F403

# COMMAND ----------

# MAGIC %md
# MAGIC ## unity catalog config

# COMMAND ----------

catalog = "sst_dev"

# configure where data is to be read from / written to
inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog
read_schema = f"{inst_name}_bronze"
write_schema = f"{inst_name}_silver"

path_volume = os.path.join(
    "/Volumes", catalog, read_schema, f"{inst_name}_bronze_file_volume"
)
path_table = f"{catalog}.{read_schema}"
print(f"{path_table=}")
print(f"{path_volume=}")
# project configuration should be stored in a config file in TOML format
# it'll start out with just basic info: institution_id, institution_name
# but as each step of the pipeline gets built, more parameters will be moved
# from hard-coded notebook variables to shareable, persistent config fields
cfg = configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2)
cfg
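
# COMMAND ----------

# purely illustrative: a minimal sketch of what config-v2-TEMPLATE.toml might contain
# at this early stage, i.e. just the basic info noted above (values are placeholders)
# institution_id = "INST_NAME"
# institution_name = "School Name"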

# COMMAND ----------

@@ -109,14 +100,17 @@

# COMMAND ----------

# TODO: fill in school's name; may not be same as in the schemas above
fpath_course = os.path.join(path_volume, "SCHOOL_COURSE_AR_DEID_DTTM.csv")
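# prefer the raw course file path from the project config, if it's already been added there;
# otherwise fall back to a hard-coded path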
try:
    raw_course_file_path = cfg.datasets.labeled.raw_course.file_path
except AttributeError:
    # TODO: fill in the actual path to school's raw course file
    raw_course_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COURSE_AR_DEID_DTTM.csv"

# COMMAND ----------

# read without any schema validation, so we can look at the data "raw"
df_course_raw = pdp.dataio.read_raw_pdp_course_data_from_file(
    fpath_course, schema=None, dttm_format="%Y%m%d.0"
    raw_course_file_path, schema=None, dttm_format="%Y%m%d.0"
)
print(f"rows x cols = {df_course_raw.shape}")
df_course_raw.head()
@@ -127,6 +121,10 @@

# COMMAND ----------

df_course_raw["course_begin_date"].describe()

# COMMAND ----------

# MAGIC %md
# MAGIC Quick checks:
# MAGIC - [ ] data exists where it should
@@ -137,7 +135,7 @@

# try to read data while validating with the "base" PDP schema
df_course = pdp.dataio.read_raw_pdp_course_data_from_file(
    fpath_course, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0"
    raw_course_file_path, schema=pdp.schemas.RawPDPCourseDataSchema, dttm_format="%Y%m%d.0"
)
df_course

@@ -199,7 +197,7 @@
# MAGIC ```
# MAGIC
# MAGIC At this point, `df_course` should be a properly validated and parsed data frame, ready for exploratory data analysis.

# MAGIC

# COMMAND ----------

@@ -208,22 +206,26 @@

# COMMAND ----------


# TODO: fill in school's name; may not be same as in the schemas above
fpath_cohort = os.path.join(path_volume, "SCHOOL_COHORT_AR_DEID_DTTM.csv")
try:
    raw_cohort_file_path = cfg.datasets.labeled.raw_cohort.file_path
except AttributeError:
    # TODO: fill in the actual path to school's raw cohort file
    raw_cohort_file_path = "/Volumes/CATALOG/INST_NAME_bronze/INST_NAME_bronze_file_volume/SCHOOL_COHORT_AR_DEID_DTTM.csv"

# COMMAND ----------

# read without any schema validation, so we can look at the data "raw"
df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file(fpath_cohort, schema=None)
df_cohort_raw = pdp.dataio.read_raw_pdp_cohort_data_from_file(
    raw_cohort_file_path, schema=None
)
print(f"rows x cols = {df_cohort_raw.shape}")
df_cohort_raw.head()

# COMMAND ----------

# try to read data while validating with the "base" PDP schema
df_cohort = pdp.dataio.read_raw_pdp_cohort_data_from_file(
    fpath_cohort, schema=pdp.schemas.base.RawPDPCohortDataSchema
    raw_cohort_file_path, schema=pdp.schemas.RawPDPCohortDataSchema
)
df_cohort

@@ -242,22 +244,31 @@
# COMMAND ----------

# MAGIC %md
# MAGIC ## save validated data
# MAGIC ## HEY, STOP HERE!

# COMMAND ----------

# MAGIC %md
# MAGIC Before continuing on to EDA, now's a great time to do a couple things:
# MAGIC
# MAGIC - Copy any school-specific raw dataset schemas into a `schemas.py` file in the current working directory
# MAGIC - Copy into a `dataio.py` file in the current working directory any school-specific preprocessing functions needed to coerce the raw data into a standardized form (a rough sketch follows below)
# MAGIC - **Optional:** If you want easy access to outputs from every (sub-)step of the data transformation pipeline, save the validated datasets into this school's "silver" schema in Unity Catalog.
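
# COMMAND ----------

# purely illustrative: the kind of school-specific helper that the `dataio.py` mentioned
# above might contain; the function name, raw column name, and rename mapping are all
# hypothetical and should be replaced with whatever this school's data actually needs
# import pandas as pd
#
# def standardize_raw_course_data(df: pd.DataFrame) -> pd.DataFrame:
#     """Coerce the school's raw course file into the standardized PDP form."""
#     return df.rename(columns={"Student ID": "student_guid"})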

# COMMAND ----------

pdp.dataio.write_data_to_delta_table(
    df_course,
    f"{catalog}.{write_schema}.course_dataset_validated",
    spark_session=spark_session,
    "CATALOG.INST_NAME_silver.course_dataset_validated",
    spark_session=spark,
)

# COMMAND ----------

pdp.dataio.write_data_to_delta_table(
    df_cohort,
    f"{catalog}.{write_schema}.cohort_dataset_validated",
    spark_session=spark_session,
    "CATALOG.INST_NAME_silver.cohort_dataset_validated",
    spark_session=spark,
)

# COMMAND ----------
@@ -269,17 +280,17 @@

# MAGIC %md
# MAGIC ## read validated data
# MAGIC ## read validated data?
# MAGIC
# MAGIC (so you don't have to execute the validation process more than once)
# MAGIC (optional, so you don't have to execute the validation process more than once)

# COMMAND ----------

# use base or school-specific schema, as needed
df_course = pdp.schemas.RawPDPCourseDataSchema(
    pdp.dataio.read_data_from_delta_table(
        f"{catalog}.{write_schema}.course_dataset_validated",
        spark_session=spark_session,
        "CATALOG.INST_NAME_silver.course_dataset_validated",
        spark_session=spark,
    )
)
df_course.shape
@@ -288,8 +299,8 @@

df_cohort = pdp.schemas.RawPDPCohortDataSchema(
    pdp.dataio.read_data_from_delta_table(
        f"{catalog}.{write_schema}.cohort_dataset_validated",
        spark_session=spark_session,
        "CATALOG.INST_NAME_silver.cohort_dataset_validated",
        spark_session=spark,
    )
)
df_cohort.shape
@@ -307,8 +318,11 @@
# COMMAND ----------

# specific follow-ups, for example
# df_course["academic_year"].value_counts(normalize=True, dropna=False)
# df_course["academic_term"].value_counts(normalize=True, dropna=False)
# df_course["grade"].value_counts(normalize=True, dropna=False)
# df_course["delivery_method"].value_counts(normalize=True, dropna=False)
# df_course["course_name"].value_counts(normalize=True, dropna=False).head(10)

# COMMAND ----------

@@ -317,8 +331,8 @@
# COMMAND ----------

# specific follow-ups, for example
# df_course["cohort"].value_counts(normalize=True, dropna=False)
# df_course["enrollment_type"].value_counts(normalize=True, dropna=False)
# df_cohort["cohort"].value_counts(normalize=True, dropna=False)
# df_cohort["enrollment_type"].value_counts(normalize=True, dropna=False)

# COMMAND ----------

@@ -509,6 +523,10 @@

# COMMAND ----------

df_pre_cohort["enrollment_type"].value_counts()

# COMMAND ----------

# MAGIC %md
# MAGIC ### filter invalid rows(?)

@@ -574,6 +592,7 @@

ax = sb.histplot(
    df_course.sort_values(by="academic_year"),
    # df_course_valid.sort_values(by="academic_year"),
    y="academic_year",
    hue="academic_term",
    multiple="stack",
@@ -645,6 +664,7 @@
ax = sb.histplot(
    pd.merge(
        df_course.groupby("student_guid")
        # df_course_valid.groupby("student_guid")
        .size()
        .rename("num_courses_enrolled")
        .reset_index(drop=False),
@@ -667,6 +687,9 @@
    df_course.groupby("student_guid").agg(
        {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"}
    ),
    # df_course_valid.groupby("student_guid").agg(
    #     {"number_of_credits_attempted": "sum", "number_of_credits_earned": "sum"}
    # ),
    x="number_of_credits_attempted",
    y="number_of_credits_earned",
    kind="hex",
@@ -769,7 +792,10 @@
# COMMAND ----------

# MAGIC %md
# MAGIC - [ ] Add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention)
# MAGIC - ...
# MAGIC - [ ] If you haven't already, add school-specific data schemas and/or preprocessing functions into the appropriate directory in the [`student-success-intervention` repository](https://github.com/datakind/student-success-intervention)
# MAGIC - [ ] Add file paths for the raw course/cohort datasets to the project config file's `datasets.labeled.raw_course` and `datasets.labeled.raw_cohort` blocks (see the illustrative check below)
# MAGIC - [ ] Submit a PR including this notebook and any school-specific files needed to run it
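
# COMMAND ----------

# illustrative check: once the `datasets.labeled.raw_course` / `datasets.labeled.raw_cohort`
# blocks have been added to the config, the loaded file paths should point at the school's
# raw files, i.e. the volume paths used in the TODOs above
# print(cfg.datasets.labeled.raw_course.file_path)
# print(cfg.datasets.labeled.raw_cohort.file_path)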

# COMMAND ----------


49 changes: 27 additions & 22 deletions notebooks/pdp/02-prepare-modeling-dataset-TEMPLATE.py
@@ -22,8 +22,11 @@

# COMMAND ----------

# install dependencies, most of which should come through our 1st-party SST package
# %pip install git+https://github.com/datakind/student-success-tool.git@develop
# install dependencies, most/all of which should come through our 1st-party SST package
# NOTE: it's okay to use 'develop' or a feature branch while developing this nb
# but when it's finished, it's best to pin to a specific version of the package
# %pip install "student-success-tool == 0.1.0"
# %pip install "git+https://github.com/datakind/student-success-tool.git@develop"

# COMMAND ----------

@@ -56,7 +59,7 @@
# COMMAND ----------

# MAGIC %md
# MAGIC ## `student-success-intervention` hacks
# MAGIC ## import school-specific code

# COMMAND ----------

@@ -65,7 +68,7 @@

# COMMAND ----------

# HACK: insert our 1st-party (school-specific) code into PATH
# insert our 1st-party (school-specific) code into PATH
if "../" not in sys.path:
sys.path.insert(1, "../")

@@ -74,24 +77,12 @@

# COMMAND ----------

# MAGIC %md
# MAGIC ## project config

# COMMAND ----------

# TODO: create a config file in TOML format to school directory
config = configs.load_config("./config.toml", schema=configs.PDPProjectConfig)
config

# COMMAND ----------

catalog = "sst_dev"

# configure where data is to be read from / written to
inst_name = "SCHOOL" # TODO: fill in school's name in Unity Catalog
schema = f"{inst_name}_silver"
catalog_schema = f"{catalog}.{schema}"
print(f"{catalog_schema=}")
# project configuration should be stored in a config file in TOML format
# it'll start out with just basic info: institution_id, institution_name
# but as each step of the pipeline gets built, more parameters will be moved
# from hard-coded notebook variables to shareable, persistent config fields
cfg = configs.load_config("./config-v2-TEMPLATE.toml", configs.PDPProjectConfigV2)
cfg

# COMMAND ----------

@@ -130,6 +121,20 @@

# COMMAND ----------
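
# pull feature parameters from the project config if they've already been added there;
# otherwise fall back to the package-level defaults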

try:
    feature_params = cfg.preprocessing.features.model_dump()
except AttributeError:
    feature_params = {
        "min_passing_grade": pdp.constants.DEFAULT_MIN_PASSING_GRADE,
        "min_num_credits_full_time": pdp.constants.DEFAULT_MIN_NUM_CREDITS_FULL_TIME,
        "course_level_pattern": pdp.constants.DEFAULT_COURSE_LEVEL_PATTERN,
        "peak_covid_terms": pdp.constants.DEFAULT_PEAK_COVID_TERMS,
        "key_course_subject_areas": None,
        "key_course_ids": None,
    }

# COMMAND ----------

dict(cfg.prepare_modeling_dataset)

# COMMAND ----------