Commit
include household_id cols; update sample data (#180)
stevebachmeier authored May 15, 2023
1 parent 00e85f4 commit 4f94992
Showing 12 changed files with 90 additions and 39 deletions.
8 binary files changed (not shown).
9 changes: 6 additions & 3 deletions src/pseudopeople/schema_entities.py
@@ -382,6 +382,7 @@ class __Datasets(NamedTuple):
DatasetNames.CENSUS,
columns=( # This defines the output column order
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
@@ -408,8 +409,8 @@ class __Datasets(NamedTuple):
acs: Dataset = Dataset(
DatasetNames.ACS,
columns=( # This defines the output column order
- COLUMNS.household_id,
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.survey_date,
COLUMNS.first_name,
COLUMNS.middle_initial,
@@ -435,8 +436,8 @@ class __Datasets(NamedTuple):
cps: Dataset = Dataset(
DatasetNames.CPS,
columns=( # This defines the output column order
- COLUMNS.household_id,
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.survey_date,
COLUMNS.first_name,
COLUMNS.middle_initial,
@@ -462,8 +463,8 @@ class __Datasets(NamedTuple):
wic: Dataset = Dataset(
DatasetNames.WIC,
columns=( # This defines the output column order
- COLUMNS.household_id,
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
@@ -489,6 +490,7 @@ class __Datasets(NamedTuple):
DatasetNames.SSA,
columns=( # This defines the output column order
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
@@ -508,6 +510,7 @@ class __Datasets(NamedTuple):
DatasetNames.TAXES_W2_1099,
columns=( # This defines the output column order
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
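In practice, the schema change above means every generated dataset now carries household_id as an output column next to simulant_id. A minimal sketch of how that surfaces to a user, assuming the generator can run against the bundled sample data with no arguments (as the test fixtures below do) and that output column names match the COLUMNS attribute names used above:

from pseudopeople.interface import generate_decennial_census

census = generate_decennial_census()  # assumption: defaults to the bundled sample data
# Per the new column order above, household_id should follow simulant_id.
assert "household_id" in census.columns
print(list(census.columns)[:3])  # expected: ['simulant_id', 'household_id', 'first_name']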
34 changes: 34 additions & 0 deletions tests/integration/conftest.py
@@ -4,6 +4,7 @@
from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.constants import paths
from pseudopeople.interface import (
_reformat_dates_for_noising,
generate_american_community_survey,
generate_current_population_survey,
generate_decennial_census,
@@ -18,6 +19,21 @@
SEED = 0
STATE = "RI"

# TODO: Replace this with the record ID column when implemented (MIC-4039)
IDX_COLS = {
DATASETS.census.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
DATASETS.acs.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
DATASETS.cps.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
DATASETS.wic.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
DATASETS.ssa.name: [COLUMNS.simulant_id.name, COLUMNS.ssa_event_type.name],
DATASETS.tax_w2_1099.name: [
COLUMNS.simulant_id.name,
COLUMNS.tax_year.name,
COLUMNS.employer_id.name,
],
# DATASETS.tax_1040.name: "todo",
}


@pytest.fixture(scope="module")
def user_config():
@@ -176,3 +192,21 @@ def sample_data_taxes_w2_and_1099_state_edit():
def _load_sample_data(dataset):
data_path = paths.SAMPLE_DATA_ROOT / dataset / f"{dataset}.parquet"
return pd.read_parquet(data_path)


def _get_common_datasets(dataset_name, data, noised_data):
"""Use unique columns to determine shared non-NA rows between noised and
unnoised data. Note that we cannot use the original index because that
gets reset after noising, i.e. the unique columns must NOT be noised.
"""
idx_cols = IDX_COLS.get(dataset_name)
dataset = DATASETS.get_dataset(dataset_name)
check_original = _reformat_dates_for_noising(data, dataset).set_index(idx_cols)
check_noised = noised_data.set_index(idx_cols)
# Ensure the idx_cols are unique
assert check_original.index.duplicated().sum() == 0
assert check_noised.index.duplicated().sum() == 0
shared_idx = pd.Index(set(check_original.index).intersection(set(check_noised.index)))
check_original = check_original.loc[shared_idx]
check_noised = check_noised.loc[shared_idx]
return check_noised, check_original, shared_idx
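A hedged usage sketch of the new helper, with the supporting names (SEED, _load_sample_data, DATASETS, the generator function, and its seed keyword) assumed from elsewhere in this conftest rather than shown in the hunks above:

# Sketch only: aligns unnoised sample data with a noised generation of the same dataset.
unnoised = _load_sample_data(DATASETS.census.name)
noised = generate_decennial_census(seed=SEED)  # seed keyword assumed from the interface
check_noised, check_original, shared_idx = _get_common_datasets(
    DATASETS.census.name, unnoised, noised
)
# Both frames now share the same (simulant_id, year) index, so cell-level
# comparisons like the noise-level checks in test_interface.py line up row by row.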
51 changes: 15 additions & 36 deletions tests/integration/test_interface.py
@@ -4,7 +4,6 @@

from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.interface import (
- _reformat_dates_for_noising,
generate_american_community_survey,
generate_current_population_survey,
generate_decennial_census,
@@ -14,22 +13,13 @@
)
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASETS
- from tests.integration.conftest import CELL_PROBABILITY, SEED, STATE

- # TODO: Replace this with the record ID column when implemented (MIC-4039)
- IDX_COLS = {
-     DATASETS.census.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
-     DATASETS.acs.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
-     DATASETS.cps.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
-     DATASETS.wic.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
-     DATASETS.ssa.name: [COLUMNS.simulant_id.name, COLUMNS.ssa_event_type.name],
-     DATASETS.tax_w2_1099.name: [
-         COLUMNS.simulant_id.name,
-         COLUMNS.tax_year.name,
-         COLUMNS.employer_id.name,
-     ],
-     # DATASETS.tax_1040.name: "todo",
- }
+ from tests.integration.conftest import (
+     CELL_PROBABILITY,
+     IDX_COLS,
+     SEED,
+     STATE,
+     _get_common_datasets,
+ )

DATASET_GENERATION_FUNCS = {
DATASETS.census.name: generate_decennial_census,
@@ -107,7 +97,14 @@ def test_generate_dataset_from_sample_and_source(
check_original.loc[compare_dataset_idx, col].values
!= check_noised_dataset.loc[compare_dataset_idx, col].values
).mean()
- assert np.isclose(noise_level_dataset, noise_level_sample, rtol=0.08)
+ # we special-case a few sparse columns that have larger differences
+ if dataset_name == DATASETS.cps.name and col == COLUMNS.unit_number.name:
+     rtol = 0.21
+ elif dataset_name == DATASETS.acs.name and col == COLUMNS.middle_initial.name:
+     rtol = 0.12
+ else:
+     rtol = 0.04
+ assert np.isclose(noise_level_dataset, noise_level_sample, rtol=rtol)


@pytest.mark.parametrize(
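For context on the tolerances in the hunk above: np.isclose(a, b, rtol=r) passes when |a - b| <= atol + r * |b| (numpy's default atol is 1e-8), so rtol=0.04 allows roughly a 4% relative difference between the two noise levels. A quick illustration:

import numpy as np

np.isclose(0.25, 0.26, rtol=0.04)   # True: 0.01 <= 0.04 * 0.26
np.isclose(0.25, 0.262, rtol=0.04)  # False: 0.012 > 0.04 * 0.262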
@@ -546,24 +543,6 @@ def _mock_noise_dataset(
return dataset_data


- def _get_common_datasets(dataset_name, data, noised_data):
-     """Use unique columns to determine shared non-NA rows between noised and
-     unnoised data. Note that we cannot use the original index because that
-     gets reset after noising, i.e. the unique columns must NOT be noised.
-     """
-     idx_cols = IDX_COLS.get(dataset_name)
-     dataset = DATASETS.get_dataset(dataset_name)
-     check_original = _reformat_dates_for_noising(data, dataset).set_index(idx_cols)
-     check_noised = noised_data.set_index(idx_cols)
-     # Ensure the idx_cols are unique
-     assert check_original.index.duplicated().sum() == 0
-     assert check_noised.index.duplicated().sum() == 0
-     shared_idx = pd.Index(set(check_original.index).intersection(set(check_noised.index)))
-     check_original = check_original.loc[shared_idx]
-     check_noised = check_noised.loc[shared_idx]
-     return check_noised, check_original, shared_idx


def _generate_non_sample_data_root(data_dir_name, tmpdir, data):
"""Helper function to break the single sample dataset into two and save
out to tmpdir to be used as a non-default 'source' argument
35 changes: 35 additions & 0 deletions tests/integration/test_schema.py
@@ -0,0 +1,35 @@
import pandas as pd
import pytest

from pseudopeople.schema_entities import COLUMNS, DATASETS
from tests.integration.conftest import _get_common_datasets


@pytest.mark.parametrize(
"dataset_name",
[
DATASETS.census.name,
DATASETS.acs.name,
DATASETS.cps.name,
DATASETS.ssa.name,
DATASETS.tax_w2_1099.name,
DATASETS.wic.name,
"TODO: tax_1040",
],
)
def test_unnoised_id_cols(dataset_name: str, request):
"""Tests that all datasets retain unnoised simulant_id and household_id"""
if "TODO" in dataset_name:
pytest.skip(reason=dataset_name)
unnoised_id_cols = [COLUMNS.simulant_id.name, COLUMNS.household_id.name]
data = request.getfixturevalue(f"sample_data_{dataset_name}")
noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")
check_noised, check_original, _ = _get_common_datasets(dataset_name, data, noised_data)
assert (
(
check_original.reset_index()[unnoised_id_cols]
== check_noised.reset_index()[unnoised_id_cols]
)
.all()
.all()
)
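The chained .all().all() above is needed because comparing two DataFrames element-wise yields a DataFrame of booleans: the first .all() reduces each column to a single True/False, and the second reduces those to one scalar. A toy pandas illustration (column values are made up for the example):

import pandas as pd

left = pd.DataFrame({"simulant_id": [1, 2], "household_id": [10, 20]})
right = pd.DataFrame({"simulant_id": [1, 2], "household_id": [10, 20]})

cellwise = left == right      # boolean DataFrame, same shape as the inputs
per_column = cellwise.all()   # Series: True where every row in a column matches
assert per_column.all()       # single bool: True only if every cell matches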
