Commit
include household_id cols; update sample data (#180)
stevebachmeier authored May 15, 2023
1 parent 00e85f4 commit 4f94992
Showing 12 changed files with 90 additions and 39 deletions.
8 binary files changed (not shown).
9 changes: 6 additions & 3 deletions src/pseudopeople/schema_entities.py
@@ -382,6 +382,7 @@ class __Datasets(NamedTuple):
DatasetNames.CENSUS,
columns=( # This defines the output column order
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
@@ -408,8 +409,8 @@ class __Datasets(NamedTuple):
acs: Dataset = Dataset(
DatasetNames.ACS,
columns=( # This defines the output column order
- COLUMNS.household_id,
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.survey_date,
COLUMNS.first_name,
COLUMNS.middle_initial,
@@ -435,8 +436,8 @@ class __Datasets(NamedTuple):
cps: Dataset = Dataset(
DatasetNames.CPS,
columns=( # This defines the output column order
- COLUMNS.household_id,
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.survey_date,
COLUMNS.first_name,
COLUMNS.middle_initial,
@@ -462,8 +463,8 @@ class __Datasets(NamedTuple):
wic: Dataset = Dataset(
DatasetNames.WIC,
columns=( # This defines the output column order
- COLUMNS.household_id,
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
@@ -489,6 +490,7 @@ class __Datasets(NamedTuple):
DatasetNames.SSA,
columns=( # This defines the output column order
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
@@ -508,6 +510,7 @@ class __Datasets(NamedTuple):
DatasetNames.TAXES_W2_1099,
columns=( # This defines the output column order
COLUMNS.simulant_id,
+ COLUMNS.household_id,
COLUMNS.first_name,
COLUMNS.middle_initial,
COLUMNS.last_name,
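In practice, the schema change above means every generated dataset now carries household_id as an output column next to simulant_id. A minimal sketch of how that surfaces to a user, assuming the generator can run against the bundled sample data with no arguments (as the test fixtures below do) and that output column names match the COLUMNS attribute names used above:

from pseudopeople.interface import generate_decennial_census

census = generate_decennial_census()  # assumption: defaults to the bundled sample data
# Per the new column order above, household_id should follow simulant_id.
assert "household_id" in census.columns
print(list(census.columns)[:3])  # expected: ['simulant_id', 'household_id', 'first_name']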
34 changes: 34 additions & 0 deletions tests/integration/conftest.py
@@ -4,6 +4,7 @@
from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.constants import paths
from pseudopeople.interface import (
_reformat_dates_for_noising,
generate_american_community_survey,
generate_current_population_survey,
generate_decennial_census,
@@ -18,6 +19,21 @@
SEED = 0
STATE = "RI"

# TODO: Replace this with the record ID column when implemented (MIC-4039)
IDX_COLS = {
DATASETS.census.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
DATASETS.acs.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
DATASETS.cps.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
DATASETS.wic.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
DATASETS.ssa.name: [COLUMNS.simulant_id.name, COLUMNS.ssa_event_type.name],
DATASETS.tax_w2_1099.name: [
COLUMNS.simulant_id.name,
COLUMNS.tax_year.name,
COLUMNS.employer_id.name,
],
# DATASETS.tax_1040.name: "todo",
}


@pytest.fixture(scope="module")
def user_config():
@@ -176,3 +192,21 @@ def sample_data_taxes_w2_and_1099_state_edit():
def _load_sample_data(dataset):
data_path = paths.SAMPLE_DATA_ROOT / dataset / f"{dataset}.parquet"
return pd.read_parquet(data_path)


def _get_common_datasets(dataset_name, data, noised_data):
"""Use unique columns to determine shared non-NA rows between noised and
unnoised data. Note that we cannot use the original index because that
gets reset after noising, i.e. the unique columns must NOT be noised.
"""
idx_cols = IDX_COLS.get(dataset_name)
dataset = DATASETS.get_dataset(dataset_name)
check_original = _reformat_dates_for_noising(data, dataset).set_index(idx_cols)
check_noised = noised_data.set_index(idx_cols)
# Ensure the idx_cols are unique
assert check_original.index.duplicated().sum() == 0
assert check_noised.index.duplicated().sum() == 0
shared_idx = pd.Index(set(check_original.index).intersection(set(check_noised.index)))
check_original = check_original.loc[shared_idx]
check_noised = check_noised.loc[shared_idx]
return check_noised, check_original, shared_idx
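A hedged usage sketch of the new helper, with the supporting names (SEED, _load_sample_data, DATASETS, the generator function, and its seed keyword) assumed from elsewhere in this conftest rather than shown in the hunks above:

# Sketch only: aligns unnoised sample data with a noised generation of the same dataset.
unnoised = _load_sample_data(DATASETS.census.name)
noised = generate_decennial_census(seed=SEED)  # seed keyword assumed from the interface
check_noised, check_original, shared_idx = _get_common_datasets(
    DATASETS.census.name, unnoised, noised
)
# Both frames now share the same (simulant_id, year) index, so cell-level
# comparisons like the noise-level checks in test_interface.py line up row by row.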
51 changes: 15 additions & 36 deletions tests/integration/test_interface.py
@@ -4,7 +4,6 @@

from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.interface import (
- _reformat_dates_for_noising,
generate_american_community_survey,
generate_current_population_survey,
generate_decennial_census,
@@ -14,22 +13,13 @@
)
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import COLUMNS, DATASETS
- from tests.integration.conftest import CELL_PROBABILITY, SEED, STATE

- # TODO: Replace this with the record ID column when implemented (MIC-4039)
- IDX_COLS = {
-     DATASETS.census.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
-     DATASETS.acs.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
-     DATASETS.cps.name: [COLUMNS.simulant_id.name, COLUMNS.survey_date.name],
-     DATASETS.wic.name: [COLUMNS.simulant_id.name, COLUMNS.year.name],
-     DATASETS.ssa.name: [COLUMNS.simulant_id.name, COLUMNS.ssa_event_type.name],
-     DATASETS.tax_w2_1099.name: [
-         COLUMNS.simulant_id.name,
-         COLUMNS.tax_year.name,
-         COLUMNS.employer_id.name,
-     ],
-     # DATASETS.tax_1040.name: "todo",
- }
+ from tests.integration.conftest import (
+     CELL_PROBABILITY,
+     IDX_COLS,
+     SEED,
+     STATE,
+     _get_common_datasets,
+ )

DATASET_GENERATION_FUNCS = {
DATASETS.census.name: generate_decennial_census,
@@ -107,7 +97,14 @@ def test_generate_dataset_from_sample_and_source(
check_original.loc[compare_dataset_idx, col].values
!= check_noised_dataset.loc[compare_dataset_idx, col].values
).mean()
- assert np.isclose(noise_level_dataset, noise_level_sample, rtol=0.08)
+ # we special-case a few sparse columns that have larger differences
+ if dataset_name == DATASETS.cps.name and col == COLUMNS.unit_number.name:
+     rtol = 0.21
+ elif dataset_name == DATASETS.acs.name and col == COLUMNS.middle_initial.name:
+     rtol = 0.12
+ else:
+     rtol = 0.04
+ assert np.isclose(noise_level_dataset, noise_level_sample, rtol=rtol)


@pytest.mark.parametrize(
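For context on the tolerances in the hunk above: np.isclose(a, b, rtol=r) passes when |a - b| <= atol + r * |b| (numpy's default atol is 1e-8), so rtol=0.04 allows roughly a 4% relative difference between the two noise levels. A quick illustration:

import numpy as np

np.isclose(0.25, 0.26, rtol=0.04)   # True: 0.01 <= 0.04 * 0.26
np.isclose(0.25, 0.262, rtol=0.04)  # False: 0.012 > 0.04 * 0.262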
@@ -546,24 +543,6 @@ def _mock_noise_dataset(
return dataset_data


- def _get_common_datasets(dataset_name, data, noised_data):
-     """Use unique columns to determine shared non-NA rows between noised and
-     unnoised data. Note that we cannot use the original index because that
-     gets reset after noising, i.e. the unique columns must NOT be noised.
-     """
-     idx_cols = IDX_COLS.get(dataset_name)
-     dataset = DATASETS.get_dataset(dataset_name)
-     check_original = _reformat_dates_for_noising(data, dataset).set_index(idx_cols)
-     check_noised = noised_data.set_index(idx_cols)
-     # Ensure the idx_cols are unique
-     assert check_original.index.duplicated().sum() == 0
-     assert check_noised.index.duplicated().sum() == 0
-     shared_idx = pd.Index(set(check_original.index).intersection(set(check_noised.index)))
-     check_original = check_original.loc[shared_idx]
-     check_noised = check_noised.loc[shared_idx]
-     return check_noised, check_original, shared_idx


def _generate_non_sample_data_root(data_dir_name, tmpdir, data):
"""Helper function to break the single sample dataset into two and save
out to tmpdir to be used as a non-default 'source' argument
35 changes: 35 additions & 0 deletions tests/integration/test_schema.py
@@ -0,0 +1,35 @@
import pandas as pd
import pytest

from pseudopeople.schema_entities import COLUMNS, DATASETS
from tests.integration.conftest import _get_common_datasets


@pytest.mark.parametrize(
"dataset_name",
[
DATASETS.census.name,
DATASETS.acs.name,
DATASETS.cps.name,
DATASETS.ssa.name,
DATASETS.tax_w2_1099.name,
DATASETS.wic.name,
"TODO: tax_1040",
],
)
def test_unnoised_id_cols(dataset_name: str, request):
"""Tests that all datasets retain unnoised simulant_id and household_id"""
if "TODO" in dataset_name:
pytest.skip(reason=dataset_name)
unnoised_id_cols = [COLUMNS.simulant_id.name, COLUMNS.household_id.name]
data = request.getfixturevalue(f"sample_data_{dataset_name}")
noised_data = request.getfixturevalue(f"noised_sample_data_{dataset_name}")
check_noised, check_original, _ = _get_common_datasets(dataset_name, data, noised_data)
assert (
(
check_original.reset_index()[unnoised_id_cols]
== check_noised.reset_index()[unnoised_id_cols]
)
.all()
.all()
)
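The chained .all().all() above is needed because comparing two DataFrames element-wise yields a DataFrame of booleans: the first .all() reduces each column to a single True/False, and the second reduces those to one scalar. A toy pandas illustration (column values are made up for the example):

import pandas as pd

left = pd.DataFrame({"simulant_id": [1, 2], "household_id": [10, 20]})
right = pd.DataFrame({"simulant_id": [1, 2], "household_id": [10, 20]})

cellwise = left == right      # boolean DataFrame, same shape as the inputs
per_column = cellwise.all()   # Series: True where every row in a column matches
assert per_column.all()       # single bool: True only if every cell matches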
