diff --git a/MANIFEST.in b/MANIFEST.in index a02323e8..f15850a8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,5 +8,5 @@ include README.rst recursive-include docs * prune docs/_build -recursive-include src/pseudopeople *.py *.yaml *.csv +recursive-include src/pseudopeople *.py *.yaml *.csv *.hdf recursive-include tests *.py *txt *.yaml diff --git a/src/pseudopeople/constants/paths.py b/src/pseudopeople/constants/paths.py index 81321e45..92cc7618 100644 --- a/src/pseudopeople/constants/paths.py +++ b/src/pseudopeople/constants/paths.py @@ -7,3 +7,11 @@ INCORRECT_SELECT_NOISE_OPTIONS_DATA = DATA_ROOT / "incorrect_select_options.csv" QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml" + +SAMPLE_DATA_ROOT = DATA_ROOT / "sample_forms" +SAMPLE_DECENNIAL_CENSUS = SAMPLE_DATA_ROOT / "decennial_census_observer.hdf" +SAMPLE_TAXES_W2_AND_1099 = SAMPLE_DATA_ROOT / "tax_w2_observer.hdf" +SAMPLE_AMERICAN_COMMUNITIES_SURVEY = SAMPLE_DATA_ROOT / "household_survey_observer_acs.hdf" +SAMPLE_CURRENT_POPULATION_SURVEY = SAMPLE_DATA_ROOT / "household_survey_observer_cps.hdf" +SAMPLE_SOCIAL_SECURITY = SAMPLE_DATA_ROOT / "social_security_observer.hdf" +SAMPLE_WOMEN_INFANTS_AND_CHILDREN = SAMPLE_DATA_ROOT / "wic_observer.hdf" diff --git a/src/pseudopeople/data/sample_forms/decennial_census_observer.hdf b/src/pseudopeople/data/sample_forms/decennial_census_observer.hdf new file mode 100644 index 00000000..b3fe3675 Binary files /dev/null and b/src/pseudopeople/data/sample_forms/decennial_census_observer.hdf differ diff --git a/src/pseudopeople/data/sample_forms/household_survey_observer_acs.hdf b/src/pseudopeople/data/sample_forms/household_survey_observer_acs.hdf new file mode 100644 index 00000000..2e1ad5df Binary files /dev/null and b/src/pseudopeople/data/sample_forms/household_survey_observer_acs.hdf differ diff --git a/src/pseudopeople/data/sample_forms/household_survey_observer_cps.hdf b/src/pseudopeople/data/sample_forms/household_survey_observer_cps.hdf new file mode 100644 index 00000000..1616b719 Binary files /dev/null and b/src/pseudopeople/data/sample_forms/household_survey_observer_cps.hdf differ diff --git a/src/pseudopeople/data/sample_forms/social_security_observer.hdf b/src/pseudopeople/data/sample_forms/social_security_observer.hdf new file mode 100644 index 00000000..71d90d11 Binary files /dev/null and b/src/pseudopeople/data/sample_forms/social_security_observer.hdf differ diff --git a/src/pseudopeople/data/sample_forms/tax_1040_observer.hdf b/src/pseudopeople/data/sample_forms/tax_1040_observer.hdf new file mode 100644 index 00000000..ca3f2c6b Binary files /dev/null and b/src/pseudopeople/data/sample_forms/tax_1040_observer.hdf differ diff --git a/src/pseudopeople/data/sample_forms/tax_dependents_observer.hdf b/src/pseudopeople/data/sample_forms/tax_dependents_observer.hdf new file mode 100644 index 00000000..663a646d Binary files /dev/null and b/src/pseudopeople/data/sample_forms/tax_dependents_observer.hdf differ diff --git a/src/pseudopeople/data/sample_forms/tax_w2_observer.hdf b/src/pseudopeople/data/sample_forms/tax_w2_observer.hdf new file mode 100644 index 00000000..c68671cc Binary files /dev/null and b/src/pseudopeople/data/sample_forms/tax_w2_observer.hdf differ diff --git a/src/pseudopeople/data/sample_forms/wic_observer.hdf b/src/pseudopeople/data/sample_forms/wic_observer.hdf new file mode 100644 index 00000000..3b426c5c Binary files /dev/null and b/src/pseudopeople/data/sample_forms/wic_observer.hdf differ diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 7a543478..f014a76a 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -14,7 +14,7 @@ def _generate_form( source: Union[Path, str, pd.DataFrame], seed: int, configuration: Union[Path, str, dict], -): +) -> pd.DataFrame: """ Helper for generating noised forms from clean data. @@ -27,7 +27,7 @@ def _generate_form( :param configuration: Object to configure noise levels :return: - Noised form data + Noised form data in a pd.DataFrame """ configuration_tree = get_configuration(configuration) if isinstance(source, pd.DataFrame): @@ -47,7 +47,7 @@ def generate_decennial_census( source: Union[Path, str, pd.DataFrame], seed: int = 0, configuration: Union[Path, str, dict] = None, -): +) -> pd.DataFrame: """ Generates noised decennial census data from un-noised data. @@ -63,7 +63,7 @@ def generate_american_communities_survey( source: Union[Path, str, pd.DataFrame], seed: int = 0, configuration: Union[Path, str, dict] = None, -): +) -> pd.DataFrame: """ Generates noised American Communities Survey (ACS) data from un-noised data. @@ -79,7 +79,7 @@ def generate_current_population_survey( source: Union[Path, str, pd.DataFrame], seed: int = 0, configuration: Union[Path, str, dict] = None, -): +) -> pd.DataFrame: """ Generates noised Current Population Survey (CPS) data from un-noised data. @@ -95,7 +95,7 @@ def generate_taxes_w2_and_1099( source: Union[Path, str, pd.DataFrame], seed: int = 0, configuration: Union[Path, str, dict] = None, -): +) -> pd.DataFrame: """ Generates noised W2 and 1099 data from un-noised data. @@ -111,7 +111,7 @@ def generate_women_infants_and_children( source: Union[Path, str, pd.DataFrame], seed: int = 0, configuration: Union[Path, str, dict] = None, -): +) -> pd.DataFrame: """ Generates noised Women Infants and Children (WIC) data from un-noised data. @@ -127,7 +127,7 @@ def generate_social_security( source: Union[Path, str, pd.DataFrame], seed: int = 0, configuration: Union[Path, str, dict] = None, -): +) -> pd.DataFrame: """ Generates noised Social Security (SSA) data from un-noised data. diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index 7ff6e2b5..40932018 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -131,6 +131,7 @@ def miswrite_zipcodes( :return: pd.Series of noised zipcodes """ + column = column.astype(str) str_len = column.str.len() if (str_len != 5).sum() > 0: raise ValueError( @@ -209,7 +210,8 @@ def miswrite_numerics( returns: pd.Series with some numeric values experiencing noise. """ - + if column.empty: + return column # This is a fix to not replacing the original token for noise options token_noise_level = configuration.token_noise_level / 0.9 rng = np.random.default_rng(randomness_stream.seed) diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index e2fda8ca..0eb5ec90 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -1,60 +1,46 @@ from pathlib import Path -from typing import Union +from typing import Callable, Union import pandas as pd import pytest -from pseudopeople.interface import generate_decennial_census - - -# TODO: possibly parametrize Forms? -def test_generate_decennial_census( - decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str] -): - data = pd.read_hdf(decennial_census_data_path) - - # TODO: Refactor this check into a separate test - noised_data = generate_decennial_census( - source=decennial_census_data_path, seed=0, configuration=user_config_path - ) - noised_data_same_seed = generate_decennial_census( - source=decennial_census_data_path, seed=0, configuration=user_config_path - ) - noised_data_different_seed = generate_decennial_census( - source=decennial_census_data_path, seed=1, configuration=user_config_path - ) +from pseudopeople.constants.paths import ( + SAMPLE_AMERICAN_COMMUNITIES_SURVEY, + SAMPLE_CURRENT_POPULATION_SURVEY, + SAMPLE_DECENNIAL_CENSUS, + SAMPLE_SOCIAL_SECURITY, + SAMPLE_TAXES_W2_AND_1099, + SAMPLE_WOMEN_INFANTS_AND_CHILDREN, +) +from pseudopeople.interface import ( + generate_american_communities_survey, + generate_current_population_survey, + generate_decennial_census, + generate_social_security, + generate_taxes_w2_and_1099, + generate_women_infants_and_children, +) + + +@pytest.mark.parametrize( + "data_path, noising_function", + [ + (SAMPLE_DECENNIAL_CENSUS, generate_decennial_census), + (SAMPLE_AMERICAN_COMMUNITIES_SURVEY, generate_american_communities_survey), + (SAMPLE_CURRENT_POPULATION_SURVEY, generate_current_population_survey), + (SAMPLE_SOCIAL_SECURITY, generate_social_security), + (SAMPLE_TAXES_W2_AND_1099, generate_taxes_w2_and_1099), + (SAMPLE_WOMEN_INFANTS_AND_CHILDREN, generate_women_infants_and_children), + ], +) +def test_generate_form(data_path: Union[Path, str], noising_function: Callable): + data = pd.DataFrame(pd.read_hdf(data_path)) + + noised_data = noising_function(source=data.copy(), seed=0) + noised_data_same_seed = noising_function(source=data.copy(), seed=0) + noised_data_different_seed = noising_function(source=data.copy(), seed=1) + assert not data.equals(noised_data) assert noised_data.equals(noised_data_same_seed) assert not noised_data.equals(noised_data_different_seed) - assert not data.equals(noised_data) assert set(noised_data.columns) == set(data.columns) - - -@pytest.mark.skip(reason="TODO") -def test_generate_acs(): - pass - - -@pytest.mark.skip(reason="TODO") -def test_generate_cps(): - pass - - -@pytest.mark.skip(reason="TODO") -def test_generate_wic(): - pass - - -@pytest.mark.skip(reason="TODO") -def test_generate_ssa(): - pass - - -@pytest.mark.skip(reason="TODO") -def test_generate_tax_w2_1099(): - pass - - -@pytest.mark.skip(reason="TODO") -def test_generate_tax_1040(): - pass diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index f6cbe048..9ed43e2e 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -8,7 +8,14 @@ from vivarium.config_tree import ConfigTree from pseudopeople.entity_types import ColumnNoiseType -from pseudopeople.interface import generate_decennial_census +from pseudopeople.interface import ( + generate_american_communities_survey, + generate_current_population_survey, + generate_decennial_census, + generate_social_security, + generate_taxes_w2_and_1099, + generate_women_infants_and_children, +) from pseudopeople.noise import noise_form from pseudopeople.noise_entities import NOISE_TYPES from pseudopeople.schema_entities import Form @@ -156,11 +163,11 @@ def test_columns_noised(dummy_data): "func, form", [ (generate_decennial_census, Form.CENSUS), - ("todo", Form.ACS), - ("todo", Form.CPS), - ("todo", Form.WIC), - ("todo", Form.SSA), - ("todo", Form.TAX_W2_1099), + (generate_american_communities_survey, Form.ACS), + (generate_current_population_survey, Form.CPS), + (generate_women_infants_and_children, Form.WIC), + (generate_social_security, Form.SSA), + (generate_taxes_w2_and_1099, Form.TAX_W2_1099), ("todo", Form.TAX_1040), ], )