Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add sample data and include in tests #37

Merged
merged 4 commits into from
Apr 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion MANIFEST.in
Original file line number Diff line number Diff line change
Expand Up @@ -8,5 +8,5 @@ include README.rst
recursive-include docs *
prune docs/_build

recursive-include src/pseudopeople *.py *.yaml *.csv
recursive-include src/pseudopeople *.py *.yaml *.csv *.hdf
recursive-include tests *.py *txt *.yaml
8 changes: 8 additions & 0 deletions src/pseudopeople/constants/paths.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,3 +7,11 @@

INCORRECT_SELECT_NOISE_OPTIONS_DATA = DATA_ROOT / "incorrect_select_options.csv"
QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml"

SAMPLE_DATA_ROOT = DATA_ROOT / "sample_forms"
SAMPLE_DECENNIAL_CENSUS = SAMPLE_DATA_ROOT / "decennial_census_observer.hdf"
SAMPLE_TAXES_W2_AND_1099 = SAMPLE_DATA_ROOT / "tax_w2_observer.hdf"
SAMPLE_AMERICAN_COMMUNITIES_SURVEY = SAMPLE_DATA_ROOT / "household_survey_observer_acs.hdf"
SAMPLE_CURRENT_POPULATION_SURVEY = SAMPLE_DATA_ROOT / "household_survey_observer_cps.hdf"
SAMPLE_SOCIAL_SECURITY = SAMPLE_DATA_ROOT / "social_security_observer.hdf"
SAMPLE_WOMEN_INFANTS_AND_CHILDREN = SAMPLE_DATA_ROOT / "wic_observer.hdf"
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
16 changes: 8 additions & 8 deletions src/pseudopeople/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ def _generate_form(
source: Union[Path, str, pd.DataFrame],
seed: int,
configuration: Union[Path, str, dict],
):
) -> pd.DataFrame:
"""
Helper for generating noised forms from clean data.

Expand All @@ -27,7 +27,7 @@ def _generate_form(
:param configuration:
Object to configure noise levels
:return:
Noised form data
Noised form data in a pd.DataFrame
"""
configuration_tree = get_configuration(configuration)
if isinstance(source, pd.DataFrame):
Expand All @@ -47,7 +47,7 @@ def generate_decennial_census(
source: Union[Path, str, pd.DataFrame],
seed: int = 0,
configuration: Union[Path, str, dict] = None,
):
) -> pd.DataFrame:
"""
Generates noised decennial census data from un-noised data.

Expand All @@ -63,7 +63,7 @@ def generate_american_communities_survey(
source: Union[Path, str, pd.DataFrame],
seed: int = 0,
configuration: Union[Path, str, dict] = None,
):
) -> pd.DataFrame:
"""
Generates noised American Communities Survey (ACS) data from un-noised data.

Expand All @@ -79,7 +79,7 @@ def generate_current_population_survey(
source: Union[Path, str, pd.DataFrame],
seed: int = 0,
configuration: Union[Path, str, dict] = None,
):
) -> pd.DataFrame:
"""
Generates noised Current Population Survey (CPS) data from un-noised data.

Expand All @@ -95,7 +95,7 @@ def generate_taxes_w2_and_1099(
source: Union[Path, str, pd.DataFrame],
seed: int = 0,
configuration: Union[Path, str, dict] = None,
):
) -> pd.DataFrame:
"""
Generates noised W2 and 1099 data from un-noised data.

Expand All @@ -111,7 +111,7 @@ def generate_women_infants_and_children(
source: Union[Path, str, pd.DataFrame],
seed: int = 0,
configuration: Union[Path, str, dict] = None,
):
) -> pd.DataFrame:
"""
Generates noised Women Infants and Children (WIC) data from un-noised data.

Expand All @@ -127,7 +127,7 @@ def generate_social_security(
source: Union[Path, str, pd.DataFrame],
seed: int = 0,
configuration: Union[Path, str, dict] = None,
):
) -> pd.DataFrame:
"""
Generates noised Social Security (SSA) data from un-noised data.

Expand Down
4 changes: 3 additions & 1 deletion src/pseudopeople/noise_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,6 +131,7 @@ def miswrite_zipcodes(
:return: pd.Series of noised zipcodes
"""

column = column.astype(str)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What are these coming in as now, if not strings? I'm concerned that if they're coming in as ints again, we will be back to losing leading 0s (though I guess the check below would catch that).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The sample dataset has all the zipcodes as 90210, which was being interpreted as an int.

str_len = column.str.len()
if (str_len != 5).sum() > 0:
raise ValueError(
Expand Down Expand Up @@ -209,7 +210,8 @@ def miswrite_numerics(

returns: pd.Series with some numeric values experiencing noise.
"""

if column.empty:
return column
# This is a fix to not replacing the original token for noise options
token_noise_level = configuration.token_noise_level / 0.9
rng = np.random.default_rng(randomness_stream.seed)
Expand Down
88 changes: 37 additions & 51 deletions tests/integration/test_interface.py
Original file line number Diff line number Diff line change
@@ -1,60 +1,46 @@
from pathlib import Path
from typing import Union
from typing import Callable, Union

import pandas as pd
import pytest

from pseudopeople.interface import generate_decennial_census


# TODO: possibly parametrize Forms?
def test_generate_decennial_census(
decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str]
):
data = pd.read_hdf(decennial_census_data_path)

# TODO: Refactor this check into a separate test
noised_data = generate_decennial_census(
source=decennial_census_data_path, seed=0, configuration=user_config_path
)
noised_data_same_seed = generate_decennial_census(
source=decennial_census_data_path, seed=0, configuration=user_config_path
)
noised_data_different_seed = generate_decennial_census(
source=decennial_census_data_path, seed=1, configuration=user_config_path
)
from pseudopeople.constants.paths import (
SAMPLE_AMERICAN_COMMUNITIES_SURVEY,
SAMPLE_CURRENT_POPULATION_SURVEY,
SAMPLE_DECENNIAL_CENSUS,
SAMPLE_SOCIAL_SECURITY,
SAMPLE_TAXES_W2_AND_1099,
SAMPLE_WOMEN_INFANTS_AND_CHILDREN,
)
from pseudopeople.interface import (
generate_american_communities_survey,
generate_current_population_survey,
generate_decennial_census,
generate_social_security,
generate_taxes_w2_and_1099,
generate_women_infants_and_children,
)


@pytest.mark.parametrize(
"data_path, noising_function",
[
(SAMPLE_DECENNIAL_CENSUS, generate_decennial_census),
(SAMPLE_AMERICAN_COMMUNITIES_SURVEY, generate_american_communities_survey),
(SAMPLE_CURRENT_POPULATION_SURVEY, generate_current_population_survey),
(SAMPLE_SOCIAL_SECURITY, generate_social_security),
(SAMPLE_TAXES_W2_AND_1099, generate_taxes_w2_and_1099),
(SAMPLE_WOMEN_INFANTS_AND_CHILDREN, generate_women_infants_and_children),
],
)
def test_generate_form(data_path: Union[Path, str], noising_function: Callable):
data = pd.DataFrame(pd.read_hdf(data_path))

noised_data = noising_function(source=data.copy(), seed=0)
noised_data_same_seed = noising_function(source=data.copy(), seed=0)
noised_data_different_seed = noising_function(source=data.copy(), seed=1)

assert not data.equals(noised_data)
assert noised_data.equals(noised_data_same_seed)
assert not noised_data.equals(noised_data_different_seed)
assert not data.equals(noised_data)
assert set(noised_data.columns) == set(data.columns)


@pytest.mark.skip(reason="TODO")
def test_generate_acs():
pass


@pytest.mark.skip(reason="TODO")
def test_generate_cps():
pass


@pytest.mark.skip(reason="TODO")
def test_generate_wic():
pass


@pytest.mark.skip(reason="TODO")
def test_generate_ssa():
pass


@pytest.mark.skip(reason="TODO")
def test_generate_tax_w2_1099():
pass


@pytest.mark.skip(reason="TODO")
def test_generate_tax_1040():
pass
19 changes: 13 additions & 6 deletions tests/unit/test_noise_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,14 @@
from vivarium.config_tree import ConfigTree

from pseudopeople.entity_types import ColumnNoiseType
from pseudopeople.interface import generate_decennial_census
from pseudopeople.interface import (
generate_american_communities_survey,
generate_current_population_survey,
generate_decennial_census,
generate_social_security,
generate_taxes_w2_and_1099,
generate_women_infants_and_children,
)
from pseudopeople.noise import noise_form
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import Form
Expand Down Expand Up @@ -156,11 +163,11 @@ def test_columns_noised(dummy_data):
"func, form",
[
(generate_decennial_census, Form.CENSUS),
("todo", Form.ACS),
("todo", Form.CPS),
("todo", Form.WIC),
("todo", Form.SSA),
("todo", Form.TAX_W2_1099),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You knocked out so many TODOs in one fell swoop!

(generate_american_communities_survey, Form.ACS),
(generate_current_population_survey, Form.CPS),
(generate_women_infants_and_children, Form.WIC),
(generate_social_security, Form.SSA),
(generate_taxes_w2_and_1099, Form.TAX_W2_1099),
("todo", Form.TAX_1040),
],
)
Expand Down