From 1d096042571520a041e1000560c902463b8e4c9c Mon Sep 17 00:00:00 2001 From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com> Date: Fri, 24 Mar 2023 12:39:55 -0600 Subject: [PATCH 1/7] add census integration test (#17) --- tests/integration/conftest.py | 185 ++++++++++++++++++++++++++++ tests/integration/test_interface.py | 79 ++++++++++-- 2 files changed, 255 insertions(+), 9 deletions(-) create mode 100644 tests/integration/conftest.py diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py new file mode 100644 index 00000000..140c6d12 --- /dev/null +++ b/tests/integration/conftest.py @@ -0,0 +1,185 @@ +import random +import time +from string import ascii_lowercase, ascii_uppercase + +import pandas as pd +import pytest +import yaml + +from pseudopeople.utilities import get_configuration + +HOUSING_TYPES = [ + "Carceral", + "College", + "Military", + "Nursing home", + "Other institutional", + "Other non-institutional", + "Standard", +] + +RACE_ETHNICITIES = [ + "AIAN", + "Asian", + "Black", + "Latino", + "Multiracial or Other", + "NHOPI", + "White", +] + +RELATIONS_TO_HOUSEHOLD_HEAD = [ + "Adopted child", + "Biological child", + "Child-in-law", + "Foster child", + "Grandchild", + "Institutionalized GQ pop", + "Noninstitutionalized GQ pop", + "Opp-sex partner", + "Opp-sex spouse", + "Other nonrelative", + "Other relative", + "Parent", + "Parent-in-law", + "Reference person", + "Roommate", + "Same-sex partner", + "Same-sex spouse", + "Sibling", + "Stepchild", +] + +DOB_START_DATE = time.mktime(time.strptime("1920-1-1", "%Y-%m-%d")) +DOB_END_DATE = time.mktime(time.strptime("2030-5-1", "%Y-%m-%d")) + +STATES = [ + "AL", + "AK", + "AZ", + "AR", + "CA", + "CO", + "CT", + "DC", + "DE", + "FL", + "GA", + "HI", + "ID", + "IL", + "IN", + "IA", + "KS", + "KY", + "LA", + "ME", + "MD", + "MA", + "MI", + "MN", + "MS", + "MO", + "MT", + "NE", + "NV", + "NH", + "NJ", + "NM", + "NY", + "NC", + "ND", + "OH", + "OK", + "OR", + "PA", + "RI", + 
"SC", + "SD", + "TN", + "TX", + "UT", + "VT", + "VA", + "WA", + "WV", + "WI", + "WY", +] + + +@pytest.fixture(scope="session") +def dummy_census_data(tmp_path_factory): + """Generate a dummy decennial census dataframe, save to a tmpdir, and return that path.""" + random.seed(0) + num_rows = 100_000 + data = pd.DataFrame( + { + "housing_type": [random.choice(HOUSING_TYPES) for _ in range(num_rows)], + "age": [str(random.random() * 100) for _ in range(num_rows)], + "year": [random.choice(["2020", "2030"]) for _ in range(num_rows)], + "race_ethnicity": [random.choice(RACE_ETHNICITIES) for _ in range(num_rows)], + "guardian_1": [ + f"100_{random.randint(1,int(num_rows/3))}" for _ in range(num_rows) + ], + "first_name": [ + "First" + "".join(random.choice(ascii_lowercase) for _ in range(3)) + for _ in range(num_rows) + ], + "street_name": [ + "Street" + "".join(random.choice(ascii_lowercase) for _ in range(3)) + for _ in range(num_rows) + ], + "relation_to_household_head": [ + random.choice(RELATIONS_TO_HOUSEHOLD_HEAD) for _ in range(num_rows) + ], + "zipcode": [str(float(random.randint(1, 99999))) for _ in range(num_rows)], + "date_of_birth": [ + time.strftime( + "%Y-%m-%d", + time.localtime( + DOB_START_DATE + random.random() * (DOB_END_DATE - DOB_START_DATE) + ), + ) + for _ in range(num_rows) + ], + "simulant_id": ["100_" + str(i) for i in range(num_rows)], + "middle_initial": [random.choice(ascii_uppercase) for _ in range(num_rows)], + "city": [ + "City" + "".join(random.choice(ascii_lowercase) for _ in range(3)) + for _ in range(num_rows) + ], + "street_number": [str(random.randint(1, 15000)) for _ in range(num_rows)], + "last_name": [ + "Last" + "".join(random.choice(ascii_lowercase) for _ in range(3)) + for _ in range(num_rows) + ], + "state": [random.choice(STATES) for _ in range(num_rows)], + "sex": [random.choice(["Female", "Male"]) for _ in range(num_rows)], + "unit_number": [ + "Unit " + "".join(random.choice(ascii_lowercase) for _ in range(3)) + for _ in 
range(num_rows) + ], + "guardian_2": [ + f"100_{random.randint(1,int(num_rows)/4)}" for _ in range(num_rows) + ], + } + ) + + data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv" + data.to_csv(data_path, index=False) + + return data_path + + +@pytest.fixture(scope="module") +def dummy_config(tmp_path_factory): + """This simply copies the default config file to a temp directory + to be used as a user-provided config file in integration tests + """ + config = get_configuration().to_dict() # gets default config + config_path = tmp_path_factory.getbasetemp() / "dummy_config.yaml" + with open(config_path, "w") as file: + yaml.dump(config, file) + + return config_path diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 90ccc2c1..9a32ac24 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -1,38 +1,99 @@ +from pathlib import Path +from typing import Union + +import numpy as np +import pandas as pd import pytest from pseudopeople.interface import generate_decennial_census +from pseudopeople.utilities import get_configuration -@pytest.mark.skip(reason="TODO") -def test_noise_census(): - pass +# TODO: possibly parametrize Forms? 
+def test_generate_decennial_census( + dummy_census_data: Union[Path, str], dummy_config: Union[Path, str] +): + data = pd.read_csv(dummy_census_data) + noised_data = generate_decennial_census( + path=dummy_census_data, seed=0, configuration=dummy_config + ) + noised_data_same_seed = generate_decennial_census( + path=dummy_census_data, seed=0, configuration=dummy_config + ) + noised_data_different_seed = generate_decennial_census( + path=dummy_census_data, seed=1, configuration=dummy_config + ) + + assert noised_data.equals(noised_data_same_seed) + assert not noised_data.equals(noised_data_different_seed) + assert not data.equals(noised_data) + # TODO: Confirm correct columns exist once the interface functions + # modify them + # TODO: if we sort out dtype schemas + # for col in noised_data.columns: + # assert data[col].dtype == noised_data[col].dtype + # TODO: Iterate through cols and check that the percentage of errors makes sense + # eg, if 25% typographic error and 1% OCR + # 1. Use a default config file + # 2. + + config = get_configuration(dummy_config)["decennial_census"] + + # Confirm omission and duplication seems reasonable + # TODO: when omission function gets implemented. + orig_idx = data.index + noised_idx = noised_data.index + # assert np.isclose(len(set(orig_idx) - set(noised_idx)) / len(data), config.omission) + # TODO: when duplication function gets implemented + # assert np.isclose(noised_data.duplicated().sum() / len(data), config.duplication) + + # Check that column-level noise seem reasonable + # NOTE: this is not perfect because (1) it is only looking at row-level + # noise and not token-based noise and (2) it is not accounting for the + # fact that noising can occur on duplicated rows which have been removed + # for comparison purposes. 
+ common_idx = set(orig_idx).intersection(set(noised_idx)) + common_data = data.loc[common_idx] + common_noised_data = noised_data.loc[common_idx].drop_duplicates() + assert common_data.shape == common_noised_data.shape + for col in noised_data: + if col in config: + actual_noise_rate = (common_data[col] != common_noised_data[col]).mean() + noise_types = [k for k in config[col]] + noise_rates = [ + config[col][noise_type]["row_noise_level"] for noise_type in noise_types + ] + expected_noise_rate = 1 - np.prod([1 - x for x in noise_rates]) + assert np.isclose(actual_noise_rate, expected_noise_rate, rtol=0.07) + else: + assert (common_data[col] == common_noised_data[col]).all() @pytest.mark.skip(reason="TODO") -def test_noise_acs(): +def test_generate_acs(): pass @pytest.mark.skip(reason="TODO") -def test_noise_cps(): +def test_generate_cps(): pass @pytest.mark.skip(reason="TODO") -def test_noise_wic(): +def test_generate_wic(): pass @pytest.mark.skip(reason="TODO") -def test_noise_ssa(): +def test_generate_ssa(): pass @pytest.mark.skip(reason="TODO") -def test_noise_tax_w2_1099(): +def test_generate_tax_w2_1099(): pass @pytest.mark.skip(reason="TODO") -def test_noise_tax_1040(): +def test_generate_tax_1040(): pass From 0ff8057800c04c08dba8c07a51b4fc5772587e67 Mon Sep 17 00:00:00 2001 From: Matthew Kappel Date: Wed, 29 Mar 2023 09:43:59 -0700 Subject: [PATCH 2/7] Add W2 generation interface (#20) - *Category*: feature - *JIRA issue*: [MIC-3869](https://jira.ihme.washington.edu/browse/MIC-3869) Changes - Adds `generate_w2` function to interface - Adds relevant configuration to the defaults yaml - Addition of an integration test is deferred pending merge of another PR Testing Running the `main` of `interface.py` and calling `generate_w2` resulted in noised data. 
--- src/pseudopeople/__init__.py | 2 +- src/pseudopeople/default_configuration.yaml | 70 ++++++++++++++++++++- src/pseudopeople/interface.py | 18 +++++- 3 files changed, 87 insertions(+), 3 deletions(-) diff --git a/src/pseudopeople/__init__.py b/src/pseudopeople/__init__.py index a08dae8b..ac749c28 100644 --- a/src/pseudopeople/__init__.py +++ b/src/pseudopeople/__init__.py @@ -8,4 +8,4 @@ __uri__, __version__, ) -from pseudopeople.interface import generate_decennial_census +from pseudopeople.interface import generate_decennial_census, generate_w2 diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index 62a2bf9d..a19cbfba 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -51,4 +51,72 @@ decennial_census: housing_type: missing_data: row_noise_level: 0.01 - +taxes_w2_and_1099: + omission: 0.0145 + duplication: 0.05 + age: + missing_data: + row_noise_level: 0.01 + date_of_birth: + missing_data: + row_noise_level: 0.01 + employer_city: + missing_data: + row_noise_level: 0.01 + employer_id: + missing_data: + row_noise_level: 0.01 + employer_name: + missing_data: + row_noise_level: 0.01 + employer_state: + missing_data: + row_noise_level: 0.01 + employer_street_name: + missing_data: + row_noise_level: 0.01 + employer_street_number: + missing_data: + row_noise_level: 0.01 + employer_unit_number: + missing_data: + row_noise_level: 0.01 + employer_zipcode: + missing_data: + row_noise_level: 0.01 + first_name: + missing_data: + row_noise_level: 0.01 + income: + missing_data: + row_noise_level: 0.01 + is_w2: + missing_data: + row_noise_level: 0.01 + last_name: + missing_data: + row_noise_level: 0.01 + mailing_address_city: + missing_data: + row_noise_level: 0.01 + mailing_address_state: + missing_data: + row_noise_level: 0.01 + mailing_address_street_name: + missing_data: + row_noise_level: 0.01 + mailing_address_street_number: + missing_data: + row_noise_level: 
0.01 + mailing_address_unit_number: + missing_data: + row_noise_level: 0.01 + mailing_address_zipcode: + missing_data: + row_noise_level: 0.01 + middle_initial: + missing_data: + row_noise_level: 0.01 + ssn: + missing_data: + row_noise_level: 0.01 \ No newline at end of file diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 74e3b86f..5ff041b3 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -29,13 +29,29 @@ def generate_decennial_census( return noise_form(Form.CENSUS, data, configuration_tree, seed) +def generate_w2( + path: Union[Path, str], seed: int = 0, configuration: Union[Path, str] = None +): + """ + Generates a noised W2 data from un-noised data. + + :param path: A path to the un-noised source W2 data + :param seed: An integer seed for randomness + :param configuration: (optional) A path to a configuration YAML file to modify default values + :return: A pd.DataFrame of noised W2 data + """ + configuration_tree = get_configuration(configuration) + data = pd.read_csv(path) + return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed) + + # Manual testing helper if __name__ == "__main__": args = sys.argv[1:] if len(args) == 1: my_path = Path(args[0]) src = pd.read_csv(my_path) - out = generate_decennial_census(my_path) + out = generate_w2(my_path) diff = src[ ~src.astype(str).apply(tuple, 1).isin(out.astype(str).apply(tuple, 1)) ] # get all changed rows From 6b217e57a502ae9aaf87a772d49ee3980d1bfcb9 Mon Sep 17 00:00:00 2001 From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com> Date: Wed, 29 Mar 2023 12:27:49 -0600 Subject: [PATCH 3/7] implement typographic noise function (#19) --- src/pseudopeople/constants/paths.py | 8 + src/pseudopeople/data/qwerty_errors.yaml | 382 ++++++++++++++++++++ src/pseudopeople/default_configuration.yaml | 40 ++ src/pseudopeople/entities.py | 2 +- src/pseudopeople/interface.py | 6 +- src/pseudopeople/noise_functions.py | 91 ++++- 
tests/conftest.py | 16 + tests/integration/conftest.py | 28 +- tests/integration/test_interface.py | 53 +-- tests/unit/test_column_noise.py | 174 +++++++-- 10 files changed, 680 insertions(+), 120 deletions(-) create mode 100644 src/pseudopeople/constants/paths.py create mode 100644 src/pseudopeople/data/qwerty_errors.yaml diff --git a/src/pseudopeople/constants/paths.py b/src/pseudopeople/constants/paths.py new file mode 100644 index 00000000..da3b9fb3 --- /dev/null +++ b/src/pseudopeople/constants/paths.py @@ -0,0 +1,8 @@ +from pathlib import Path + +import pseudopeople + +BASE_DIR = Path(pseudopeople.__file__).resolve().parent +DATA_ROOT = BASE_DIR / "data" + +QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml" diff --git a/src/pseudopeople/data/qwerty_errors.yaml b/src/pseudopeople/data/qwerty_errors.yaml new file mode 100644 index 00000000..cd069a9c --- /dev/null +++ b/src/pseudopeople/data/qwerty_errors.yaml @@ -0,0 +1,382 @@ +q: +- w +- a +- s +w: +- q +- e +- a +- s +- d +e: +- w +- r +- s +- d +- f +r: +- e +- t +- d +- f +- g +t: +- r +- y +- f +- g +- h +y: +- t +- u +- g +- h +- j +u: +- y +- i +- h +- j +- k +i: +- u +- o +- j +- k +- l +o: +- i +- p +- k +- l +p: +- o +- l +a: +- q +- w +- s +- z +- x +s: +- q +- w +- e +- a +- d +- z +- x +- c +d: +- w +- e +- r +- s +- f +- x +- c +- v +f: +- e +- r +- t +- d +- g +- c +- v +- b +g: +- r +- t +- y +- f +- h +- v +- b +- n +h: +- t +- y +- u +- g +- j +- b +- n +- m +j: +- y +- u +- i +- h +- k +- n +- m +k: +- u +- i +- o +- j +- l +- m +l: +- i +- o +- p +- k +z: +- a +- s +- x +x: +- a +- s +- d +- z +- c +c: +- s +- d +- f +- x +- v +v: +- d +- f +- g +- c +- b +b: +- f +- g +- h +- v +- n +n: +- g +- h +- j +- b +- m +m: +- h +- j +- k +- n +Q: +- W +- A +- S +W: +- Q +- E +- A +- S +- D +E: +- W +- R +- S +- D +- F +R: +- E +- T +- D +- F +- G +T: +- R +- Y +- F +- G +- H +Y: +- T +- U +- G +- H +- J +U: +- Y +- I +- H +- J +- K +I: +- U +- O +- J +- K +- L +O: +- I +- P +- K +- L +P: +- O +- L 
+A: +- Q +- W +- S +- Z +- X +S: +- Q +- W +- E +- A +- D +- Z +- X +- C +D: +- W +- E +- R +- S +- F +- X +- C +- V +F: +- E +- R +- T +- D +- G +- C +- V +- B +G: +- R +- T +- Y +- F +- H +- V +- B +- N +H: +- T +- Y +- U +- G +- J +- B +- N +- M +J: +- Y +- U +- I +- H +- K +- N +- M +K: +- U +- I +- O +- J +- L +- M +L: +- I +- O +- P +- K +Z: +- A +- S +- X +X: +- A +- S +- D +- Z +- C +C: +- S +- D +- F +- X +- V +V: +- D +- F +- G +- C +- B +B: +- F +- G +- H +- V +- N +N: +- G +- H +- J +- B +- M +M: +- H +- J +- K +- N +'7': +- '8' +- '4' +- '5' +'8': +- '7' +- '9' +- '4' +- '5' +- '6' +'9': +- '8' +- '5' +- '6' +'4': +- '7' +- '8' +- '5' +- '1' +- '2' +'5': +- '7' +- '8' +- '9' +- '4' +- '6' +- '1' +- '2' +- '3' +'6': +- '8' +- '9' +- '5' +- '2' +- '3' +'1': +- '4' +- '5' +- '2' +- '0' +'2': +- '4' +- '5' +- '6' +- '1' +- '3' +- '0' +'3': +- '5' +- '6' +- '2' +'0': +- '1' +- '2' diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index a19cbfba..f3e81092 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -9,36 +9,76 @@ decennial_census: first_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 middle_initial: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 last_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 age: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 date_of_birth: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 street_number: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 
0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 street_name: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 unit_number: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 city: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 state: missing_data: row_noise_level: 0.01 zipcode: missing_data: row_noise_level: 0.01 + typographic: + row_noise_level: 0.01 + token_noise_level: 0.1 + include_original_token_level: 0.1 relation_to_household_head: missing_data: row_noise_level: 0.01 diff --git a/src/pseudopeople/entities.py b/src/pseudopeople/entities.py index 3c095e3a..1a41e264 100644 --- a/src/pseudopeople/entities.py +++ b/src/pseudopeople/entities.py @@ -54,7 +54,7 @@ class __NoiseTypes(NamedTuple): "month_day_swap", noise_functions.swap_months_and_days ) ZIP_CODE_MISWRITING: ColumnNoiseType = ColumnNoiseType( - "zipcode_miswriting", noise_functions.miswrite_zip_codes + "zipcode_miswriting", noise_functions.miswrite_zipcodes ) AGE_MISWRITING: ColumnNoiseType = ColumnNoiseType( "age_miswriting", noise_functions.miswrite_ages diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 5ff041b3..5c53b1ed 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -25,7 +25,7 @@ def generate_decennial_census( :return: A pd.DataFrame of noised census data """ configuration_tree = get_configuration(configuration) - data = pd.read_csv(path) + data = pd.read_csv(path, dtype=str, keep_default_na=False) return noise_form(Form.CENSUS, data, configuration_tree, seed) @@ -41,7 +41,7 @@ def generate_w2( :return: A pd.DataFrame of noised W2 data """ configuration_tree = get_configuration(configuration) - data = pd.read_csv(path) + data = pd.read_csv(path, dtype=str, 
keep_default_na=False) return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed) @@ -50,7 +50,7 @@ def generate_w2( args = sys.argv[1:] if len(args) == 1: my_path = Path(args[0]) - src = pd.read_csv(my_path) + src = pd.read_csv(my_path, dtype=str, keep_default_na=False) out = generate_w2(my_path) diff = src[ ~src.astype(str).apply(tuple, 1).isin(out.astype(str).apply(tuple, 1)) diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index e82c8380..fb4f30da 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -1,9 +1,13 @@ from typing import Any +import numpy as np import pandas as pd +import yaml from vivarium import ConfigTree from vivarium.framework.randomness import RandomnessStream +from pseudopeople.constants import paths + def omit_rows( form_data: pd.DataFrame, @@ -91,7 +95,7 @@ def swap_months_and_days( return form_data -def miswrite_zip_codes( +def miswrite_zipcodes( form_data: pd.DataFrame, configuration: float, randomness_stream: RandomnessStream, @@ -217,12 +221,12 @@ def generate_missing_data( # Avoid SettingWithCopyWarning column = column.copy() - noise_level = configuration.row_noise_level - # Get rows to noise - to_noise_idx = randomness_stream.filter_for_probability( - column.index, - probability=noise_level, - additional_key=f"{additional_key}_missing_data_filter", + to_noise_idx = _get_to_noise_idx( + column, + configuration, + randomness_stream, + additional_key, + context_key="missing_data_filter", ) column.loc[to_noise_idx] = "" @@ -235,15 +239,62 @@ def generate_typographical_errors( randomness_stream: RandomnessStream, additional_key: Any, ) -> pd.Series: - """ + """Function that takes a column and applies noise to the string values + representative of keyboard mis-typing. 
- :param column: - :param configuration: - :param randomness_stream: + :param column: pd.Series of data + :param configuration: ConfigTree object containing noising parameters + :param randomness_stream: RandomnessStream to utilize Vivarium CRN :param additional_key: Key for RandomnessStream - :return: + :returns: pd.Series of column with noised data """ - # todo actually generate typographical errors + column = column.copy() + not_missing_idx = column.index[(column.notna()) & (column != "")] + + with open(paths.QWERTY_ERRORS) as f: + qwerty_errors = yaml.full_load(f) + + def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng): + """Abie's implementation of typographical noising""" + err = "" + i = 0 + while i < len(truth): + error_introduced = False + token = truth[i : (i + 1)] + if token in qwerty_errors and not error_introduced: + random_number = rng.uniform() + if random_number < corrupted_pr: + err += rng.choice(qwerty_errors[token]) + random_number = rng.uniform() + if random_number < addl_pr: + err += token + i += 1 + error_introduced = True + if not error_introduced: + err += truth[i : (i + 1)] + i += 1 + return err + + token_noise_level = configuration.token_noise_level + include_original_token_level = configuration.include_original_token_level + + to_noise_idx = _get_to_noise_idx( + column.loc[not_missing_idx], + configuration, + randomness_stream, + additional_key, + context_key="typographical_noise_filter", + ) + rng = np.random.default_rng(seed=randomness_stream.seed) + for idx in to_noise_idx: + noised_value = keyboard_corrupt( + column[idx], + token_noise_level, + include_original_token_level, + rng, + ) + column[idx] = noised_value + return column @@ -265,4 +316,16 @@ def generate_ocr_errors( return column -# todo add noise functions +#################### +# HELPER FUNCTIONS # +#################### +def _get_to_noise_idx(column, configuration, randomness_stream, additional_key, context_key): + noise_level = configuration.row_noise_level + # Get rows 
to noise + to_noise_idx = randomness_stream.filter_for_probability( + column.index, + probability=noise_level, + additional_key=f"{additional_key}_{context_key}", + ) + + return to_noise_idx diff --git a/tests/conftest.py b/tests/conftest.py index 3178a8fd..894977cb 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -1,4 +1,7 @@ import pytest +import yaml + +from pseudopeople.utilities import get_configuration def pytest_addoption(parser): @@ -17,3 +20,16 @@ def pytest_collection_modifyitems(config, items): for item in items: if "slow" in item.keywords: item.add_marker(skip_slow) + + +@pytest.fixture(scope="session") +def user_config_path(tmp_path_factory): + """This simply copies the default config file to a temp directory + to be used as a user-provided config file in integration tests + """ + config = get_configuration().to_dict() # gets default config + config_path = tmp_path_factory.getbasetemp() / "dummy_config.yaml" + with open(config_path, "w") as file: + yaml.dump(config, file) + + return config_path diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 140c6d12..060a5174 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -4,9 +4,6 @@ import pandas as pd import pytest -import yaml - -from pseudopeople.utilities import get_configuration HOUSING_TYPES = [ "Carceral", @@ -109,14 +106,18 @@ @pytest.fixture(scope="session") -def dummy_census_data(tmp_path_factory): +def decennial_census_data_path(tmp_path_factory): """Generate a dummy decennial census dataframe, save to a tmpdir, and return that path.""" random.seed(0) num_rows = 100_000 data = pd.DataFrame( { "housing_type": [random.choice(HOUSING_TYPES) for _ in range(num_rows)], - "age": [str(random.random() * 100) for _ in range(num_rows)], + # TODO: Currently ages are actually floats but a followup pr will ensure ints + "age": [ + str(random.randint(1, 100) + round(random.random(), 6)) + for _ in range(num_rows) + ], "year": 
[random.choice(["2020", "2030"]) for _ in range(num_rows)], "race_ethnicity": [random.choice(RACE_ETHNICITIES) for _ in range(num_rows)], "guardian_1": [ @@ -133,7 +134,9 @@ def dummy_census_data(tmp_path_factory): "relation_to_household_head": [ random.choice(RELATIONS_TO_HOUSEHOLD_HEAD) for _ in range(num_rows) ], - "zipcode": [str(float(random.randint(1, 99999))) for _ in range(num_rows)], + # TODO: currently zipcodes are floats (and thus not zero-padded); + # a followup PR will convert to 5-digit integer strings + "zipcode": [str(random.randint(1, 99999)) + ".0" for _ in range(num_rows)], "date_of_birth": [ time.strftime( "%Y-%m-%d", @@ -170,16 +173,3 @@ def dummy_census_data(tmp_path_factory): data.to_csv(data_path, index=False) return data_path - - -@pytest.fixture(scope="module") -def dummy_config(tmp_path_factory): - """This simply copies the default config file to a temp directory - to be used as a user-provided config file in integration tests - """ - config = get_configuration().to_dict() # gets default config - config_path = tmp_path_factory.getbasetemp() / "dummy_config.yaml" - with open(config_path, "w") as file: - yaml.dump(config, file) - - return config_path diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 9a32ac24..1501aa8c 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -11,62 +11,25 @@ # TODO: possibly parametrize Forms? 
def test_generate_decennial_census( - dummy_census_data: Union[Path, str], dummy_config: Union[Path, str] + decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str] ): - data = pd.read_csv(dummy_census_data) + data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False) + + # TODO: Refactor this check into a separate test noised_data = generate_decennial_census( - path=dummy_census_data, seed=0, configuration=dummy_config + path=decennial_census_data_path, seed=0, configuration=user_config_path ) noised_data_same_seed = generate_decennial_census( - path=dummy_census_data, seed=0, configuration=dummy_config + path=decennial_census_data_path, seed=0, configuration=user_config_path ) noised_data_different_seed = generate_decennial_census( - path=dummy_census_data, seed=1, configuration=dummy_config + path=decennial_census_data_path, seed=1, configuration=user_config_path ) assert noised_data.equals(noised_data_same_seed) assert not noised_data.equals(noised_data_different_seed) assert not data.equals(noised_data) - # TODO: Confirm correct columns exist once the interface functions - # modify them - # TODO: if we sort out dtype schemas - # for col in noised_data.columns: - # assert data[col].dtype == noised_data[col].dtype - # TODO: Iterate through cols and check that the percentage of errors makes sense - # eg, if 25% typographic error and 1% OCR - # 1. Use a default config file - # 2. - - config = get_configuration(dummy_config)["decennial_census"] - - # Confirm omission and duplication seems reasonable - # TODO: when omission function gets implemented. 
- orig_idx = data.index - noised_idx = noised_data.index - # assert np.isclose(len(set(orig_idx) - set(noised_idx)) / len(data), config.omission) - # TODO: when duplication function gets implemented - # assert np.isclose(noised_data.duplicated().sum() / len(data), config.duplication) - - # Check that column-level noise seem reasonable - # NOTE: this is not perfect because (1) it is only looking at row-level - # noise and not token-based noise and (2) it is not accounting for the - # fact that noising can occur on duplicated rows which have been removed - # for comparison purposes. - common_idx = set(orig_idx).intersection(set(noised_idx)) - common_data = data.loc[common_idx] - common_noised_data = noised_data.loc[common_idx].drop_duplicates() - assert common_data.shape == common_noised_data.shape - for col in noised_data: - if col in config: - actual_noise_rate = (common_data[col] != common_noised_data[col]).mean() - noise_types = [k for k in config[col]] - noise_rates = [ - config[col][noise_type]["row_noise_level"] for noise_type in noise_types - ] - expected_noise_rate = 1 - np.prod([1 - x for x in noise_rates]) - assert np.isclose(actual_noise_rate, expected_noise_rate, rtol=0.07) - else: - assert (common_data[col] == common_noised_data[col]).all() + assert set(noised_data.columns) == set(data.columns) @pytest.mark.skip(reason="TODO") diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 09d68f88..2ada0f32 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -1,3 +1,6 @@ +import random +from string import ascii_lowercase, ascii_uppercase + import numpy as np import pandas as pd import pytest @@ -5,9 +8,17 @@ from pseudopeople.noise_functions import ( generate_fake_names, + generate_incorrect_selections, generate_missing_data, generate_nicknames, + generate_ocr_errors, generate_phonetic_errors, + generate_typographical_errors, + generate_within_household_copies, + miswrite_ages, + miswrite_numerics, 
+ miswrite_zipcodes, + swap_months_and_days, ) from pseudopeople.utilities import get_configuration @@ -20,96 +31,183 @@ @pytest.fixture(scope="module") -def string_series(): - num_simulants = 1_000_000 - return pd.Series([str(x) for x in range(num_simulants)]) +def dummy_dataset(): + # Add a column of integer strings + num_simulants = 100_000 + dummy_idx = pd.Index(range(num_simulants)) + integer_series = pd.Series([str(x) for x in range(num_simulants)]) + # Add missing data from `generate_missing_data` function + missing_idx = pd.Index([x for x in dummy_idx if x % 3 == 0]) + integer_series.loc[missing_idx] = "" + + # Add a column of character strings + str_length = 6 + character_series = pd.Series( + [ + "".join( + random.choice(ascii_lowercase + ascii_uppercase) for _ in range(str_length) + ) + for _ in range(num_simulants) + ] + ) + # Add missing data from `generate_missing_data` function + character_series.loc[missing_idx] = "" + return pd.DataFrame({"numbers": integer_series, "characters": character_series}) -@pytest.fixture(scope="module") -def default_configuration(): - return get_configuration() - -def test_generate_missing_data(string_series, default_configuration): - # TODO: [MIC-3910] Use custom config (MIC-3866) - config = default_configuration["decennial_census"]["zipcode"]["missing_data"] - noised_data = generate_missing_data( - string_series, config, RANDOMNESS0, "test_missing_data" - ) - noised_data_same_seed = generate_missing_data( - string_series, config, RANDOMNESS0, "test_missing_data" - ) - noised_data_different_seed = generate_missing_data( - string_series, config, RANDOMNESS1, "test_missing_data" +def test_generate_missing_data(dummy_dataset, user_config_path): + config = get_configuration(user_config_path)["decennial_census"]["zipcode"][ + "missing_data" + ] + data = dummy_dataset["numbers"] + noised_data = _validate_seed_and_noise_data( + func=generate_missing_data, column=data, config=config ) - # Confirm same randomness stream provides 
same results - assert (noised_data == noised_data_same_seed).all() - - # Confirm different streams provide different results - assert (noised_data != noised_data_different_seed).any() + # Calculate newly missing data, ie data that didn't come in as already missing + orig_non_missing_idx = data.index[(data.notna()) & (data != "")] + newly_missing_idx = noised_data.index[ + (noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "") + ] # Check for expected noise level expected_noise = config["row_noise_level"] - actual_noise = (noised_data == "").mean() + actual_noise = len(newly_missing_idx) / len(orig_non_missing_idx) assert np.isclose(expected_noise, actual_noise, rtol=0.02) # Check that un-noised values are unchanged not_noised_idx = noised_data.index[noised_data != ""] assert "" not in noised_data[not_noised_idx].values - assert (string_series[not_noised_idx] == noised_data[not_noised_idx]).all() + assert (data[not_noised_idx] == noised_data[not_noised_idx]).all() @pytest.mark.skip(reason="TODO") -def test_incorrect_selection(): +def test_generate_incorrect_selections(): pass @pytest.mark.skip(reason="TODO") -def test_copy_from_within_household(): +def test_generate_within_household_copies(): pass @pytest.mark.skip(reason="TODO") -def test_swap_month_day(): +def test_swap_months_and_days(): pass @pytest.mark.skip(reason="TODO") -def test_miswrite_zipcode(): +def test_miswrite_zipcodes(): pass @pytest.mark.skip(reason="TODO") -def test_miswrite_age(): +def test_miswrite_ages(): pass @pytest.mark.skip(reason="TODO") -def test_miswrite_numeric(): +def test_miswrite_numerics(): pass @pytest.mark.skip(reason="TODO") -def test_nickname_noise(): +def test_generate_nicknames(): pass @pytest.mark.skip(reason="TODO") -def test_fake_name_noise(): +def test_generate_fake_names(): pass @pytest.mark.skip(reason="TODO") -def test_phonetic_noise(): +def test_generate_phonetic_errors(): pass @pytest.mark.skip(reason="TODO") -def test_ocr_noise(): +def 
test_generate_ocr_errors(): pass -@pytest.mark.skip(reason="TODO") -def test_typographic_noise(): - pass +@pytest.mark.parametrize( + "column", + [ + "numbers", + "characters", + ], +) +def test_generate_typographical_errors(dummy_dataset, column): + data = dummy_dataset[column] + config = get_configuration() + config.update( + { + "decennial_census": { + column: { + "typographic": { + "row_noise_level": 0.1, + "token_noise_level": 0.1, + "include_original_token_level": 0.1, + }, + }, + }, + } + ) + config = config["decennial_census"][column]["typographic"] + noised_data = _validate_seed_and_noise_data( + func=generate_typographical_errors, column=data, config=config + ) + + not_missing_idx = data.index[(data.notna()) & (data != "")] + check_original = data.loc[not_missing_idx] + check_noised = noised_data.loc[not_missing_idx] + + # Check for expected noise level + p_row_noise = config.row_noise_level + p_token_noise = config.token_noise_level + str_lengths = check_original.str.len() # pd.Series + p_token_not_noised = 1 - p_token_noise + p_strings_not_noised = p_token_not_noised**str_lengths # pd.Series + p_strings_noised = 1 - p_strings_not_noised # pd.Series + expected_noise = p_row_noise * p_strings_noised.mean() + actual_noise = (check_noised != check_original).mean() + assert np.isclose(expected_noise, actual_noise, rtol=0.06) + + # Check for expected string growth due to keeping original noised token + assert (check_noised.str.len() >= check_original.str.len()).all() + p_include_original_token = config.include_original_token_level + p_token_does_not_increase_string_length = 1 - p_token_noise * p_include_original_token + p_strings_do_not_increase_length = ( + p_token_does_not_increase_string_length**str_lengths + ) # pd.Series + p_strings_increase_length = 1 - p_strings_do_not_increase_length # pd.Series + expected_changed_length = p_row_noise * p_strings_increase_length.mean() + actual_changed_length = (check_noised.str.len() != 
check_original.str.len()).mean() + assert np.isclose(expected_changed_length, actual_changed_length, rtol=0.06) + + # Check that we did not touch the missing data + assert ( + data.loc[~data.index.isin(not_missing_idx)] + == noised_data.loc[~noised_data.index.isin(not_missing_idx)] + ).all() + + +#################### +# HELPER FUNCTIONS # +#################### + + +# TODO: refactor this into its own test parameterized by noise functions +def _validate_seed_and_noise_data(func, column, config): + """Confirms randomness stream behavior and returns the noised data""" + noised_data = func(column, config, RANDOMNESS0, f"test_{func.__name__}") + noised_data_same_seed = func(column, config, RANDOMNESS0, f"test_{func.__name__}") + noised_data_different_seed = func(column, config, RANDOMNESS1, f"test_{func.__name__}") + + assert (noised_data != column).any() + assert (noised_data == noised_data_same_seed).all() + assert (noised_data != noised_data_different_seed).any() + + return noised_data From 926c1ea9655d335018d0f3b2734a9046a31e897d Mon Sep 17 00:00:00 2001 From: Matthew Kappel Date: Wed, 29 Mar 2023 16:31:25 -0700 Subject: [PATCH 4/7] Change is_w2 to tax_form (#21) - *Category*: bugfix - *JIRA issue*: [MIC-3933](https://jira.ihme.washington.edu/browse/MIC-3933) Simply changes is_w2 column to the new tax_form column in the default configuration. No existing tests are impacted. Testing Ran noising against data generated with modified PRL outputs. W2 data were noised as expected. 
--- src/pseudopeople/default_configuration.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index f3e81092..8afd8018 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -130,7 +130,7 @@ taxes_w2_and_1099: income: missing_data: row_noise_level: 0.01 - is_w2: + tax_form: missing_data: row_noise_level: 0.01 last_name: From 190cae44c35644a7187945b089d435f3b2ffbc59 Mon Sep 17 00:00:00 2001 From: albrja <37345113+albrja@users.noreply.github.com> Date: Wed, 29 Mar 2023 18:06:38 -0700 Subject: [PATCH 5/7] Incorrect select noise function (#18) Implement incorrect select noise function Adds generate_incorrect_selection to noise functions. - *Category*: Feature - *JIRA issue*: [MIC-3873](https://jira.ihme.washington.edu/browse/MIC-3873) -Adds CSV containing possible values for incorrect selection by column -Adds paths module -Adds noise function and test for generate_incorrect_selection Testing -Test suites pass successfully and generated decennial census form. 
--- MANIFEST.in | 2 +- src/pseudopeople/constants/paths.py | 1 + .../data/incorrect_select_options.csv | 52 ++++++++ src/pseudopeople/default_configuration.yaml | 18 ++- src/pseudopeople/entity_types.py | 13 +- src/pseudopeople/interface.py | 2 +- src/pseudopeople/noise.py | 3 +- .../{entities.py => noise_entities.py} | 25 ---- src/pseudopeople/noise_functions.py | 72 +++++------ src/pseudopeople/schema_entities.py | 27 ++++ src/pseudopeople/utilities.py | 76 +++++++++++- tests/integration/test_interface.py | 2 - tests/unit/test_column_noise.py | 82 ++++++++----- tests/unit/test_noise_form.py | 116 +++++++++++++++++- 14 files changed, 380 insertions(+), 111 deletions(-) create mode 100644 src/pseudopeople/data/incorrect_select_options.csv rename src/pseudopeople/{entities.py => noise_entities.py} (81%) create mode 100644 src/pseudopeople/schema_entities.py diff --git a/MANIFEST.in b/MANIFEST.in index 6c5b4a02..a02323e8 100644 --- a/MANIFEST.in +++ b/MANIFEST.in @@ -8,5 +8,5 @@ include README.rst recursive-include docs * prune docs/_build -recursive-include src/pseudopeople *.py *.yaml +recursive-include src/pseudopeople *.py *.yaml *.csv recursive-include tests *.py *txt *.yaml diff --git a/src/pseudopeople/constants/paths.py b/src/pseudopeople/constants/paths.py index da3b9fb3..81321e45 100644 --- a/src/pseudopeople/constants/paths.py +++ b/src/pseudopeople/constants/paths.py @@ -5,4 +5,5 @@ BASE_DIR = Path(pseudopeople.__file__).resolve().parent DATA_ROOT = BASE_DIR / "data" +INCORRECT_SELECT_NOISE_OPTIONS_DATA = DATA_ROOT / "incorrect_select_options.csv" QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml" diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv new file mode 100644 index 00000000..bb60c488 --- /dev/null +++ b/src/pseudopeople/data/incorrect_select_options.csv @@ -0,0 +1,52 @@ +state,relation_to_household_head,sex,race_ethnicity,is_w2,event_type +AL,Reference 
person,Female,White,True,creation +AK,Opp-sex spouse,Male,Black,False,death +AZ,Opp-sex partner,,Asian,, +AR,Same-sex spouse,,AIAN,, +CA,Same-sex partne,,NHOPI,, +CO,Biological child,,Multiracial or Other,, +CT,Adopted child,,Latino,, +DE,Stepchild,,,, +FL,Sibling,,,, +GA,Parent,,,, +HI,Grandchild,,,, +ID,Parent-in-law,,,, +IL,Child-in-law,,,, +IN,Other relative,,,, +IA,Roommate,,,, +KS,Foster child,,,, +KY,Other nonrelative,,,, +LA,Institutionalized GQ po,,,, +ME,Noninstitutionalized GQ pop,,,, +MD,,,,, +MA,,,,, +MI,,,,, +MN,,,,, +MS,,,,, +MO,,,,, +MT,,,,, +NE,,,,, +NV,,,,, +NH,,,,, +NJ,,,,, +NM,,,,, +NY,,,,, +NC,,,,, +ND,,,,, +OH,,,,, +OK,,,,, +OR,,,,, +PA,,,,, +RI,,,,, +SC,,,,, +SD,,,,, +TN,,,,, +TX,,,,, +UT,,,,, +VT,,,,, +VA,,,,, +WA,,,,, +WV,,,,, +WI,,,,, +WY,,,,, +DC,,,,, \ No newline at end of file diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml index 8afd8018..771ef3d7 100644 --- a/src/pseudopeople/default_configuration.yaml +++ b/src/pseudopeople/default_configuration.yaml @@ -72,6 +72,8 @@ decennial_census: state: missing_data: row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 zipcode: missing_data: row_noise_level: 0.01 @@ -82,15 +84,19 @@ decennial_census: relation_to_household_head: missing_data: row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 sex: missing_data: row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 race_ethnicity: missing_data: row_noise_level: 0.01 - housing_type: - missing_data: + incorrect_selection: row_noise_level: 0.01 + taxes_w2_and_1099: omission: 0.0145 duplication: 0.05 @@ -112,6 +118,8 @@ taxes_w2_and_1099: employer_state: missing_data: row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 employer_street_name: missing_data: row_noise_level: 0.01 @@ -133,6 +141,8 @@ taxes_w2_and_1099: tax_form: missing_data: row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 last_name: missing_data: 
row_noise_level: 0.01 @@ -142,6 +152,8 @@ taxes_w2_and_1099: mailing_address_state: missing_data: row_noise_level: 0.01 + incorrect_selection: + row_noise_level: 0.01 mailing_address_street_name: missing_data: row_noise_level: 0.01 @@ -159,4 +171,4 @@ taxes_w2_and_1099: row_noise_level: 0.01 ssn: missing_data: - row_noise_level: 0.01 \ No newline at end of file + row_noise_level: 0.01 diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py index f5557f49..500cf27c 100644 --- a/src/pseudopeople/entity_types.py +++ b/src/pseudopeople/entity_types.py @@ -5,6 +5,8 @@ from vivarium import ConfigTree from vivarium.framework.randomness import RandomnessStream +from pseudopeople.utilities import get_index_to_noise + @dataclass class RowNoiseType: @@ -56,4 +58,13 @@ def __call__( randomness_stream: RandomnessStream, additional_key: Any, ) -> pd.Series: - return self.noise_function(column, configuration, randomness_stream, additional_key) + column = column.copy() + noise_level = configuration.row_noise_level + to_noise_idx = get_index_to_noise( + column, noise_level, randomness_stream, f"{self.name}_{additional_key}" + ) + column.loc[to_noise_idx] = self.noise_function( + column.loc[to_noise_idx], configuration, randomness_stream, additional_key + ) + + return column diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 5c53b1ed..d767bc26 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -4,8 +4,8 @@ import pandas as pd -from pseudopeople.entities import Form from pseudopeople.noise import noise_form +from pseudopeople.schema_entities import Form from pseudopeople.utilities import get_configuration diff --git a/src/pseudopeople/noise.py b/src/pseudopeople/noise.py index 03ffae68..e5e50bbf 100644 --- a/src/pseudopeople/noise.py +++ b/src/pseudopeople/noise.py @@ -14,8 +14,9 @@ import pandas as pd from vivarium import ConfigTree -from pseudopeople.entities import NOISE_TYPES, Form from 
pseudopeople.entity_types import ColumnNoiseType, RowNoiseType +from pseudopeople.noise_entities import NOISE_TYPES +from pseudopeople.schema_entities import Form from pseudopeople.utilities import get_randomness_stream diff --git a/src/pseudopeople/entities.py b/src/pseudopeople/noise_entities.py similarity index 81% rename from src/pseudopeople/entities.py rename to src/pseudopeople/noise_entities.py index 1a41e264..93a367c6 100644 --- a/src/pseudopeople/entities.py +++ b/src/pseudopeople/noise_entities.py @@ -5,31 +5,6 @@ from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType -# todo: is "form" the right word? Ask RT -class Form(Enum): - CENSUS = "decennial_census" - ACS = "american_communities_survey" - CPS = "current_population_survey" - WIC = "women_infants_and_children" - SSA = "social_security" - TAX_W2_1099 = "taxes_w2_and_1099" - TAX_1040 = "taxes_1040" - - -class __Columns(NamedTuple): - FIRST_NAME: str = "first_name" - MIDDLE_INITIAL: str = "middle_initial" - LAST_NAME: str = "last_name" - STREET_NAME: str = "street_name" - ZIP_CODE: str = "zipcode" - CITY: str = "city" - AGE: str = "age" - # todo finish filling in columns - - -COLUMNS = __Columns() - - class __NoiseTypes(NamedTuple): """Container for all noise types in the order in which they should be applied: omissions, duplications, missing data, incorrect selection, copy from w/in diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index fb4f30da..bfeb0acf 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -7,6 +7,7 @@ from vivarium.framework.randomness import RandomnessStream from pseudopeople.constants import paths +from pseudopeople.utilities import vectorized_choice def omit_rows( @@ -42,21 +43,36 @@ def duplicate_rows( def generate_incorrect_selections( - form_data: pd.DataFrame, - configuration: float, + column: pd.Series, + _: ConfigTree, randomness_stream: RandomnessStream, additional_key: Any, -) -> 
pd.DataFrame: +) -> pd.Series: """ + Function that takes a categorical series and applies noise so some values has been replace with other options from + a list. - :param form_data: - :param configuration: - :param randomness_stream: + :param column: A categorical pd.Series + :param _: ConfigTree with rate at which to blank the data in column. + :param randomness_stream: RandomnessStream to utilize Vivarium CRN. :param additional_key: Key for RandomnessStream - :return: + :returns: pd.Series where data has been noised with other values from a list of possibilities """ - # todo actually duplicate rows - return form_data + + col = column.name + selection_options = pd.read_csv(paths.INCORRECT_SELECT_NOISE_OPTIONS_DATA) + + # Get possible noise values + # todo: Update with exclusive resampling when vectorized_choice is improved + options = selection_options.loc[selection_options[col].notna(), col] + new_values = vectorized_choice( + options=options, + n_to_choose=len(column), + randomness_stream=randomness_stream, + additional_key=f"{additional_key}_{col}_incorrect_select_choice", + ).to_numpy() + + return pd.Series(new_values, index=column.index) def generate_within_household_copies( @@ -203,34 +219,15 @@ def generate_phonetic_errors( return column -def generate_missing_data( - column: pd.Series, - configuration: ConfigTree, - randomness_stream: RandomnessStream, - additional_key: Any, -) -> pd.Series: +def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series: """ - Function that takes a column and blanks out a configurable portion of its data to be missing. + Function that takes a column and blanks out all values. :param column: pd.Series of data - :param configuration: ConfigTree with rate at which to blank the data in column. - :param randomness_stream: RandomnessStream to utilize Vivarium CRN. - :param additional_key: Key for RandomnessStream - :returns: pd.Series of column with configured amount of data missing as an empty string. 
+ :returns: pd.Series of empty strings with the index of column. """ - # Avoid SettingWithCopyWarning - column = column.copy() - to_noise_idx = _get_to_noise_idx( - column, - configuration, - randomness_stream, - additional_key, - context_key="missing_data_filter", - ) - column.loc[to_noise_idx] = "" - - return column + return pd.Series("", index=column.index) def generate_typographical_errors( @@ -248,8 +245,6 @@ def generate_typographical_errors( :param additional_key: Key for RandomnessStream :returns: pd.Series of column with noised data """ - column = column.copy() - not_missing_idx = column.index[(column.notna()) & (column != "")] with open(paths.QWERTY_ERRORS) as f: qwerty_errors = yaml.full_load(f) @@ -278,15 +273,8 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng): token_noise_level = configuration.token_noise_level include_original_token_level = configuration.include_original_token_level - to_noise_idx = _get_to_noise_idx( - column.loc[not_missing_idx], - configuration, - randomness_stream, - additional_key, - context_key="typographical_noise_filter", - ) rng = np.random.default_rng(seed=randomness_stream.seed) - for idx in to_noise_idx: + for idx in column.index: noised_value = keyboard_corrupt( column[idx], token_noise_level, diff --git a/src/pseudopeople/schema_entities.py b/src/pseudopeople/schema_entities.py new file mode 100644 index 00000000..a2e584cf --- /dev/null +++ b/src/pseudopeople/schema_entities.py @@ -0,0 +1,27 @@ +from enum import Enum +from typing import NamedTuple + + +# todo: is "form" the right word? 
Ask RT +class Form(Enum): + CENSUS = "decennial_census" + ACS = "american_communities_survey" + CPS = "current_population_survey" + WIC = "women_infants_and_children" + SSA = "social_security" + TAX_W2_1099 = "taxes_w2_and_1099" + TAX_1040 = "taxes_1040" + + +class __Columns(NamedTuple): + FIRST_NAME: str = "first_name" + MIDDLE_INITIAL: str = "middle_initial" + LAST_NAME: str = "last_name" + STREET_NAME: str = "street_name" + ZIP_CODE: str = "zipcode" + CITY: str = "city" + AGE: str = "age" + # todo finish filling in columns + + +COLUMNS = __Columns() diff --git a/src/pseudopeople/utilities.py b/src/pseudopeople/utilities.py index 9cb2acd3..8d779d62 100644 --- a/src/pseudopeople/utilities.py +++ b/src/pseudopeople/utilities.py @@ -1,11 +1,12 @@ from pathlib import Path -from typing import Union +from typing import Any, Union +import numpy as np import pandas as pd from vivarium.framework.configuration import ConfigTree -from vivarium.framework.randomness import RandomnessStream +from vivarium.framework.randomness import RandomnessStream, random -from pseudopeople.entities import Form +from pseudopeople.schema_entities import Form def get_randomness_stream(form: Form, seed: int) -> RandomnessStream: @@ -32,3 +33,72 @@ def get_configuration(user_yaml_path: Union[Path, str] = None) -> ConfigTree: if user_yaml_path: noising_configuration.update(user_yaml_path, layer="user") return noising_configuration + + +def vectorized_choice( + options: Union[list, pd.Series], + n_to_choose: int, + randomness_stream: RandomnessStream = None, + weights: Union[list, pd.Series] = None, + additional_key: Any = None, + random_seed: int = None, +): + """ + Function that takes a list of options and uses Vivarium common random numbers framework to make a given number + of razndom choice selections. 
+ + :param options: List and series of possible values to choose + :param n_to_choose: Number of choices to make, the length of the returned array of values + :param randomness_stream: RandomnessStream being used for Vivarium's CRN framework + :param weights: List or series containing weights for each options + :param additional_key: Key to pass to randomness_stream + :param random_seed: Seed to pass to randomness_stream. + Note additional_key and random_seed are used to make calls using a RandomnessStream unique + + returns: ndarray + """ + if not randomness_stream and (additional_key == None and random_seed == None): + raise RuntimeError( + "An additional_key and a random_seed are required in 'vectorized_choice'" + + "if no RandomnessStream is passed in" + ) + if weights is None: + n = len(options) + weights = np.ones(n) / n + # for each of n_to_choose, sample uniformly between 0 and 1 + index = pd.Index(np.arange(n_to_choose)) + if randomness_stream is None: + # Generate an additional_key on-the-fly and use that in randomness.random + additional_key = f"{additional_key}_{random_seed}" + probs = random(str(additional_key), index) + else: + probs = randomness_stream.get_draw(index, additional_key=additional_key) + + # build cdf based on weights + pmf = weights / weights.sum() + cdf = np.cumsum(pmf) + + # for each p_i in probs, count how many elements of cdf for which p_i >= cdf_i + chosen_indices = np.searchsorted(cdf, probs, side="right") + return np.take(options, chosen_indices) + + +def get_index_to_noise( + column: pd.Series, + noise_level: float, + randomness_stream: RandomnessStream, + additional_key: Any, +) -> pd.Index: + """ + Function that takes a series and returns a pd.Index that chosen by Vivarium Common Random Number to be noised. 
+ """ + + # Get rows to noise + not_empty_idx = column.index[(column != "") & (column.notna())] + to_noise_idx = randomness_stream.filter_for_probability( + not_empty_idx, + probability=noise_level, + additional_key=additional_key, + ) + + return to_noise_idx diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 1501aa8c..294f9d81 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -1,12 +1,10 @@ from pathlib import Path from typing import Union -import numpy as np import pandas as pd import pytest from pseudopeople.interface import generate_decennial_census -from pseudopeople.utilities import get_configuration # TODO: possibly parametrize Forms? diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 2ada0f32..9ee09288 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -6,20 +6,7 @@ import pytest from vivarium.framework.randomness import RandomnessStream -from pseudopeople.noise_functions import ( - generate_fake_names, - generate_incorrect_selections, - generate_missing_data, - generate_nicknames, - generate_ocr_errors, - generate_phonetic_errors, - generate_typographical_errors, - generate_within_household_copies, - miswrite_ages, - miswrite_numerics, - miswrite_zipcodes, - swap_months_and_days, -) +from pseudopeople.noise_entities import NOISE_TYPES from pseudopeople.utilities import get_configuration RANDOMNESS0 = RandomnessStream( @@ -39,7 +26,6 @@ def dummy_dataset(): # Add missing data from `generate_missing_data` function missing_idx = pd.Index([x for x in dummy_idx if x % 3 == 0]) integer_series.loc[missing_idx] = "" - # Add a column of character strings str_length = 6 character_series = pd.Series( @@ -56,13 +42,36 @@ def dummy_dataset(): return pd.DataFrame({"numbers": integer_series, "characters": character_series}) -def test_generate_missing_data(dummy_dataset, user_config_path): - config = 
get_configuration(user_config_path)["decennial_census"]["zipcode"][ - "missing_data" - ] +@pytest.fixture(scope="module") +def categorical_series(): + return pd.Series( + ["CA", "WA", "FL", "OR", "CO", "TX", "NY", "VA", "AZ", "''"] * 100_000, name="state" + ) + + +@pytest.fixture(scope="module") +def default_configuration(): + return get_configuration() + + +def test_generate_missing_data(dummy_dataset): + + config = get_configuration() + config.update( + { + "decennial_census": { + "zipcode": { + "missing_data": { + "row_noise_level": 0.25, + }, + }, + }, + } + ) + config = config["decennial_census"]["zipcode"]["missing_data"] data = dummy_dataset["numbers"] noised_data = _validate_seed_and_noise_data( - func=generate_missing_data, column=data, config=config + noise_type=NOISE_TYPES.MISSING_DATA, column=data, config=config ) # Calculate newly missing data, ie data that didn't come in as already missing @@ -82,9 +91,24 @@ def test_generate_missing_data(dummy_dataset, user_config_path): assert (data[not_noised_idx] == noised_data[not_noised_idx]).all() -@pytest.mark.skip(reason="TODO") -def test_generate_incorrect_selections(): - pass +def test_incorrect_selection(categorical_series, default_configuration): + config = default_configuration["decennial_census"]["state"]["incorrect_selection"] + noised_data = _validate_seed_and_noise_data( + noise_type=NOISE_TYPES.INCORRECT_SELECTION, column=categorical_series, config=config + ) + + # Check for expected noise level + expected_noise = config["row_noise_level"] + # todo: Update when generate_incorrect_selection uses exclusive resampling + # Get real expected noise to account for possibility of noising with original value + # Here we have a a possibility of choosing any of the 50 states for our categorical series fixture + expected_noise = expected_noise * (1 - 1 / 50) + actual_noise = (noised_data != categorical_series).mean() + assert np.isclose(expected_noise, actual_noise, rtol=0.02) + + original_empty_idx = 
categorical_series.index[categorical_series == ""] + noised_empty_idx = noised_data.index[noised_data == ""] + pd.testing.assert_index_equal(original_empty_idx, noised_empty_idx) @pytest.mark.skip(reason="TODO") @@ -157,7 +181,7 @@ def test_generate_typographical_errors(dummy_dataset, column): ) config = config["decennial_census"][column]["typographic"] noised_data = _validate_seed_and_noise_data( - func=generate_typographical_errors, column=data, config=config + noise_type=NOISE_TYPES.TYPOGRAPHIC, column=data, config=config ) not_missing_idx = data.index[(data.notna()) & (data != "")] @@ -200,11 +224,13 @@ def test_generate_typographical_errors(dummy_dataset, column): # TODO: refactor this into its own test parameterized by noise functions -def _validate_seed_and_noise_data(func, column, config): +def _validate_seed_and_noise_data(noise_type, column, config): """Confirms randomness stream behavior and returns the noised data""" - noised_data = func(column, config, RANDOMNESS0, f"test_{func.__name__}") - noised_data_same_seed = func(column, config, RANDOMNESS0, f"test_{func.__name__}") - noised_data_different_seed = func(column, config, RANDOMNESS1, f"test_{func.__name__}") + noised_data = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}") + noised_data_same_seed = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}") + noised_data_different_seed = noise_type( + column, config, RANDOMNESS1, f"test_{noise_type.name}" + ) assert (noised_data != column).any() assert (noised_data == noised_data_same_seed).all() diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index 8bf58756..1cfb9388 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -1,13 +1,17 @@ import random from string import ascii_lowercase +from typing import NamedTuple +import numpy as np import pandas as pd import pytest from vivarium.config_tree import ConfigTree -from pseudopeople.entities import Form +from 
pseudopeople.entity_types import ColumnNoiseType, RowNoiseType from pseudopeople.interface import generate_decennial_census -from pseudopeople.noise import NOISE_TYPES, noise_form +from pseudopeople.noise import noise_form +from pseudopeople.noise_entities import NOISE_TYPES +from pseudopeople.schema_entities import Form @pytest.fixture(scope="module") @@ -87,18 +91,27 @@ def test_noise_order(mocker, dummy_data, dummy_config_noise_numbers): # Mock the noise_functions functions so that they are not actually called and # return the original one-column dataframe (so that it doesn't become a mock # object itself after the first mocked function is applied.) + mocker.patch( + "pseudopeople.entity_types.get_index_to_noise", return_value=dummy_data.index + ) for field in NOISE_TYPES._fields: + mock_return = ( + dummy_data[["numbers"]] + if field in ["OMISSION", "DUPLICATION"] + else dummy_data["numbers"] + ) mock.attach_mock( mocker.patch( f"pseudopeople.noise.NOISE_TYPES.{field}.noise_function", - return_value=dummy_data[["numbers"]], + return_value=mock_return, ), field, ) + # FIXME: would be better to mock the form instead of using census noise_form(Form.CENSUS, dummy_data, dummy_config_noise_numbers, 0) - call_order = [call[0] for call in mock.mock_calls] + call_order = [x[0] for x in mock.mock_calls if not x[0].startswith("__")] expected_call_order = [ "OMISSION", "DUPLICATION", @@ -160,3 +173,98 @@ def test_correct_forms_are_used(func, form, mocker): _ = func("dummy/path") assert mock.call_args[0][0] == form + + +def test_two_noise_functions_are_independent(mocker): + + # Make simple config tree to test 2 noise functions work together + config_tree = ConfigTree( + { + "decennial_census": { + "fake_column_one": { + "alpha": {"row_noise_level": 0.20}, + "beta": {"row_noise_level": 0.30}, + }, + "fake_column_two": { + "alpha": {"row_noise_level": 0.40}, + "beta": {"row_noise_level": 0.50}, + }, + } + } + ) + + # Mock objects for testing + + class 
MockNoiseTypes(NamedTuple): + ALPHA: ColumnNoiseType = ColumnNoiseType( + "alpha", lambda column, *_: column.str.cat(pd.Series("abc", index=column.index)) + ) + BETA: ColumnNoiseType = ColumnNoiseType( + "beta", lambda column, *_: column.str.cat(pd.Series("123", index=column.index)) + ) + + mock_noise_types = MockNoiseTypes() + + mocker.patch("pseudopeople.noise.NOISE_TYPES", mock_noise_types) + dummy_form = pd.DataFrame( + { + "fake_column_one": ["cat", "dog", "bird", "bunny", "duck"] * 20_000, + "fake_column_two": ["shoe", "pants", "shirt", "hat", "sunglasses"] * 20_000, + } + ) + + noised_data = noise_form( + form=Form.CENSUS, + form_data=dummy_form, + seed=0, + configuration=config_tree, + ) + + # Get config values for testing + col1_expected_abc_proportion = config_tree["decennial_census"]["fake_column_one"][ + "alpha" + ]["row_noise_level"] + col2_expected_abc_proportion = config_tree["decennial_census"]["fake_column_two"][ + "alpha" + ]["row_noise_level"] + col1_expected_123_proportion = config_tree["decennial_census"]["fake_column_one"]["beta"][ + "row_noise_level" + ] + col2_expected_123_proportion = config_tree["decennial_census"]["fake_column_two"]["beta"][ + "row_noise_level" + ] + + assert np.isclose( + noised_data["fake_column_one"].str.contains("abc").mean(), + col1_expected_abc_proportion, + rtol=0.01, + ) + assert np.isclose( + noised_data["fake_column_two"].str.contains("abc").mean(), + col2_expected_abc_proportion, + rtol=0.01, + ) + assert np.isclose( + noised_data["fake_column_one"].str.contains("123").mean(), + col1_expected_123_proportion, + rtol=0.01, + ) + assert np.isclose( + noised_data["fake_column_two"].str.contains("123").mean(), + col2_expected_123_proportion, + rtol=0.01, + ) + + # Assert columns experience both noise + assert np.isclose( + noised_data["fake_column_one"].str.contains("abc123").mean(), + col1_expected_abc_proportion * col1_expected_123_proportion, + rtol=0.01, + ) + assert np.isclose( + 
noised_data["fake_column_two"].str.contains("abc123").mean(), + col2_expected_abc_proportion * col2_expected_123_proportion, + rtol=0.01, + ) + assert noised_data["fake_column_one"].str.contains("123abc").sum() == 0 + assert noised_data["fake_column_two"].str.contains("123abc").sum() == 0 From 86a5722fbfdce9e4be12a356d8ab17ad78f630b2 Mon Sep 17 00:00:00 2001 From: albrja <37345113+albrja@users.noreply.github.com> Date: Thu, 30 Mar 2023 12:12:20 -0700 Subject: [PATCH 6/7] Change tax_form column name and data values. (#22) Update to incorrect_selection.csv Updates data to align with changes to is_w2 (now tax_form) column in post-processing. - *Category*: Other - *JIRA issue*: [MIC-3937](https://jira.ihme.washington.edu/browse/MIC-3937) -Changes is_w2 column to tax_form -Changes data values from bool to "W2" and "1099" Testing All tests pass. --- .../data/incorrect_select_options.csv | 6 +++--- src/pseudopeople/noise_functions.py | 15 --------------- 2 files changed, 3 insertions(+), 18 deletions(-) diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv index bb60c488..e4939387 100644 --- a/src/pseudopeople/data/incorrect_select_options.csv +++ b/src/pseudopeople/data/incorrect_select_options.csv @@ -1,6 +1,6 @@ -state,relation_to_household_head,sex,race_ethnicity,is_w2,event_type -AL,Reference person,Female,White,True,creation -AK,Opp-sex spouse,Male,Black,False,death +state,relation_to_household_head,sex,race_ethnicity,tax_form,event_type +AL,Reference person,Female,White,W2,creation +AK,Opp-sex spouse,Male,Black,1099,death AZ,Opp-sex partner,,Asian,, AR,Same-sex spouse,,AIAN,, CA,Same-sex partne,,NHOPI,, diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index bfeb0acf..b4b8110a 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -302,18 +302,3 @@ def generate_ocr_errors( """ # todo actually generate OCR errors return column - 
- -#################### -# HELPER FUNCTIONS # -#################### -def _get_to_noise_idx(column, configuration, randomness_stream, additional_key, context_key): - noise_level = configuration.row_noise_level - # Get rows to noise - to_noise_idx = randomness_stream.filter_for_probability( - column.index, - probability=noise_level, - additional_key=f"{additional_key}_{context_key}", - ) - - return to_noise_idx From 9cdf118eaf6b19ddab9340a7ec11cd334c645528 Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Fri, 31 Mar 2023 13:24:48 -0700 Subject: [PATCH 7/7] release candidate v0.2.0 (#23) --- CHANGELOG.rst | 6 ++++++ src/pseudopeople/__about__.py | 2 +- 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.rst b/CHANGELOG.rst index 6a1e6fd4..806058a5 100644 --- a/CHANGELOG.rst +++ b/CHANGELOG.rst @@ -1,3 +1,9 @@ +**0.2.0 - 03/31/23** + + - Implemented W2/1099 forms + - Implemented typographic noise function + - Implemented incorrect selection noise function + **0.1.0 - 03/23/23** - Initial release diff --git a/src/pseudopeople/__about__.py b/src/pseudopeople/__about__.py index 7644dc29..c8ae9993 100644 --- a/src/pseudopeople/__about__.py +++ b/src/pseudopeople/__about__.py @@ -13,7 +13,7 @@ __summary__ = "pseudopeople is package which adds noise to simulated census-scale data using standard scientific Python tools." __uri__ = "https://github.com/ihmeuw/pseudopeople" -__version__ = "0.1.0" +__version__ = "0.2.0" __author__ = "The pseudopeople developers" __email__ = "vivarium.dev@gmail.com"