From 595dec6933b2aee44e395d40575cb79788fa959e Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Mon, 3 Apr 2023 18:27:27 -0700 Subject: [PATCH 1/3] read data from an hdf rather than a csv --- src/pseudopeople/data/incorrect_select_options.csv | 4 ++-- src/pseudopeople/entity_types.py | 6 +++--- src/pseudopeople/interface.py | 8 ++++++-- src/pseudopeople/noise_functions.py | 3 ++- tests/integration/conftest.py | 4 ++-- tests/integration/test_interface.py | 2 +- tests/unit/test_column_noise.py | 11 +++++++---- tests/unit/test_noise_form.py | 2 +- 8 files changed, 24 insertions(+), 16 deletions(-) diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv index e4939387..67fd1629 100644 --- a/src/pseudopeople/data/incorrect_select_options.csv +++ b/src/pseudopeople/data/incorrect_select_options.csv @@ -3,7 +3,7 @@ AL,Reference person,Female,White,W2,creation AK,Opp-sex spouse,Male,Black,1099,death AZ,Opp-sex partner,,Asian,, AR,Same-sex spouse,,AIAN,, -CA,Same-sex partne,,NHOPI,, +CA,Same-sex partner,,NHOPI,, CO,Biological child,,Multiracial or Other,, CT,Adopted child,,Latino,, DE,Stepchild,,,, @@ -16,7 +16,7 @@ IN,Other relative,,,, IA,Roommate,,,, KS,Foster child,,,, KY,Other nonrelative,,,, -LA,Institutionalized GQ po,,,, +LA,Institutionalized GQ pop,,,, ME,Noninstitutionalized GQ pop,,,, MD,,,,, MA,,,,, diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py index 500cf27c..7749ce7b 100644 --- a/src/pseudopeople/entity_types.py +++ b/src/pseudopeople/entity_types.py @@ -23,7 +23,7 @@ class RowNoiseType: """ name: str - noise_function: Callable[[pd.DataFrame, float, RandomnessStream, str], pd.DataFrame] + noise_function: Callable[[pd.DataFrame, float, RandomnessStream], pd.DataFrame] def __call__( self, @@ -63,8 +63,8 @@ def __call__( to_noise_idx = get_index_to_noise( column, noise_level, randomness_stream, f"{self.name}_{additional_key}" ) - column.loc[to_noise_idx] = self.noise_function( + noised_data = self.noise_function( column.loc[to_noise_idx], configuration, randomness_stream, additional_key ) - + column.loc[to_noise_idx] = noised_data return column diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index d767bc26..48edd87b 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -25,7 +25,9 @@ def generate_decennial_census( :return: A pd.DataFrame of noised census data """ configuration_tree = get_configuration(configuration) - data = pd.read_csv(path, dtype=str, keep_default_na=False) + data = pd.read_hdf(path) + if not isinstance(data, pd.DataFrame): + raise TypeError(f"File located at {path} must contain a pandas DataFrame.") return noise_form(Form.CENSUS, data, configuration_tree, seed) @@ -41,7 +43,9 @@ def generate_w2( :return: A pd.DataFrame of noised W2 data """ configuration_tree = get_configuration(configuration) - data = pd.read_csv(path, dtype=str, keep_default_na=False) + data = pd.read_hdf(path) + if not isinstance(data, pd.DataFrame): + raise TypeError(f"File located at {path} must contain a pandas DataFrame.") return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed) diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index 2a2d6f97..ee8669bb 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -256,7 +256,7 @@ def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series: :returns: pd.Series of empty strings with the index of column. """ - return pd.Series("", index=column.index) + return pd.Series(pd.NA, index=column.index) def generate_typographical_errors( @@ -303,6 +303,7 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng): include_original_token_level = configuration.include_original_token_level rng = np.random.default_rng(seed=randomness_stream.seed) + column = column.astype(str) for idx in column.index: noised_value = keyboard_corrupt( column[idx], diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index 060a5174..7eb8e3c1 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -169,7 +169,7 @@ def decennial_census_data_path(tmp_path_factory): } ) - data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv" - data.to_csv(data_path, index=False) + data_path = tmp_path_factory.getbasetemp() / "dummy_data.hdf" + data.to_hdf(data_path, "data") return data_path diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 294f9d81..d7f4a4bb 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -11,7 +11,7 @@ def test_generate_decennial_census( decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str] ): - data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False) + data = pd.read_hdf(decennial_census_data_path) # TODO: Refactor this check into a separate test noised_data = generate_decennial_census( diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 1c4871bf..3e7eeffb 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -84,7 +84,7 @@ def test_generate_missing_data(dummy_dataset): # Calculate newly missing data, ie data that didn't come in as already missing orig_non_missing_idx = data.index[(data.notna()) & (data != "")] newly_missing_idx = noised_data.index[ - (noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "") + (noised_data.index.isin(orig_non_missing_idx)) & (noised_data.isna()) ] # Check for expected noise level @@ -93,8 +93,7 @@ def test_generate_missing_data(dummy_dataset): assert np.isclose(expected_noise, actual_noise, rtol=0.02) # Check that un-noised values are unchanged - not_noised_idx = noised_data.index[noised_data != ""] - assert "" not in noised_data[not_noised_idx].values + not_noised_idx = noised_data.index[noised_data.notna()] assert (data[not_noised_idx] == noised_data[not_noised_idx]).all() @@ -337,7 +336,11 @@ def _validate_seed_and_noise_data(noise_type, column, config): ) assert (noised_data != column).any() - assert (noised_data == noised_data_same_seed).all() + assert (noised_data.isna() == noised_data_same_seed.isna()).all() + assert ( + noised_data[noised_data.notna()] + == noised_data_same_seed[noised_data_same_seed.notna()] + ).all() assert (noised_data != noised_data_different_seed).any() return noised_data diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index ee099f35..470d3bbc 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker): if func == "todo": pytest.skip(reason=f"TODO: implement function for {form.value} form") mock = mocker.patch("pseudopeople.interface.noise_form") - mocker.patch("pseudopeople.interface.pd") + mocker.patch("pseudopeople.interface.pd.read_hdf", return_value=pd.DataFrame()) _ = func("dummy/path") assert mock.call_args[0][0] == form From 331b67bb8b9b92a68fae9b71f0fb62bab13d711c Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:33:06 -0700 Subject: [PATCH 2/3] fix bug introduced during merge --- tests/unit/test_column_noise.py | 1 - 1 file changed, 1 deletion(-) diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index d9a97c6f..789a83bd 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -517,5 +517,4 @@ def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_da noised_data[noised_data.notna()] == noised_data_same_seed[noised_data_same_seed.notna()] ).all() - assert (noised_data == noised_data_same_seed).all() assert (noised_data != noised_data_different_seed).any() From 8a3dafd50bcb219d8257c308dc7ffa2e934b2bc3 Mon Sep 17 00:00:00 2001 From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com> Date: Tue, 4 Apr 2023 17:35:47 -0700 Subject: [PATCH 3/3] formatting --- tests/unit/test_column_noise.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 789a83bd..0a1b95bc 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -514,7 +514,7 @@ def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_da assert (noised_data != data).any() assert (noised_data.isna() == noised_data_same_seed.isna()).all() assert ( - noised_data[noised_data.notna()] - == noised_data_same_seed[noised_data_same_seed.notna()] + noised_data[noised_data.notna()] + == noised_data_same_seed[noised_data_same_seed.notna()] ).all() assert (noised_data != noised_data_different_seed).any()