diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv index e4939387..67fd1629 100644 --- a/src/pseudopeople/data/incorrect_select_options.csv +++ b/src/pseudopeople/data/incorrect_select_options.csv @@ -3,7 +3,7 @@ AL,Reference person,Female,White,W2,creation AK,Opp-sex spouse,Male,Black,1099,death AZ,Opp-sex partner,,Asian,, AR,Same-sex spouse,,AIAN,, -CA,Same-sex partne,,NHOPI,, +CA,Same-sex partner,,NHOPI,, CO,Biological child,,Multiracial or Other,, CT,Adopted child,,Latino,, DE,Stepchild,,,, @@ -16,7 +16,7 @@ IN,Other relative,,,, IA,Roommate,,,, KS,Foster child,,,, KY,Other nonrelative,,,, -LA,Institutionalized GQ po,,,, +LA,Institutionalized GQ pop,,,, ME,Noninstitutionalized GQ pop,,,, MD,,,,, MA,,,,, diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py index 500cf27c..7749ce7b 100644 --- a/src/pseudopeople/entity_types.py +++ b/src/pseudopeople/entity_types.py @@ -23,7 +23,7 @@ class RowNoiseType: """ name: str - noise_function: Callable[[pd.DataFrame, float, RandomnessStream, str], pd.DataFrame] + noise_function: Callable[[pd.DataFrame, float, RandomnessStream], pd.DataFrame] def __call__( self, @@ -63,8 +63,8 @@ def __call__( to_noise_idx = get_index_to_noise( column, noise_level, randomness_stream, f"{self.name}_{additional_key}" ) - column.loc[to_noise_idx] = self.noise_function( + noised_data = self.noise_function( column.loc[to_noise_idx], configuration, randomness_stream, additional_key ) - + column.loc[to_noise_idx] = noised_data return column diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py index 08dfb293..7a543478 100644 --- a/src/pseudopeople/interface.py +++ b/src/pseudopeople/interface.py @@ -33,7 +33,9 @@ def _generate_form( if isinstance(source, pd.DataFrame): data = source else: - data = pd.read_csv(source, dtype=str, keep_default_na=False) + data = pd.read_hdf(source) + if not isinstance(data, pd.DataFrame): + raise TypeError(f"File located at {source} must contain a pandas DataFrame.") return noise_form(form, data, configuration_tree, seed) diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py index e92a58ce..03826d95 100644 --- a/src/pseudopeople/noise_functions.py +++ b/src/pseudopeople/noise_functions.py @@ -270,7 +270,7 @@ def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series: :returns: pd.Series of empty strings with the index of column. """ - return pd.Series("", index=column.index) + return pd.Series(pd.NA, index=column.index) def generate_typographical_errors( @@ -322,6 +322,7 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng): include_original_token_level = configuration.include_original_token_level rng = np.random.default_rng(seed=randomness_stream.seed) + column = column.astype(str) for idx in column.index: noised_value = keyboard_corrupt( column[idx], diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py index c8dfe389..c20a62bc 100644 --- a/tests/integration/conftest.py +++ b/tests/integration/conftest.py @@ -163,7 +163,7 @@ def decennial_census_data_path(tmp_path_factory): } ) - data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv" - data.to_csv(data_path, index=False) + data_path = tmp_path_factory.getbasetemp() / "dummy_data.hdf" + data.to_hdf(data_path, "data") return data_path diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py index 605c1d0c..e2fda8ca 100644 --- a/tests/integration/test_interface.py +++ b/tests/integration/test_interface.py @@ -11,7 +11,7 @@ def test_generate_decennial_census( decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str] ): - data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False) + data = pd.read_hdf(decennial_census_data_path) # TODO: Refactor this check into a separate test noised_data = generate_decennial_census( diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py index 2e05796a..0a1b95bc 100644 --- a/tests/unit/test_column_noise.py +++ b/tests/unit/test_column_noise.py @@ -113,7 +113,7 @@ def test_generate_missing_data(dummy_dataset): # Calculate newly missing data, ie data that didn't come in as already missing orig_non_missing_idx = data.index[(data.notna()) & (data != "")] newly_missing_idx = noised_data.index[ - (noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "") + (noised_data.index.isin(orig_non_missing_idx)) & (noised_data.isna()) ] # Check for expected noise level @@ -122,8 +122,7 @@ def test_generate_missing_data(dummy_dataset): assert np.isclose(expected_noise, actual_noise, rtol=0.02) # Check that un-noised values are unchanged - not_noised_idx = noised_data.index[noised_data != ""] - assert "" not in noised_data[not_noised_idx].values + not_noised_idx = noised_data.index[noised_data.notna()] assert (data[not_noised_idx] == noised_data[not_noised_idx]).all() @@ -513,5 +512,9 @@ def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_da noised_data_different_seed = noise_type(data, config, RANDOMNESS1, f"test_{noise}") assert (noised_data != data).any() - assert (noised_data == noised_data_same_seed).all() + assert (noised_data.isna() == noised_data_same_seed.isna()).all() + assert ( + noised_data[noised_data.notna()] + == noised_data_same_seed[noised_data_same_seed.notna()] + ).all() assert (noised_data != noised_data_different_seed).any() diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py index bedefab1..0459ea91 100644 --- a/tests/unit/test_noise_form.py +++ b/tests/unit/test_noise_form.py @@ -7,7 +7,7 @@ import pytest from vivarium.config_tree import ConfigTree -from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType +from pseudopeople.entity_types import ColumnNoiseType from pseudopeople.interface import generate_decennial_census from pseudopeople.noise import noise_form from pseudopeople.noise_entities import NOISE_TYPES @@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker): if func == "todo": pytest.skip(reason=f"TODO: implement function for {form.value} form") mock = mocker.patch("pseudopeople.interface.noise_form") - mocker.patch("pseudopeople.interface.pd.read_csv") + mocker.patch("pseudopeople.interface.pd.read_hdf", return_value=pd.DataFrame()) _ = func("dummy/path") assert mock.call_args[0][0] == form