Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Read data from an HDF file rather than a CSV #29

Merged
merged 5 commits into from
Apr 5, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions src/pseudopeople/data/incorrect_select_options.csv
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ AL,Reference person,Female,White,W2,creation
AK,Opp-sex spouse,Male,Black,1099,death
AZ,Opp-sex partner,,Asian,,
AR,Same-sex spouse,,AIAN,,
CA,Same-sex partne,,NHOPI,,
CA,Same-sex partner,,NHOPI,,
CO,Biological child,,Multiracial or Other,,
CT,Adopted child,,Latino,,
DE,Stepchild,,,,
Expand All @@ -16,7 +16,7 @@ IN,Other relative,,,,
IA,Roommate,,,,
KS,Foster child,,,,
KY,Other nonrelative,,,,
LA,Institutionalized GQ po,,,,
LA,Institutionalized GQ pop,,,,
ME,Noninstitutionalized GQ pop,,,,
MD,,,,,
MA,,,,,
Expand Down
6 changes: 3 additions & 3 deletions src/pseudopeople/entity_types.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ class RowNoiseType:
"""

name: str
noise_function: Callable[[pd.DataFrame, float, RandomnessStream, str], pd.DataFrame]
noise_function: Callable[[pd.DataFrame, float, RandomnessStream], pd.DataFrame]

def __call__(
self,
Expand Down Expand Up @@ -63,8 +63,8 @@ def __call__(
to_noise_idx = get_index_to_noise(
column, noise_level, randomness_stream, f"{self.name}_{additional_key}"
)
column.loc[to_noise_idx] = self.noise_function(
noised_data = self.noise_function(
column.loc[to_noise_idx], configuration, randomness_stream, additional_key
)

column.loc[to_noise_idx] = noised_data
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Was this actually causing a problem or do you just find this more readable?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This just made debugging easier since I could put a breakpoint between the function call and the assignment to the series.

return column
4 changes: 3 additions & 1 deletion src/pseudopeople/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,9 @@ def _generate_form(
if isinstance(source, pd.DataFrame):
data = source
else:
data = pd.read_csv(source, dtype=str, keep_default_na=False)
data = pd.read_hdf(source)
if not isinstance(data, pd.DataFrame):
raise TypeError(f"File located at {source} must contain a pandas DataFrame.")
return noise_form(form, data, configuration_tree, seed)


Expand Down
3 changes: 2 additions & 1 deletion src/pseudopeople/noise_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -270,7 +270,7 @@ def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series:
:returns: pd.Series of empty strings with the index of column.
"""

return pd.Series("", index=column.index)
return pd.Series(pd.NA, index=column.index)


def generate_typographical_errors(
Expand Down Expand Up @@ -322,6 +322,7 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng):
include_original_token_level = configuration.include_original_token_level

rng = np.random.default_rng(seed=randomness_stream.seed)
column = column.astype(str)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This will convert any NaNs to the string "nan" and then proceed to corrupt that string. We shouldn't have any NaNs at this point though, right? Because those get dropped up front when this gets called?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's correct, but it's definitely great to call this out.

for idx in column.index:
noised_value = keyboard_corrupt(
column[idx],
Expand Down
4 changes: 2 additions & 2 deletions tests/integration/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,7 @@ def decennial_census_data_path(tmp_path_factory):
}
)

data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv"
data.to_csv(data_path, index=False)
data_path = tmp_path_factory.getbasetemp() / "dummy_data.hdf"
data.to_hdf(data_path, "data")

return data_path
2 changes: 1 addition & 1 deletion tests/integration/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
def test_generate_decennial_census(
decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str]
):
data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False)
data = pd.read_hdf(decennial_census_data_path)

# TODO: Refactor this check into a separate test
noised_data = generate_decennial_census(
Expand Down
11 changes: 7 additions & 4 deletions tests/unit/test_column_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -113,7 +113,7 @@ def test_generate_missing_data(dummy_dataset):
# Calculate newly missing data, ie data that didn't come in as already missing
orig_non_missing_idx = data.index[(data.notna()) & (data != "")]
newly_missing_idx = noised_data.index[
(noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "")
(noised_data.index.isin(orig_non_missing_idx)) & (noised_data.isna())
]

# Check for expected noise level
Expand All @@ -122,8 +122,7 @@ def test_generate_missing_data(dummy_dataset):
assert np.isclose(expected_noise, actual_noise, rtol=0.02)

# Check that un-noised values are unchanged
not_noised_idx = noised_data.index[noised_data != ""]
assert "" not in noised_data[not_noised_idx].values
not_noised_idx = noised_data.index[noised_data.notna()]
assert (data[not_noised_idx] == noised_data[not_noised_idx]).all()


Expand Down Expand Up @@ -513,5 +512,9 @@ def test_seeds_behave_as_expected(noise_type, data_col, form, form_col, dummy_da
noised_data_different_seed = noise_type(data, config, RANDOMNESS1, f"test_{noise}")

assert (noised_data != data).any()
assert (noised_data == noised_data_same_seed).all()
assert (noised_data.isna() == noised_data_same_seed.isna()).all()
assert (
noised_data[noised_data.notna()]
== noised_data_same_seed[noised_data_same_seed.notna()]
).all()
assert (noised_data != noised_data_different_seed).any()
4 changes: 2 additions & 2 deletions tests/unit/test_noise_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
import pytest
from vivarium.config_tree import ConfigTree

from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
from pseudopeople.entity_types import ColumnNoiseType
from pseudopeople.interface import generate_decennial_census
from pseudopeople.noise import noise_form
from pseudopeople.noise_entities import NOISE_TYPES
Expand Down Expand Up @@ -169,7 +169,7 @@ def test_correct_forms_are_used(func, form, mocker):
if func == "todo":
pytest.skip(reason=f"TODO: implement function for {form.value} form")
mock = mocker.patch("pseudopeople.interface.noise_form")
mocker.patch("pseudopeople.interface.pd.read_csv")
mocker.patch("pseudopeople.interface.pd.read_hdf", return_value=pd.DataFrame())
_ = func("dummy/path")

assert mock.call_args[0][0] == form
Expand Down