Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Copy from household member noise function implementation #191

Merged
merged 18 commits into from
Jun 20, 2023
Merged
Show file tree
Hide file tree
Changes from 17 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions src/pseudopeople/column_getters.py
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Maybe rename this to utilities.py so it's more generic and usaable for future utility functions

Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from typing import List

from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS


def copy_from_household_member_column_getter(column_name) -> List[str]:
return [COPY_HOUSEHOLD_MEMBER_COLS[column_name]]
7 changes: 7 additions & 0 deletions src/pseudopeople/configuration/generator.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,13 @@
Keys.ROW_PROBABILITY: 0.005,
},
},
Keys.COLUMN_NOISE: {
COLUMNS.ssn.name: {
NOISE_TYPES.copy_from_household_member.name: {
Keys.CELL_PROBABILITY: 0.00,
}
},
},
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This isn't necessary, since below we are setting the noise level on the SSA dataset to 0 for all noise types.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes this is no noise for copy_from_household_member for W2.

},
# No noise of any kind for SSN in the SSA observer
DATASETS.ssa.name: {
Expand Down
8 changes: 8 additions & 0 deletions src/pseudopeople/constants/metadata.py
Original file line number Diff line number Diff line change
Expand Up @@ -85,3 +85,11 @@ class Attributes:
"DISTRICT OF COLUMBIA": "DC",
# "PUERTO RICO": "PR",
}


COPY_HOUSEHOLD_MEMBER_COLS = {
"age": "copy_age",
"date_of_birth": "copy_date_of_birth",
"ssn": "copy_ssn",
# TODO: add itins
}
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
12 changes: 7 additions & 5 deletions src/pseudopeople/noise_entities.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import NamedTuple

from pseudopeople import noise_functions, noise_scaling
from pseudopeople import column_getters, noise_functions, noise_scaling
from pseudopeople.configuration import Keys
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType

Expand Down Expand Up @@ -30,10 +30,12 @@ class __NoiseTypes(NamedTuple):
noise_functions.choose_wrong_options,
noise_level_scaling_function=noise_scaling.scale_choose_wrong_option,
)
# copy_from_household_member: ColumnNoiseType = ColumnNoiseType(
# "copy_from_household_member",
# noise_functions.copy_from_household_members,
# )
copy_from_household_member: ColumnNoiseType = ColumnNoiseType(
"copy_from_household_member",
noise_functions.copy_from_household_member,
noise_level_scaling_function=noise_scaling.scale_copy_from_household_member,
additional_column_getter=column_getters.copy_from_household_member_column_getter,
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I haven't reviewed this closely, but I'm surprised not to see a noise_level_scaling_function here.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You were right I forgot to port that over from the other branch but it is implemented now.

Copy link
Contributor

@stevebachmeier stevebachmeier Jun 16, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't understand how this is working - don't you need to add this to the class ColumnNoiseType attributes alongside additional_parameters, etc?

Edit: nevermind, in the other PR

)
swap_month_and_day: ColumnNoiseType = ColumnNoiseType(
"swap_month_and_day",
noise_functions.swap_months_and_days,
Expand Down
38 changes: 22 additions & 16 deletions src/pseudopeople/noise_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,11 @@

from pseudopeople.configuration import Keys
from pseudopeople.constants import data_values, paths
from pseudopeople.constants.metadata import Attributes, DatasetNames
from pseudopeople.constants.metadata import (
COPY_HOUSEHOLD_MEMBER_COLS,
Attributes,
DatasetNames,
)
from pseudopeople.data.fake_names import fake_first_names, fake_last_names
from pseudopeople.exceptions import ConfigurationError
from pseudopeople.noise_scaling import load_nicknames_data
Expand Down Expand Up @@ -180,22 +184,24 @@ def choose_wrong_options(
return pd.Series(new_values, index=data.index, name=column_name)


# def copy_from_household_members(
# column: pd.Series,
# configuration: ConfigTree,
# randomness_stream: RandomnessStream,
# additional_key: Any,
# ) -> pd.Series:
# """
def copy_from_household_member(
data: pd.DataFrame,
configuration: ConfigTree,
randomness_stream: RandomnessStream,
column_name: str,
) -> pd.Series:
"""

# :param column:
# :param configuration:
# :param randomness_stream:
# :param additional_key: Key for RandomnessStream
# :return:
# """
# # todo actually duplicate rows
# return column
:param data: A pandas dataframe containing necessary columns for column noise
:param _: ConfigTree with rate at which to blank the data in column.
:param randomness_stream: RandomnessStream to utilize Vivarium CRN.
:param column_name: String for column that will be noised, will be the key for RandomnessStream
:returns: pd.Series where data has been noised with other values from a list of possibilities
"""

copy_values = data[COPY_HOUSEHOLD_MEMBER_COLS[column_name]]
column = pd.Series(copy_values, index=data.index, name=column_name)
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This looks right, but this seems clearer to me:

column = (
    data[COPY_HOUSEHOLD_MEMBER_COLS[column_name]].copy()
    .rename(column_name)
)

Feel free to leave it as is if you disagree.

return column


def swap_months_and_days(
Expand Down
11 changes: 10 additions & 1 deletion src/pseudopeople/noise_scaling.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd

from pseudopeople.constants import paths
from pseudopeople.constants import metadata, paths


def scale_choose_wrong_option(data: pd.DataFrame, column_name: str) -> float:
Expand Down Expand Up @@ -38,6 +38,15 @@ def scale_nicknames(data: pd.DataFrame, column_name: str) -> float:
return 1 / proportion_have_nickname


def scale_copy_from_household_member(data: pd.DataFrame, column_name: str) -> float:
copy_column = data[metadata.COPY_HOUSEHOLD_MEMBER_COLS[column_name]]
eligible_idx = copy_column.index[(copy_column != "") & (copy_column.notna())]
proportion_eligible = len(eligible_idx) / len(data)
if proportion_eligible == 0.0:
return 0.0
return 1 / proportion_eligible
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Do we actually need this? Aren't all ineligible rows filtered out with the call to get_index_to_noise in the column noise type's __call__ method?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is scaling the noise proportion just like we did for nicknames. Similar to how we have a dataset where we are choosing a certain number of rows for our noise level, we are choosing more of the eligible rows to noise due to those who are ineligible from PRL.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think maybe it is confusing because it looks similar to getting the rows that have NAs like in get_index_to_noise but we need to scale it based on NAs.



####################
# Helper functions #
####################
Expand Down
8 changes: 4 additions & 4 deletions src/pseudopeople/schema_entities.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ class __Columns(NamedTuple):
"age",
(
NOISE_TYPES.leave_blank,
# NOISE_TYPES.copy_from_within_household,
NOISE_TYPES.copy_from_household_member,
NOISE_TYPES.misreport_age,
NOISE_TYPES.make_ocr_errors,
NOISE_TYPES.make_typos,
Expand All @@ -48,7 +48,7 @@ class __Columns(NamedTuple):
"date_of_birth",
(
NOISE_TYPES.leave_blank,
# NOISE_TYPES.copy_from_within_household,
NOISE_TYPES.copy_from_household_member,
NOISE_TYPES.swap_month_and_day,
NOISE_TYPES.write_wrong_digits,
NOISE_TYPES.make_ocr_errors,
Expand Down Expand Up @@ -147,7 +147,7 @@ class __Columns(NamedTuple):
"itin",
(
NOISE_TYPES.leave_blank,
# NOISE_TYPES.copy_from_within_household,
# NOISE_TYPES.copy_from_household_member,
NOISE_TYPES.write_wrong_digits,
NOISE_TYPES.make_ocr_errors,
NOISE_TYPES.make_typos,
Expand Down Expand Up @@ -284,7 +284,7 @@ class __Columns(NamedTuple):
"ssn",
(
NOISE_TYPES.leave_blank,
# NOISE_TYPES.copy_from_within_household,
NOISE_TYPES.copy_from_household_member,
NOISE_TYPES.write_wrong_digits,
NOISE_TYPES.make_ocr_errors,
NOISE_TYPES.make_typos,
Expand Down
2 changes: 2 additions & 0 deletions tests/integration/test_interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -100,6 +100,8 @@ def test_generate_dataset_from_sample_and_source(
# we special-case a few sparse columns that have larger differences
if dataset_name == DATASETS.cps.name and col == COLUMNS.unit_number.name:
rtol = 0.30
elif dataset_name == DATASETS.acs.name and col == COLUMNS.unit_number.name:
rtol = 0.25
else:
rtol = 0.12
assert np.isclose(noise_level_full_dataset, noise_level_single_dataset, rtol=rtol)
Expand Down
42 changes: 36 additions & 6 deletions tests/unit/test_column_noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
from vivarium.framework.randomness.index_map import IndexMap

from pseudopeople.configuration import Keys, get_configuration
from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS
from pseudopeople.data.fake_names import fake_first_names, fake_last_names
from pseudopeople.noise_entities import NOISE_TYPES
from pseudopeople.schema_entities import DATASETS
Expand Down Expand Up @@ -95,6 +96,8 @@ def dummy_dataset():
date_of_birth_series = pd.Series(
date_of_birth_list * int(num_simulants / len(date_of_birth_list))
)
copy_age_list = ["10", "20", "30", "40", "50", "60", "70", "80", "", "100"]
copy_age_series = pd.Series(copy_age_list * int(num_simulants / len(copy_age_list)))

return pd.DataFrame(
{
Expand All @@ -108,6 +111,7 @@ def dummy_dataset():
"last_name": last_name_series,
"event_date": event_date_series,
"date_of_birth": date_of_birth_series,
"copy_age": copy_age_series,
}
)

Expand Down Expand Up @@ -184,9 +188,32 @@ def test_choose_wrong_option(dummy_dataset):
pd.testing.assert_index_equal(original_empty_idx, noised_empty_idx)


@pytest.mark.skip(reason="TODO")
def test_copy_from_household_member():
pass
def test_generate_copy_from_household_member(dummy_dataset):
config = get_configuration()[DATASETS.census.name][Keys.COLUMN_NOISE]["age"][
NOISE_TYPES.copy_from_household_member.name
]
data = dummy_dataset[["age", "copy_age"]]
noised_data = NOISE_TYPES.copy_from_household_member(data, config, RANDOMNESS0, "age")

# Check for expected noise level
expected_noise = config[Keys.CELL_PROBABILITY]
original_missing_idx = data.index[data["age"] == ""]
eligible_for_noise_idx = data.index.difference(original_missing_idx)
data = data["age"]
actual_noise = (
noised_data[eligible_for_noise_idx] != data[eligible_for_noise_idx]
).mean()
is_close_wrapper(actual_noise, expected_noise, 0.02)

# Noised values should be the same as the copy column
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You should also check that all simulants ineligible for noise remain unchanged.

was_noised_series = noised_data[eligible_for_noise_idx] != data[eligible_for_noise_idx]
noised_idx = was_noised_series[was_noised_series].index
assert (
dummy_dataset.loc[noised_idx, COPY_HOUSEHOLD_MEMBER_COLS["age"]]
== noised_data.loc[noised_idx]
).all()
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You can send a mask or an index in lines 212 and 213, so no need to convert the mask to an index..

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I've changed the variable name from mask. It technically was a mask because it was a bool series but it is not the same length as dummy_dataset and noised_data which is why I need to find the index.

not_noised_idx = dummy_dataset.index.difference(noised_idx)
assert (dummy_dataset.loc[not_noised_idx, "age"] == noised_data.loc[not_noised_idx]).all()


def test_swap_months_and_days(dummy_dataset):
Expand Down Expand Up @@ -875,7 +902,7 @@ def test_make_typos(dummy_dataset, column):
[
(NOISE_TYPES.leave_blank, "numbers", "decennial_census", "zipcode"),
(NOISE_TYPES.choose_wrong_option, "state", "decennial_census", "state"),
("NOISE_TYPES.copy_from_within_household", "todo", "todo", "todo"),
(NOISE_TYPES.copy_from_household_member, "age", "decennial_census", "age"),
(NOISE_TYPES.swap_month_and_day, "event_date", "social_security", "event_date"),
(NOISE_TYPES.write_wrong_zipcode_digits, "zipcode", "decennial_census", "zipcode"),
(NOISE_TYPES.misreport_age, "age", "decennial_census", "age"),
Expand All @@ -902,12 +929,15 @@ def test_seeds_behave_as_expected(noise_type, data_col, dataset, dataset_col, du
pytest.skip(reason=f"TODO: implement for {noise_type}")
noise = noise_type.name
config = get_configuration()[dataset][Keys.COLUMN_NOISE][dataset_col][noise]
data = dummy_dataset[[data_col]]
if noise == NOISE_TYPES.copy_from_household_member.name:
data = dummy_dataset[[data_col, COPY_HOUSEHOLD_MEMBER_COLS[data_col]]]
else:
data = dummy_dataset[[data_col]]

noised_data = noise_type(data, config, RANDOMNESS0, data_col)
noised_data_same_seed = noise_type(data, config, RANDOMNESS0, data_col)
noised_data_different_seed = noise_type(data, config, RANDOMNESS1, data_col)
data = data.squeeze()
data = data[data_col]

assert (noised_data != data).any()
assert (noised_data.isna() == noised_data_same_seed.isna()).all()
Expand Down
34 changes: 29 additions & 5 deletions tests/unit/test_noise_form.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,9 @@ def dummy_config_noise_numbers():
"event_type": {
NOISE_TYPES.leave_blank.name: {Keys.CELL_PROBABILITY: 0.01},
NOISE_TYPES.choose_wrong_option.name: {Keys.CELL_PROBABILITY: 0.01},
"copy_from_household_member": {Keys.CELL_PROBABILITY: 0.01},
NOISE_TYPES.copy_from_household_member.name: {
Keys.CELL_PROBABILITY: 0.01
},
NOISE_TYPES.swap_month_and_day.name: {Keys.CELL_PROBABILITY: 0.01},
NOISE_TYPES.write_wrong_zipcode_digits.name: {
Keys.CELL_PROBABILITY: 0.01,
Expand Down Expand Up @@ -85,7 +87,7 @@ def dummy_config_noise_numbers():
},
},
Keys.ROW_NOISE: {
"duplicate_rows": {
"duplicate_row": {
Keys.ROW_PROBABILITY: 0.01,
},
NOISE_TYPES.do_not_respond.name: {
Expand Down Expand Up @@ -114,7 +116,8 @@ def test_noise_order(mocker, dummy_data, dummy_config_noise_numbers):
for field in NOISE_TYPES._fields:
mock_return = (
dummy_data[["event_type"]]
if field in ["do_not_respond", "duplication"]
if field
in [NOISE_TYPES.do_not_respond.name, NOISE_TYPES.omit_row.name, "duplicate_row"]
else dummy_data["event_type"]
)
mock.attach_mock(
Expand All @@ -124,18 +127,39 @@ def test_noise_order(mocker, dummy_data, dummy_config_noise_numbers):
),
field,
)
if field not in [
NOISE_TYPES.do_not_respond.name,
NOISE_TYPES.omit_row.name,
"duplicate_row",
]:
mock.attach_mock(
mocker.patch(
f"pseudopeople.noise.NOISE_TYPES.{field}.additional_column_getter",
return_value=[],
),
field,
)
mock.attach_mock(
mocker.patch(
f"pseudopeople.noise.NOISE_TYPES.{field}.noise_level_scaling_function",
return_value=1,
),
field,
)

# FIXME: would be better to mock the dataset instead of using census
noise_dataset(DATASETS.census, dummy_data, dummy_config_noise_numbers, 0)

call_order = [x[0] for x in mock.mock_calls if not x[0].startswith("__")]
# This is getting the string of each noise type. It is filtering down to string type for each noise type
# while not including mock calls that also contain that string.
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

But why do we care if this is a string? You should communicate these things:

  • What is x[0]?
  • What is x[1][0]? (the first argument of the call)
  • Why do we care if the first argument of the mock call is a string?
  • Mention the duplicate calls occurring due to having two methods mocked on the noise type*

call_order = [x[0] for x in mock.mock_calls if type(x[1][0]) == str]
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment explaining what is going on here.

expected_call_order = [
# NOISE_TYPES.omit_row.name, # Census doesn't use omit_row
NOISE_TYPES.do_not_respond.name,
# NOISE_TYPES.duplicate_row.name,
NOISE_TYPES.leave_blank.name,
NOISE_TYPES.choose_wrong_option.name,
# NOISE_TYPES.copy_from_household_member.name,
NOISE_TYPES.copy_from_household_member.name,
NOISE_TYPES.swap_month_and_day.name,
NOISE_TYPES.write_wrong_zipcode_digits.name,
NOISE_TYPES.misreport_age.name,
Expand Down