-
Notifications
You must be signed in to change notification settings - Fork 2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Copy from household member noise function implementation #191
Changes from 17 commits
72744a5
39b0f68
a7c557f
83400b6
0cc9d5f
276d444
e11ef34
28a1acd
8807d5c
8e43dde
262400b
d37e35f
ad5ebc2
ce7f92e
bad1edb
c8fc847
681d286
d9fbd80
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from typing import List | ||
|
||
from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS | ||
|
||
|
||
def copy_from_household_member_column_getter(column_name) -> List[str]: | ||
return [COPY_HOUSEHOLD_MEMBER_COLS[column_name]] |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -46,6 +46,13 @@ | |
Keys.ROW_PROBABILITY: 0.005, | ||
}, | ||
}, | ||
Keys.COLUMN_NOISE: { | ||
COLUMNS.ssn.name: { | ||
NOISE_TYPES.copy_from_household_member.name: { | ||
Keys.CELL_PROBABILITY: 0.00, | ||
} | ||
}, | ||
}, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This isn't necessary, since below we are setting the noise level on the SSA dataset to 0 for all noise types. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes this is no noise for |
||
}, | ||
# No noise of any kind for SSN in the SSA observer | ||
DATASETS.ssa.name: { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,6 @@ | ||
from typing import NamedTuple | ||
|
||
from pseudopeople import noise_functions, noise_scaling | ||
from pseudopeople import column_getters, noise_functions, noise_scaling | ||
from pseudopeople.configuration import Keys | ||
from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType | ||
|
||
|
@@ -30,10 +30,12 @@ class __NoiseTypes(NamedTuple): | |
noise_functions.choose_wrong_options, | ||
noise_level_scaling_function=noise_scaling.scale_choose_wrong_option, | ||
) | ||
# copy_from_household_member: ColumnNoiseType = ColumnNoiseType( | ||
# "copy_from_household_member", | ||
# noise_functions.copy_from_household_members, | ||
# ) | ||
copy_from_household_member: ColumnNoiseType = ColumnNoiseType( | ||
"copy_from_household_member", | ||
noise_functions.copy_from_household_member, | ||
noise_level_scaling_function=noise_scaling.scale_copy_from_household_member, | ||
additional_column_getter=column_getters.copy_from_household_member_column_getter, | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I haven't reviewed this closely, but I'm surprised not to see a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You were right I forgot to port that over from the other branch but it is implemented now. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't understand how this is working - don't you need to add this to the Edit: nevermind, in the other PR |
||
) | ||
swap_month_and_day: ColumnNoiseType = ColumnNoiseType( | ||
"swap_month_and_day", | ||
noise_functions.swap_months_and_days, | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,7 +8,11 @@ | |
|
||
from pseudopeople.configuration import Keys | ||
from pseudopeople.constants import data_values, paths | ||
from pseudopeople.constants.metadata import Attributes, DatasetNames | ||
from pseudopeople.constants.metadata import ( | ||
COPY_HOUSEHOLD_MEMBER_COLS, | ||
Attributes, | ||
DatasetNames, | ||
) | ||
from pseudopeople.data.fake_names import fake_first_names, fake_last_names | ||
from pseudopeople.exceptions import ConfigurationError | ||
from pseudopeople.noise_scaling import load_nicknames_data | ||
|
@@ -180,22 +184,24 @@ def choose_wrong_options( | |
return pd.Series(new_values, index=data.index, name=column_name) | ||
|
||
|
||
# def copy_from_household_members( | ||
# column: pd.Series, | ||
# configuration: ConfigTree, | ||
# randomness_stream: RandomnessStream, | ||
# additional_key: Any, | ||
# ) -> pd.Series: | ||
# """ | ||
def copy_from_household_member( | ||
data: pd.DataFrame, | ||
configuration: ConfigTree, | ||
randomness_stream: RandomnessStream, | ||
column_name: str, | ||
) -> pd.Series: | ||
""" | ||
|
||
# :param column: | ||
# :param configuration: | ||
# :param randomness_stream: | ||
# :param additional_key: Key for RandomnessStream | ||
# :return: | ||
# """ | ||
# # todo actually duplicate rows | ||
# return column | ||
:param data: A pandas dataframe containing necessary columns for column noise | ||
:param _: ConfigTree with rate at which to blank the data in column. | ||
:param randomness_stream: RandomnessStream to utilize Vivarium CRN. | ||
:param column_name: String for column that will be noised, will be the key for RandomnessStream | ||
:returns: pd.Series where data has been noised with other values from a list of possibilities | ||
""" | ||
|
||
copy_values = data[COPY_HOUSEHOLD_MEMBER_COLS[column_name]] | ||
column = pd.Series(copy_values, index=data.index, name=column_name) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This looks right, but this seems clearer to me:
Feel free to leave it as is if you disagree. |
||
return column | ||
|
||
|
||
def swap_months_and_days( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,7 +1,7 @@ | ||
import numpy as np | ||
import pandas as pd | ||
|
||
from pseudopeople.constants import paths | ||
from pseudopeople.constants import metadata, paths | ||
|
||
|
||
def scale_choose_wrong_option(data: pd.DataFrame, column_name: str) -> float: | ||
|
@@ -38,6 +38,15 @@ def scale_nicknames(data: pd.DataFrame, column_name: str) -> float: | |
return 1 / proportion_have_nickname | ||
|
||
|
||
def scale_copy_from_household_member(data: pd.DataFrame, column_name: str) -> float: | ||
copy_column = data[metadata.COPY_HOUSEHOLD_MEMBER_COLS[column_name]] | ||
eligible_idx = copy_column.index[(copy_column != "") & (copy_column.notna())] | ||
proportion_eligible = len(eligible_idx) / len(data) | ||
if proportion_eligible == 0.0: | ||
return 0.0 | ||
return 1 / proportion_eligible | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Do we actually need this? Aren't all ineligible rows filtered out with the call to There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is scaling the noise proportion just like we did for nicknames. Similar to how we have a dataset where we are choosing a certain number of rows for our noise level, we are choosing more of the eligible rows to noise due to those who are ineligible from PRL. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think maybe it is confusing because it looks similar to getting the rows that have NAs like in |
||
|
||
|
||
#################### | ||
# Helper functions # | ||
#################### | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,6 +8,7 @@ | |
from vivarium.framework.randomness.index_map import IndexMap | ||
|
||
from pseudopeople.configuration import Keys, get_configuration | ||
from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS | ||
from pseudopeople.data.fake_names import fake_first_names, fake_last_names | ||
from pseudopeople.noise_entities import NOISE_TYPES | ||
from pseudopeople.schema_entities import DATASETS | ||
|
@@ -95,6 +96,8 @@ def dummy_dataset(): | |
date_of_birth_series = pd.Series( | ||
date_of_birth_list * int(num_simulants / len(date_of_birth_list)) | ||
) | ||
copy_age_list = ["10", "20", "30", "40", "50", "60", "70", "80", "", "100"] | ||
copy_age_series = pd.Series(copy_age_list * int(num_simulants / len(copy_age_list))) | ||
|
||
return pd.DataFrame( | ||
{ | ||
|
@@ -108,6 +111,7 @@ def dummy_dataset(): | |
"last_name": last_name_series, | ||
"event_date": event_date_series, | ||
"date_of_birth": date_of_birth_series, | ||
"copy_age": copy_age_series, | ||
} | ||
) | ||
|
||
|
@@ -184,9 +188,32 @@ def test_choose_wrong_option(dummy_dataset): | |
pd.testing.assert_index_equal(original_empty_idx, noised_empty_idx) | ||
|
||
|
||
@pytest.mark.skip(reason="TODO") | ||
def test_copy_from_household_member(): | ||
pass | ||
def test_generate_copy_from_household_member(dummy_dataset): | ||
config = get_configuration()[DATASETS.census.name][Keys.COLUMN_NOISE]["age"][ | ||
NOISE_TYPES.copy_from_household_member.name | ||
] | ||
data = dummy_dataset[["age", "copy_age"]] | ||
noised_data = NOISE_TYPES.copy_from_household_member(data, config, RANDOMNESS0, "age") | ||
|
||
# Check for expected noise level | ||
expected_noise = config[Keys.CELL_PROBABILITY] | ||
original_missing_idx = data.index[data["age"] == ""] | ||
eligible_for_noise_idx = data.index.difference(original_missing_idx) | ||
data = data["age"] | ||
actual_noise = ( | ||
noised_data[eligible_for_noise_idx] != data[eligible_for_noise_idx] | ||
).mean() | ||
is_close_wrapper(actual_noise, expected_noise, 0.02) | ||
|
||
# Noised values should be the same as the copy column | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You should also check that all simulants ineligible for noise remain unchanged. |
||
was_noised_series = noised_data[eligible_for_noise_idx] != data[eligible_for_noise_idx] | ||
noised_idx = was_noised_series[was_noised_series].index | ||
assert ( | ||
dummy_dataset.loc[noised_idx, COPY_HOUSEHOLD_MEMBER_COLS["age"]] | ||
== noised_data.loc[noised_idx] | ||
).all() | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. You can send a mask or an index in lines 212 and 213, so no need to convert the mask to an index.. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I've changed the variable name from mask. It technically was a mask because it was a bool series but it is not the same length as dummy_dataset and noised_data which is why I need to find the index. |
||
not_noised_idx = dummy_dataset.index.difference(noised_idx) | ||
assert (dummy_dataset.loc[not_noised_idx, "age"] == noised_data.loc[not_noised_idx]).all() | ||
|
||
|
||
def test_swap_months_and_days(dummy_dataset): | ||
|
@@ -875,7 +902,7 @@ def test_make_typos(dummy_dataset, column): | |
[ | ||
(NOISE_TYPES.leave_blank, "numbers", "decennial_census", "zipcode"), | ||
(NOISE_TYPES.choose_wrong_option, "state", "decennial_census", "state"), | ||
("NOISE_TYPES.copy_from_within_household", "todo", "todo", "todo"), | ||
(NOISE_TYPES.copy_from_household_member, "age", "decennial_census", "age"), | ||
(NOISE_TYPES.swap_month_and_day, "event_date", "social_security", "event_date"), | ||
(NOISE_TYPES.write_wrong_zipcode_digits, "zipcode", "decennial_census", "zipcode"), | ||
(NOISE_TYPES.misreport_age, "age", "decennial_census", "age"), | ||
|
@@ -902,12 +929,15 @@ def test_seeds_behave_as_expected(noise_type, data_col, dataset, dataset_col, du | |
pytest.skip(reason=f"TODO: implement for {noise_type}") | ||
noise = noise_type.name | ||
config = get_configuration()[dataset][Keys.COLUMN_NOISE][dataset_col][noise] | ||
data = dummy_dataset[[data_col]] | ||
if noise == NOISE_TYPES.copy_from_household_member.name: | ||
data = dummy_dataset[[data_col, COPY_HOUSEHOLD_MEMBER_COLS[data_col]]] | ||
else: | ||
data = dummy_dataset[[data_col]] | ||
|
||
noised_data = noise_type(data, config, RANDOMNESS0, data_col) | ||
noised_data_same_seed = noise_type(data, config, RANDOMNESS0, data_col) | ||
noised_data_different_seed = noise_type(data, config, RANDOMNESS1, data_col) | ||
data = data.squeeze() | ||
data = data[data_col] | ||
|
||
assert (noised_data != data).any() | ||
assert (noised_data.isna() == noised_data_same_seed.isna()).all() | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -54,7 +54,9 @@ def dummy_config_noise_numbers(): | |
"event_type": { | ||
NOISE_TYPES.leave_blank.name: {Keys.CELL_PROBABILITY: 0.01}, | ||
NOISE_TYPES.choose_wrong_option.name: {Keys.CELL_PROBABILITY: 0.01}, | ||
"copy_from_household_member": {Keys.CELL_PROBABILITY: 0.01}, | ||
NOISE_TYPES.copy_from_household_member.name: { | ||
Keys.CELL_PROBABILITY: 0.01 | ||
}, | ||
NOISE_TYPES.swap_month_and_day.name: {Keys.CELL_PROBABILITY: 0.01}, | ||
NOISE_TYPES.write_wrong_zipcode_digits.name: { | ||
Keys.CELL_PROBABILITY: 0.01, | ||
|
@@ -85,7 +87,7 @@ def dummy_config_noise_numbers(): | |
}, | ||
}, | ||
Keys.ROW_NOISE: { | ||
"duplicate_rows": { | ||
"duplicate_row": { | ||
Keys.ROW_PROBABILITY: 0.01, | ||
}, | ||
NOISE_TYPES.do_not_respond.name: { | ||
|
@@ -114,7 +116,8 @@ def test_noise_order(mocker, dummy_data, dummy_config_noise_numbers): | |
for field in NOISE_TYPES._fields: | ||
mock_return = ( | ||
dummy_data[["event_type"]] | ||
if field in ["do_not_respond", "duplication"] | ||
if field | ||
in [NOISE_TYPES.do_not_respond.name, NOISE_TYPES.omit_row.name, "duplicate_row"] | ||
else dummy_data["event_type"] | ||
) | ||
mock.attach_mock( | ||
|
@@ -124,18 +127,39 @@ def test_noise_order(mocker, dummy_data, dummy_config_noise_numbers): | |
), | ||
field, | ||
) | ||
if field not in [ | ||
NOISE_TYPES.do_not_respond.name, | ||
NOISE_TYPES.omit_row.name, | ||
"duplicate_row", | ||
]: | ||
mock.attach_mock( | ||
mocker.patch( | ||
f"pseudopeople.noise.NOISE_TYPES.{field}.additional_column_getter", | ||
return_value=[], | ||
), | ||
field, | ||
) | ||
mock.attach_mock( | ||
mocker.patch( | ||
f"pseudopeople.noise.NOISE_TYPES.{field}.noise_level_scaling_function", | ||
return_value=1, | ||
), | ||
field, | ||
) | ||
|
||
# FIXME: would be better to mock the dataset instead of using census | ||
noise_dataset(DATASETS.census, dummy_data, dummy_config_noise_numbers, 0) | ||
|
||
call_order = [x[0] for x in mock.mock_calls if not x[0].startswith("__")] | ||
# This is getting the string of each noise type. It is filtering down to string type for each noise type | ||
# while not including mock calls that also contain that string. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. But why do we care if this is a string? You should communicate these things:
|
||
call_order = [x[0] for x in mock.mock_calls if type(x[1][0]) == str] | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Add a comment explaining what is going on here. |
||
expected_call_order = [ | ||
# NOISE_TYPES.omit_row.name, # Census doesn't use omit_row | ||
NOISE_TYPES.do_not_respond.name, | ||
# NOISE_TYPES.duplicate_row.name, | ||
NOISE_TYPES.leave_blank.name, | ||
NOISE_TYPES.choose_wrong_option.name, | ||
# NOISE_TYPES.copy_from_household_member.name, | ||
NOISE_TYPES.copy_from_household_member.name, | ||
NOISE_TYPES.swap_month_and_day.name, | ||
NOISE_TYPES.write_wrong_zipcode_digits.name, | ||
NOISE_TYPES.misreport_age.name, | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Maybe rename this to utilities.py so it's more generic and usaable for future utility functions