From 1d096042571520a041e1000560c902463b8e4c9c Mon Sep 17 00:00:00 2001
From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com>
Date: Fri, 24 Mar 2023 12:39:55 -0600
Subject: [PATCH 1/7] add census integration test (#17)

---
 tests/integration/conftest.py       | 185 ++++++++++++++++++++++++++++
 tests/integration/test_interface.py |  79 ++++++++++--
 2 files changed, 255 insertions(+), 9 deletions(-)
 create mode 100644 tests/integration/conftest.py

diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
new file mode 100644
index 00000000..140c6d12
--- /dev/null
+++ b/tests/integration/conftest.py
@@ -0,0 +1,185 @@
+import random
+import time
+from string import ascii_lowercase, ascii_uppercase
+
+import pandas as pd
+import pytest
+import yaml
+
+from pseudopeople.utilities import get_configuration
+
+HOUSING_TYPES = [
+    "Carceral",
+    "College",
+    "Military",
+    "Nursing home",
+    "Other institutional",
+    "Other non-institutional",
+    "Standard",
+]
+
+RACE_ETHNICITIES = [
+    "AIAN",
+    "Asian",
+    "Black",
+    "Latino",
+    "Multiracial or Other",
+    "NHOPI",
+    "White",
+]
+
+RELATIONS_TO_HOUSEHOLD_HEAD = [
+    "Adopted child",
+    "Biological child",
+    "Child-in-law",
+    "Foster child",
+    "Grandchild",
+    "Institutionalized GQ pop",
+    "Noninstitutionalized GQ pop",
+    "Opp-sex partner",
+    "Opp-sex spouse",
+    "Other nonrelative",
+    "Other relative",
+    "Parent",
+    "Parent-in-law",
+    "Reference person",
+    "Roommate",
+    "Same-sex partner",
+    "Same-sex spouse",
+    "Sibling",
+    "Stepchild",
+]
+
+DOB_START_DATE = time.mktime(time.strptime("1920-1-1", "%Y-%m-%d"))
+DOB_END_DATE = time.mktime(time.strptime("2030-5-1", "%Y-%m-%d"))
+
+STATES = [
+    "AL",
+    "AK",
+    "AZ",
+    "AR",
+    "CA",
+    "CO",
+    "CT",
+    "DC",
+    "DE",
+    "FL",
+    "GA",
+    "HI",
+    "ID",
+    "IL",
+    "IN",
+    "IA",
+    "KS",
+    "KY",
+    "LA",
+    "ME",
+    "MD",
+    "MA",
+    "MI",
+    "MN",
+    "MS",
+    "MO",
+    "MT",
+    "NE",
+    "NV",
+    "NH",
+    "NJ",
+    "NM",
+    "NY",
+    "NC",
+    "ND",
+    "OH",
+    "OK",
+    "OR",
+    "PA",
+    "RI",
+    "SC",
+    "SD",
+    "TN",
+    "TX",
+    "UT",
+    "VT",
+    "VA",
+    "WA",
+    "WV",
+    "WI",
+    "WY",
+]
+
+
+@pytest.fixture(scope="session")
+def dummy_census_data(tmp_path_factory):
+    """Generate a dummy decennial census dataframe, save to a tmpdir, and return that path."""
+    random.seed(0)
+    num_rows = 100_000
+    data = pd.DataFrame(
+        {
+            "housing_type": [random.choice(HOUSING_TYPES) for _ in range(num_rows)],
+            "age": [str(random.random() * 100) for _ in range(num_rows)],
+            "year": [random.choice(["2020", "2030"]) for _ in range(num_rows)],
+            "race_ethnicity": [random.choice(RACE_ETHNICITIES) for _ in range(num_rows)],
+            "guardian_1": [
+                f"100_{random.randint(1,int(num_rows/3))}" for _ in range(num_rows)
+            ],
+            "first_name": [
+                "First" + "".join(random.choice(ascii_lowercase) for _ in range(3))
+                for _ in range(num_rows)
+            ],
+            "street_name": [
+                "Street" + "".join(random.choice(ascii_lowercase) for _ in range(3))
+                for _ in range(num_rows)
+            ],
+            "relation_to_household_head": [
+                random.choice(RELATIONS_TO_HOUSEHOLD_HEAD) for _ in range(num_rows)
+            ],
+            "zipcode": [str(float(random.randint(1, 99999))) for _ in range(num_rows)],
+            "date_of_birth": [
+                time.strftime(
+                    "%Y-%m-%d",
+                    time.localtime(
+                        DOB_START_DATE + random.random() * (DOB_END_DATE - DOB_START_DATE)
+                    ),
+                )
+                for _ in range(num_rows)
+            ],
+            "simulant_id": ["100_" + str(i) for i in range(num_rows)],
+            "middle_initial": [random.choice(ascii_uppercase) for _ in range(num_rows)],
+            "city": [
+                "City" + "".join(random.choice(ascii_lowercase) for _ in range(3))
+                for _ in range(num_rows)
+            ],
+            "street_number": [str(random.randint(1, 15000)) for _ in range(num_rows)],
+            "last_name": [
+                "Last" + "".join(random.choice(ascii_lowercase) for _ in range(3))
+                for _ in range(num_rows)
+            ],
+            "state": [random.choice(STATES) for _ in range(num_rows)],
+            "sex": [random.choice(["Female", "Male"]) for _ in range(num_rows)],
+            "unit_number": [
+                "Unit " + "".join(random.choice(ascii_lowercase) for _ in range(3))
+                for _ in range(num_rows)
+            ],
+            "guardian_2": [
+                f"100_{random.randint(1,int(num_rows/4))}" for _ in range(num_rows)
+            ],
+        }
+    )
+
+    data_path = tmp_path_factory.getbasetemp() / "dummy_data.csv"
+    data.to_csv(data_path, index=False)
+
+    return data_path
+
+
+@pytest.fixture(scope="module")
+def dummy_config(tmp_path_factory):
+    """This simply copies the default config file to a temp directory
+    to be used as a user-provided config file in integration tests
+    """
+    config = get_configuration().to_dict()  # gets default config
+    config_path = tmp_path_factory.getbasetemp() / "dummy_config.yaml"
+    with open(config_path, "w") as file:
+        yaml.dump(config, file)
+
+    return config_path
diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py
index 90ccc2c1..9a32ac24 100644
--- a/tests/integration/test_interface.py
+++ b/tests/integration/test_interface.py
@@ -1,38 +1,99 @@
+from pathlib import Path
+from typing import Union
+
+import numpy as np
+import pandas as pd
 import pytest
 
 from pseudopeople.interface import generate_decennial_census
+from pseudopeople.utilities import get_configuration
 
 
-@pytest.mark.skip(reason="TODO")
-def test_noise_census():
-    pass
+# TODO: possibly parametrize Forms?
+def test_generate_decennial_census(
+    dummy_census_data: Union[Path, str], dummy_config: Union[Path, str]
+):
+    data = pd.read_csv(dummy_census_data)
+    noised_data = generate_decennial_census(
+        path=dummy_census_data, seed=0, configuration=dummy_config
+    )
+    noised_data_same_seed = generate_decennial_census(
+        path=dummy_census_data, seed=0, configuration=dummy_config
+    )
+    noised_data_different_seed = generate_decennial_census(
+        path=dummy_census_data, seed=1, configuration=dummy_config
+    )
+
+    assert noised_data.equals(noised_data_same_seed)
+    assert not noised_data.equals(noised_data_different_seed)
+    assert not data.equals(noised_data)
+    # TODO: Confirm correct columns exist once the interface functions
+    # modify them
+    # TODO: if we sort out dtype schemas
+    # for col in noised_data.columns:
+    # assert data[col].dtype == noised_data[col].dtype
+    # TODO: Iterate through cols and check that the percentage of errors makes sense
+    # eg, if 25% typographic error and 1% OCR
+    # 1. Use a default config file
+    # 2.
+
+    config = get_configuration(dummy_config)["decennial_census"]
+
+    # Confirm omission and duplication seems reasonable
+    # TODO: when omission function gets implemented.
+    orig_idx = data.index
+    noised_idx = noised_data.index
+    # assert np.isclose(len(set(orig_idx) - set(noised_idx)) / len(data), config.omission)
+    # TODO: when duplication function gets implemented
+    # assert np.isclose(noised_data.duplicated().sum() / len(data), config.duplication)
+
+    # Check that column-level noise seem reasonable
+    # NOTE: this is not perfect because (1) it is only looking at row-level
+    # noise and not token-based noise and (2) it is not accounting for the
+    # fact that noising can occur on duplicated rows which have been removed
+    # for comparison purposes.
+    common_idx = set(orig_idx).intersection(set(noised_idx))
+    common_data = data.loc[common_idx]
+    common_noised_data = noised_data.loc[common_idx].drop_duplicates()
+    assert common_data.shape == common_noised_data.shape
+    for col in noised_data:
+        if col in config:
+            actual_noise_rate = (common_data[col] != common_noised_data[col]).mean()
+            noise_types = [k for k in config[col]]
+            noise_rates = [
+                config[col][noise_type]["row_noise_level"] for noise_type in noise_types
+            ]
+            expected_noise_rate = 1 - np.prod([1 - x for x in noise_rates])
+            assert np.isclose(actual_noise_rate, expected_noise_rate, rtol=0.07)
+        else:
+            assert (common_data[col] == common_noised_data[col]).all()
 
 
 @pytest.mark.skip(reason="TODO")
-def test_noise_acs():
+def test_generate_acs():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_noise_cps():
+def test_generate_cps():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_noise_wic():
+def test_generate_wic():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_noise_ssa():
+def test_generate_ssa():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_noise_tax_w2_1099():
+def test_generate_tax_w2_1099():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_noise_tax_1040():
+def test_generate_tax_1040():
     pass

From 0ff8057800c04c08dba8c07a51b4fc5772587e67 Mon Sep 17 00:00:00 2001
From: Matthew Kappel <mkappel@uw.edu>
Date: Wed, 29 Mar 2023 09:43:59 -0700
Subject: [PATCH 2/7] Add W2 generation interface (#20)

- *Category*: feature
- *JIRA issue*: [MIC-3869](https://jira.ihme.washington.edu/browse/MIC-3869)

Changes
- Adds `generate_w2` function to interface
- Adds relevant configuration to the defaults yaml
- Addition of an integration test is deferred pending merge of another PR

Testing
Running the `main` of `interface.py` and calling `generate_w2` resulted in noised data.
---
 src/pseudopeople/__init__.py                |  2 +-
 src/pseudopeople/default_configuration.yaml | 70 ++++++++++++++++++++-
 src/pseudopeople/interface.py               | 18 +++++-
 3 files changed, 87 insertions(+), 3 deletions(-)

diff --git a/src/pseudopeople/__init__.py b/src/pseudopeople/__init__.py
index a08dae8b..ac749c28 100644
--- a/src/pseudopeople/__init__.py
+++ b/src/pseudopeople/__init__.py
@@ -8,4 +8,4 @@
     __uri__,
     __version__,
 )
-from pseudopeople.interface import generate_decennial_census
+from pseudopeople.interface import generate_decennial_census, generate_w2
diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index 62a2bf9d..a19cbfba 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -51,4 +51,72 @@ decennial_census:
     housing_type:
         missing_data:
             row_noise_level: 0.01
-
+taxes_w2_and_1099:
+    omission: 0.0145
+    duplication: 0.05
+    age:
+        missing_data:
+            row_noise_level: 0.01
+    date_of_birth:
+        missing_data:
+            row_noise_level: 0.01
+    employer_city:
+        missing_data:
+            row_noise_level: 0.01
+    employer_id:
+        missing_data:
+            row_noise_level: 0.01
+    employer_name:
+        missing_data:
+            row_noise_level: 0.01
+    employer_state:
+        missing_data:
+            row_noise_level: 0.01
+    employer_street_name:
+        missing_data:
+            row_noise_level: 0.01
+    employer_street_number:
+        missing_data:
+            row_noise_level: 0.01
+    employer_unit_number:
+        missing_data:
+            row_noise_level: 0.01
+    employer_zipcode:
+        missing_data:
+            row_noise_level: 0.01
+    first_name:
+        missing_data:
+            row_noise_level: 0.01
+    income:
+        missing_data:
+            row_noise_level: 0.01
+    is_w2:
+        missing_data:
+            row_noise_level: 0.01
+    last_name:
+        missing_data:
+            row_noise_level: 0.01
+    mailing_address_city:
+        missing_data:
+            row_noise_level: 0.01
+    mailing_address_state:
+        missing_data:
+            row_noise_level: 0.01
+    mailing_address_street_name:
+        missing_data:
+            row_noise_level: 0.01
+    mailing_address_street_number:
+        missing_data:
+            row_noise_level: 0.01
+    mailing_address_unit_number:
+        missing_data:
+            row_noise_level: 0.01
+    mailing_address_zipcode:
+        missing_data:
+            row_noise_level: 0.01
+    middle_initial:
+        missing_data:
+            row_noise_level: 0.01
+    ssn:
+        missing_data:
+            row_noise_level: 0.01
\ No newline at end of file
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
index 74e3b86f..5ff041b3 100644
--- a/src/pseudopeople/interface.py
+++ b/src/pseudopeople/interface.py
@@ -29,13 +29,29 @@ def generate_decennial_census(
     return noise_form(Form.CENSUS, data, configuration_tree, seed)
 
 
+def generate_w2(
+    path: Union[Path, str], seed: int = 0, configuration: Union[Path, str] = None
+):
+    """
+    Generates noised W2 data from un-noised data.
+
+    :param path: A path to the un-noised source W2 data (read as CSV)
+    :param seed: An integer seed for randomness
+    :param configuration: (optional) A path to a configuration YAML file to modify default values
+    :return: A pd.DataFrame of noised W2 data
+    """
+    configuration_tree = get_configuration(configuration)
+    data = pd.read_csv(path)
+    return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed)
+
+
 # Manual testing helper
 if __name__ == "__main__":
     args = sys.argv[1:]
     if len(args) == 1:
         my_path = Path(args[0])
         src = pd.read_csv(my_path)
-        out = generate_decennial_census(my_path)
+        out = generate_w2(my_path)
         diff = src[
             ~src.astype(str).apply(tuple, 1).isin(out.astype(str).apply(tuple, 1))
         ]  # get all changed rows

From 6b217e57a502ae9aaf87a772d49ee3980d1bfcb9 Mon Sep 17 00:00:00 2001
From: Steve Bachmeier <23350991+stevebachmeier@users.noreply.github.com>
Date: Wed, 29 Mar 2023 12:27:49 -0600
Subject: [PATCH 3/7] implement typographic noise function (#19)

---
 src/pseudopeople/constants/paths.py         |   8 +
 src/pseudopeople/data/qwerty_errors.yaml    | 382 ++++++++++++++++++++
 src/pseudopeople/default_configuration.yaml |  40 ++
 src/pseudopeople/entities.py                |   2 +-
 src/pseudopeople/interface.py               |   6 +-
 src/pseudopeople/noise_functions.py         |  91 ++++-
 tests/conftest.py                           |  16 +
 tests/integration/conftest.py               |  28 +-
 tests/integration/test_interface.py         |  53 +--
 tests/unit/test_column_noise.py             | 174 +++++++--
 10 files changed, 680 insertions(+), 120 deletions(-)
 create mode 100644 src/pseudopeople/constants/paths.py
 create mode 100644 src/pseudopeople/data/qwerty_errors.yaml

diff --git a/src/pseudopeople/constants/paths.py b/src/pseudopeople/constants/paths.py
new file mode 100644
index 00000000..da3b9fb3
--- /dev/null
+++ b/src/pseudopeople/constants/paths.py
@@ -0,0 +1,8 @@
+from pathlib import Path
+
+import pseudopeople
+
+BASE_DIR = Path(pseudopeople.__file__).resolve().parent
+DATA_ROOT = BASE_DIR / "data"
+
+QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml"
diff --git a/src/pseudopeople/data/qwerty_errors.yaml b/src/pseudopeople/data/qwerty_errors.yaml
new file mode 100644
index 00000000..cd069a9c
--- /dev/null
+++ b/src/pseudopeople/data/qwerty_errors.yaml
@@ -0,0 +1,382 @@
+q:
+- w
+- a
+- s
+w:
+- q
+- e
+- a
+- s
+- d
+e:
+- w
+- r
+- s
+- d
+- f
+r:
+- e
+- t
+- d
+- f
+- g
+t:
+- r
+- y
+- f
+- g
+- h
+y:
+- t
+- u
+- g
+- h
+- j
+u:
+- y
+- i
+- h
+- j
+- k
+i:
+- u
+- o
+- j
+- k
+- l
+o:
+- i
+- p
+- k
+- l
+p:
+- o
+- l
+a:
+- q
+- w
+- s
+- z
+- x
+s:
+- q
+- w
+- e
+- a
+- d
+- z
+- x
+- c
+d:
+- w
+- e
+- r
+- s
+- f
+- x
+- c
+- v
+f:
+- e
+- r
+- t
+- d
+- g
+- c
+- v
+- b
+g:
+- r
+- t
+- y
+- f
+- h
+- v
+- b
+- n
+h:
+- t
+- y
+- u
+- g
+- j
+- b
+- n
+- m
+j:
+- y
+- u
+- i
+- h
+- k
+- n
+- m
+k:
+- u
+- i
+- o
+- j
+- l
+- m
+l:
+- i
+- o
+- p
+- k
+z:
+- a
+- s
+- x
+x:
+- a
+- s
+- d
+- z
+- c
+c:
+- s
+- d
+- f
+- x
+- v
+v:
+- d
+- f
+- g
+- c
+- b
+b:
+- f
+- g
+- h
+- v
+- n
+n:
+- g
+- h
+- j
+- b
+- m
+m:
+- h
+- j
+- k
+- n
+Q:
+- W
+- A
+- S
+W:
+- Q
+- E
+- A
+- S
+- D
+E:
+- W
+- R
+- S
+- D
+- F
+R:
+- E
+- T
+- D
+- F
+- G
+T:
+- R
+- Y
+- F
+- G
+- H
+Y:
+- T
+- U
+- G
+- H
+- J
+U:
+- Y
+- I
+- H
+- J
+- K
+I:
+- U
+- O
+- J
+- K
+- L
+O:
+- I
+- P
+- K
+- L
+P:
+- O
+- L
+A:
+- Q
+- W
+- S
+- Z
+- X
+S:
+- Q
+- W
+- E
+- A
+- D
+- Z
+- X
+- C
+D:
+- W
+- E
+- R
+- S
+- F
+- X
+- C
+- V
+F:
+- E
+- R
+- T
+- D
+- G
+- C
+- V
+- B
+G:
+- R
+- T
+- Y
+- F
+- H
+- V
+- B
+- N
+H:
+- T
+- Y
+- U
+- G
+- J
+- B
+- N
+- M
+J:
+- Y
+- U
+- I
+- H
+- K
+- N
+- M
+K:
+- U
+- I
+- O
+- J
+- L
+- M
+L:
+- I
+- O
+- P
+- K
+Z:
+- A
+- S
+- X
+X:
+- A
+- S
+- D
+- Z
+- C
+C:
+- S
+- D
+- F
+- X
+- V
+V:
+- D
+- F
+- G
+- C
+- B
+B:
+- F
+- G
+- H
+- V
+- N
+N:
+- G
+- H
+- J
+- B
+- M
+M:
+- H
+- J
+- K
+- N
+'7':
+- '8'
+- '4'
+- '5'
+'8':
+- '7'
+- '9'
+- '4'
+- '5'
+- '6'
+'9':
+- '8'
+- '5'
+- '6'
+'4':
+- '7'
+- '8'
+- '5'
+- '1'
+- '2'
+'5':
+- '7'
+- '8'
+- '9'
+- '4'
+- '6'
+- '1'
+- '2'
+- '3'
+'6':
+- '8'
+- '9'
+- '5'
+- '2'
+- '3'
+'1':
+- '4'
+- '5'
+- '2'
+- '0'
+'2':
+- '4'
+- '5'
+- '6'
+- '1'
+- '3'
+- '0'
+'3':
+- '5'
+- '6'
+- '2'
+'0':
+- '1'
+- '2'
diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index a19cbfba..f3e81092 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -9,36 +9,76 @@ decennial_census:
     first_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     middle_initial:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     last_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     age:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     date_of_birth:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     street_number:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     street_name:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     unit_number:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     city:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     state:
         missing_data:
             row_noise_level: 0.01
     zipcode:
         missing_data:
             row_noise_level: 0.01
+        typographic:
+            row_noise_level: 0.01
+            token_noise_level: 0.1
+            include_original_token_level: 0.1
     relation_to_household_head:
         missing_data:
             row_noise_level: 0.01
diff --git a/src/pseudopeople/entities.py b/src/pseudopeople/entities.py
index 3c095e3a..1a41e264 100644
--- a/src/pseudopeople/entities.py
+++ b/src/pseudopeople/entities.py
@@ -54,7 +54,7 @@ class __NoiseTypes(NamedTuple):
         "month_day_swap", noise_functions.swap_months_and_days
     )
     ZIP_CODE_MISWRITING: ColumnNoiseType = ColumnNoiseType(
-        "zipcode_miswriting", noise_functions.miswrite_zip_codes
+        "zipcode_miswriting", noise_functions.miswrite_zipcodes
     )
     AGE_MISWRITING: ColumnNoiseType = ColumnNoiseType(
         "age_miswriting", noise_functions.miswrite_ages
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
index 5ff041b3..5c53b1ed 100644
--- a/src/pseudopeople/interface.py
+++ b/src/pseudopeople/interface.py
@@ -25,7 +25,7 @@ def generate_decennial_census(
     :return: A pd.DataFrame of noised census data
     """
     configuration_tree = get_configuration(configuration)
-    data = pd.read_csv(path)
+    data = pd.read_csv(path, dtype=str, keep_default_na=False)
     return noise_form(Form.CENSUS, data, configuration_tree, seed)
 
 
@@ -41,7 +41,7 @@ def generate_w2(
     :return: A pd.DataFrame of noised W2 data
     """
     configuration_tree = get_configuration(configuration)
-    data = pd.read_csv(path)
+    data = pd.read_csv(path, dtype=str, keep_default_na=False)
     return noise_form(Form.TAX_W2_1099, data, configuration_tree, seed)
 
 
@@ -50,7 +50,7 @@ def generate_w2(
     args = sys.argv[1:]
     if len(args) == 1:
         my_path = Path(args[0])
-        src = pd.read_csv(my_path)
+        src = pd.read_csv(my_path, dtype=str, keep_default_na=False)
         out = generate_w2(my_path)
         diff = src[
             ~src.astype(str).apply(tuple, 1).isin(out.astype(str).apply(tuple, 1))
diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py
index e82c8380..fb4f30da 100644
--- a/src/pseudopeople/noise_functions.py
+++ b/src/pseudopeople/noise_functions.py
@@ -1,9 +1,13 @@
 from typing import Any
 
+import numpy as np
 import pandas as pd
+import yaml
 from vivarium import ConfigTree
 from vivarium.framework.randomness import RandomnessStream
 
+from pseudopeople.constants import paths
+
 
 def omit_rows(
     form_data: pd.DataFrame,
@@ -91,7 +95,7 @@ def swap_months_and_days(
     return form_data
 
 
-def miswrite_zip_codes(
+def miswrite_zipcodes(
     form_data: pd.DataFrame,
     configuration: float,
     randomness_stream: RandomnessStream,
@@ -217,12 +221,12 @@ def generate_missing_data(
 
     # Avoid SettingWithCopyWarning
     column = column.copy()
-    noise_level = configuration.row_noise_level
-    # Get rows to noise
-    to_noise_idx = randomness_stream.filter_for_probability(
-        column.index,
-        probability=noise_level,
-        additional_key=f"{additional_key}_missing_data_filter",
+    to_noise_idx = _get_to_noise_idx(
+        column,
+        configuration,
+        randomness_stream,
+        additional_key,
+        context_key="missing_data_filter",
     )
     column.loc[to_noise_idx] = ""
 
@@ -235,15 +239,62 @@ def generate_typographical_errors(
     randomness_stream: RandomnessStream,
     additional_key: Any,
 ) -> pd.Series:
-    """
+    """Function that takes a column and applies noise to the string values
+    representative of keyboard mis-typing.
 
-    :param column:
-    :param configuration:
-    :param randomness_stream:
+    :param column:  pd.Series of data
+    :param configuration: ConfigTree object containing noising parameters
+    :param randomness_stream:  RandomnessStream to utilize Vivarium CRN
     :param additional_key: Key for RandomnessStream
-    :return:
+    :returns: pd.Series of column with noised data
     """
-    # todo actually generate typographical errors
+    column = column.copy()
+    not_missing_idx = column.index[(column.notna()) & (column != "")]
+
+    with open(paths.QWERTY_ERRORS) as f:
+        qwerty_errors = yaml.full_load(f)
+
+    def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng):
+        """Abie's implementation of typographical noising"""
+        err = ""
+        i = 0
+        while i < len(truth):
+            error_introduced = False
+            token = truth[i : (i + 1)]
+            if token in qwerty_errors and not error_introduced:
+                random_number = rng.uniform()
+                if random_number < corrupted_pr:
+                    err += rng.choice(qwerty_errors[token])
+                    random_number = rng.uniform()
+                    if random_number < addl_pr:
+                        err += token
+                    i += 1
+                    error_introduced = True
+            if not error_introduced:
+                err += truth[i : (i + 1)]
+                i += 1
+        return err
+
+    token_noise_level = configuration.token_noise_level
+    include_original_token_level = configuration.include_original_token_level
+
+    to_noise_idx = _get_to_noise_idx(
+        column.loc[not_missing_idx],
+        configuration,
+        randomness_stream,
+        additional_key,
+        context_key="typographical_noise_filter",
+    )
+    rng = np.random.default_rng(seed=randomness_stream.seed)
+    for idx in to_noise_idx:
+        noised_value = keyboard_corrupt(
+            column[idx],
+            token_noise_level,
+            include_original_token_level,
+            rng,
+        )
+        column[idx] = noised_value
+
     return column
 
 
@@ -265,4 +316,16 @@ def generate_ocr_errors(
     return column
 
 
-# todo add noise functions
+####################
+# HELPER FUNCTIONS #
+####################
+def _get_to_noise_idx(column, configuration, randomness_stream, additional_key, context_key):
+    noise_level = configuration.row_noise_level
+    # Get rows to noise
+    to_noise_idx = randomness_stream.filter_for_probability(
+        column.index,
+        probability=noise_level,
+        additional_key=f"{additional_key}_{context_key}",
+    )
+
+    return to_noise_idx
diff --git a/tests/conftest.py b/tests/conftest.py
index 3178a8fd..894977cb 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,7 @@
 import pytest
+import yaml
+
+from pseudopeople.utilities import get_configuration
 
 
 def pytest_addoption(parser):
@@ -17,3 +20,16 @@ def pytest_collection_modifyitems(config, items):
     for item in items:
         if "slow" in item.keywords:
             item.add_marker(skip_slow)
+
+
+@pytest.fixture(scope="session")
+def user_config_path(tmp_path_factory):
+    """This simply copies the default config file to a temp directory
+    to be used as a user-provided config file in integration tests
+    """
+    config = get_configuration().to_dict()  # gets default config
+    config_path = tmp_path_factory.getbasetemp() / "dummy_config.yaml"
+    with open(config_path, "w") as file:
+        yaml.dump(config, file)
+
+    return config_path
diff --git a/tests/integration/conftest.py b/tests/integration/conftest.py
index 140c6d12..060a5174 100644
--- a/tests/integration/conftest.py
+++ b/tests/integration/conftest.py
@@ -4,9 +4,6 @@
 
 import pandas as pd
 import pytest
-import yaml
-
-from pseudopeople.utilities import get_configuration
 
 HOUSING_TYPES = [
     "Carceral",
@@ -109,14 +106,18 @@
 
 
 @pytest.fixture(scope="session")
-def dummy_census_data(tmp_path_factory):
+def decennial_census_data_path(tmp_path_factory):
     """Generate a dummy decennial census dataframe, save to a tmpdir, and return that path."""
     random.seed(0)
     num_rows = 100_000
     data = pd.DataFrame(
         {
             "housing_type": [random.choice(HOUSING_TYPES) for _ in range(num_rows)],
-            "age": [str(random.random() * 100) for _ in range(num_rows)],
+            # TODO: Currently ages are actually floats but a followup pr will ensure ints
+            "age": [
+                str(random.randint(1, 100) + round(random.random(), 6))
+                for _ in range(num_rows)
+            ],
             "year": [random.choice(["2020", "2030"]) for _ in range(num_rows)],
             "race_ethnicity": [random.choice(RACE_ETHNICITIES) for _ in range(num_rows)],
             "guardian_1": [
@@ -133,7 +134,9 @@ def dummy_census_data(tmp_path_factory):
             "relation_to_household_head": [
                 random.choice(RELATIONS_TO_HOUSEHOLD_HEAD) for _ in range(num_rows)
             ],
-            "zipcode": [str(float(random.randint(1, 99999))) for _ in range(num_rows)],
+            # TODO: currently zipcodes are floats (and thus not zero-padded);
+            # a followup PR will convert to 5-digit integer strings
+            "zipcode": [str(random.randint(1, 99999)) + ".0" for _ in range(num_rows)],
             "date_of_birth": [
                 time.strftime(
                     "%Y-%m-%d",
@@ -170,16 +173,3 @@ def dummy_census_data(tmp_path_factory):
     data.to_csv(data_path, index=False)
 
     return data_path
-
-
-@pytest.fixture(scope="module")
-def dummy_config(tmp_path_factory):
-    """This simply copies the default config file to a temp directory
-    to be used as a user-provided config file in integration tests
-    """
-    config = get_configuration().to_dict()  # gets default config
-    config_path = tmp_path_factory.getbasetemp() / "dummy_config.yaml"
-    with open(config_path, "w") as file:
-        yaml.dump(config, file)
-
-    return config_path
diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py
index 9a32ac24..1501aa8c 100644
--- a/tests/integration/test_interface.py
+++ b/tests/integration/test_interface.py
@@ -11,62 +11,25 @@
 
 # TODO: possibly parametrize Forms?
 def test_generate_decennial_census(
-    dummy_census_data: Union[Path, str], dummy_config: Union[Path, str]
+    decennial_census_data_path: Union[Path, str], user_config_path: Union[Path, str]
 ):
-    data = pd.read_csv(dummy_census_data)
+    data = pd.read_csv(decennial_census_data_path, dtype=str, keep_default_na=False)
+
+    # TODO: Refactor this check into a separate test
     noised_data = generate_decennial_census(
-        path=dummy_census_data, seed=0, configuration=dummy_config
+        path=decennial_census_data_path, seed=0, configuration=user_config_path
     )
     noised_data_same_seed = generate_decennial_census(
-        path=dummy_census_data, seed=0, configuration=dummy_config
+        path=decennial_census_data_path, seed=0, configuration=user_config_path
     )
     noised_data_different_seed = generate_decennial_census(
-        path=dummy_census_data, seed=1, configuration=dummy_config
+        path=decennial_census_data_path, seed=1, configuration=user_config_path
     )
 
     assert noised_data.equals(noised_data_same_seed)
     assert not noised_data.equals(noised_data_different_seed)
     assert not data.equals(noised_data)
-    # TODO: Confirm correct columns exist once the interface functions
-    # modify them
-    # TODO: if we sort out dtype schemas
-    # for col in noised_data.columns:
-    # assert data[col].dtype == noised_data[col].dtype
-    # TODO: Iterate through cols and check that the percentage of errors makes sense
-    # eg, if 25% typographic error and 1% OCR
-    # 1. Use a default config file
-    # 2.
-
-    config = get_configuration(dummy_config)["decennial_census"]
-
-    # Confirm omission and duplication seems reasonable
-    # TODO: when omission function gets implemented.
-    orig_idx = data.index
-    noised_idx = noised_data.index
-    # assert np.isclose(len(set(orig_idx) - set(noised_idx)) / len(data), config.omission)
-    # TODO: when duplication function gets implemented
-    # assert np.isclose(noised_data.duplicated().sum() / len(data), config.duplication)
-
-    # Check that column-level noise seem reasonable
-    # NOTE: this is not perfect because (1) it is only looking at row-level
-    # noise and not token-based noise and (2) it is not accounting for the
-    # fact that noising can occur on duplicated rows which have been removed
-    # for comparison purposes.
-    common_idx = set(orig_idx).intersection(set(noised_idx))
-    common_data = data.loc[common_idx]
-    common_noised_data = noised_data.loc[common_idx].drop_duplicates()
-    assert common_data.shape == common_noised_data.shape
-    for col in noised_data:
-        if col in config:
-            actual_noise_rate = (common_data[col] != common_noised_data[col]).mean()
-            noise_types = [k for k in config[col]]
-            noise_rates = [
-                config[col][noise_type]["row_noise_level"] for noise_type in noise_types
-            ]
-            expected_noise_rate = 1 - np.prod([1 - x for x in noise_rates])
-            assert np.isclose(actual_noise_rate, expected_noise_rate, rtol=0.07)
-        else:
-            assert (common_data[col] == common_noised_data[col]).all()
+    assert set(noised_data.columns) == set(data.columns)
 
 
 @pytest.mark.skip(reason="TODO")
diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py
index 09d68f88..2ada0f32 100644
--- a/tests/unit/test_column_noise.py
+++ b/tests/unit/test_column_noise.py
@@ -1,3 +1,6 @@
+import random
+from string import ascii_lowercase, ascii_uppercase
+
 import numpy as np
 import pandas as pd
 import pytest
@@ -5,9 +8,17 @@
 
 from pseudopeople.noise_functions import (
     generate_fake_names,
+    generate_incorrect_selections,
     generate_missing_data,
     generate_nicknames,
+    generate_ocr_errors,
     generate_phonetic_errors,
+    generate_typographical_errors,
+    generate_within_household_copies,
+    miswrite_ages,
+    miswrite_numerics,
+    miswrite_zipcodes,
+    swap_months_and_days,
 )
 from pseudopeople.utilities import get_configuration
 
@@ -20,96 +31,183 @@
 
 
 @pytest.fixture(scope="module")
-def string_series():
-    num_simulants = 1_000_000
-    return pd.Series([str(x) for x in range(num_simulants)])
+def dummy_dataset():
+    # Add a column of integer strings
+    num_simulants = 100_000
+    dummy_idx = pd.Index(range(num_simulants))
+    integer_series = pd.Series([str(x) for x in range(num_simulants)])
+    # Add missing data from `generate_missing_data` function
+    missing_idx = pd.Index([x for x in dummy_idx if x % 3 == 0])
+    integer_series.loc[missing_idx] = ""
+
+    # Add a column of character strings
+    str_length = 6
+    character_series = pd.Series(
+        [
+            "".join(
+                random.choice(ascii_lowercase + ascii_uppercase) for _ in range(str_length)
+            )
+            for _ in range(num_simulants)
+        ]
+    )
+    # Add missing data from `generate_missing_data` function
+    character_series.loc[missing_idx] = ""
 
+    return pd.DataFrame({"numbers": integer_series, "characters": character_series})
 
-@pytest.fixture(scope="module")
-def default_configuration():
-    return get_configuration()
 
-
-def test_generate_missing_data(string_series, default_configuration):
-    # TODO: [MIC-3910] Use custom config (MIC-3866)
-    config = default_configuration["decennial_census"]["zipcode"]["missing_data"]
-    noised_data = generate_missing_data(
-        string_series, config, RANDOMNESS0, "test_missing_data"
-    )
-    noised_data_same_seed = generate_missing_data(
-        string_series, config, RANDOMNESS0, "test_missing_data"
-    )
-    noised_data_different_seed = generate_missing_data(
-        string_series, config, RANDOMNESS1, "test_missing_data"
+def test_generate_missing_data(dummy_dataset, user_config_path):
+    config = get_configuration(user_config_path)["decennial_census"]["zipcode"][
+        "missing_data"
+    ]
+    data = dummy_dataset["numbers"]
+    noised_data = _validate_seed_and_noise_data(
+        func=generate_missing_data, column=data, config=config
     )
 
-    # Confirm same randomness stream provides same results
-    assert (noised_data == noised_data_same_seed).all()
-
-    # Confirm different streams provide different results
-    assert (noised_data != noised_data_different_seed).any()
+    # Calculate newly missing data, ie data that didn't come in as already missing
+    orig_non_missing_idx = data.index[(data.notna()) & (data != "")]
+    newly_missing_idx = noised_data.index[
+        (noised_data.index.isin(orig_non_missing_idx)) & (noised_data == "")
+    ]
 
     # Check for expected noise level
     expected_noise = config["row_noise_level"]
-    actual_noise = (noised_data == "").mean()
+    actual_noise = len(newly_missing_idx) / len(orig_non_missing_idx)
     assert np.isclose(expected_noise, actual_noise, rtol=0.02)
 
     # Check that un-noised values are unchanged
     not_noised_idx = noised_data.index[noised_data != ""]
     assert "" not in noised_data[not_noised_idx].values
-    assert (string_series[not_noised_idx] == noised_data[not_noised_idx]).all()
+    assert (data[not_noised_idx] == noised_data[not_noised_idx]).all()
 
 
 @pytest.mark.skip(reason="TODO")
-def test_incorrect_selection():
+def test_generate_incorrect_selections():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_copy_from_within_household():
+def test_generate_within_household_copies():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_swap_month_day():
+def test_swap_months_and_days():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_miswrite_zipcode():
+def test_miswrite_zipcodes():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_miswrite_age():
+def test_miswrite_ages():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_miswrite_numeric():
+def test_miswrite_numerics():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_nickname_noise():
+def test_generate_nicknames():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_fake_name_noise():
+def test_generate_fake_names():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_phonetic_noise():
+def test_generate_phonetic_errors():
     pass
 
 
 @pytest.mark.skip(reason="TODO")
-def test_ocr_noise():
+def test_generate_ocr_errors():
     pass
 
 
-@pytest.mark.skip(reason="TODO")
-def test_typographic_noise():
-    pass
+@pytest.mark.parametrize(
+    "column",
+    [
+        "numbers",
+        "characters",
+    ],
+)
+def test_generate_typographical_errors(dummy_dataset, column):
+    data = dummy_dataset[column]
+    config = get_configuration()
+    config.update(
+        {
+            "decennial_census": {
+                column: {
+                    "typographic": {
+                        "row_noise_level": 0.1,
+                        "token_noise_level": 0.1,
+                        "include_original_token_level": 0.1,
+                    },
+                },
+            },
+        }
+    )
+    config = config["decennial_census"][column]["typographic"]
+    noised_data = _validate_seed_and_noise_data(
+        func=generate_typographical_errors, column=data, config=config
+    )
+
+    not_missing_idx = data.index[(data.notna()) & (data != "")]
+    check_original = data.loc[not_missing_idx]
+    check_noised = noised_data.loc[not_missing_idx]
+
+    # Check for expected noise level
+    p_row_noise = config.row_noise_level
+    p_token_noise = config.token_noise_level
+    str_lengths = check_original.str.len()  # pd.Series
+    p_token_not_noised = 1 - p_token_noise
+    p_strings_not_noised = p_token_not_noised**str_lengths  # pd.Series
+    p_strings_noised = 1 - p_strings_not_noised  # pd.Series
+    expected_noise = p_row_noise * p_strings_noised.mean()
+    actual_noise = (check_noised != check_original).mean()
+    assert np.isclose(expected_noise, actual_noise, rtol=0.06)
+
+    # Check for expected string growth due to keeping original noised token
+    assert (check_noised.str.len() >= check_original.str.len()).all()
+    p_include_original_token = config.include_original_token_level
+    p_token_does_not_increase_string_length = 1 - p_token_noise * p_include_original_token
+    p_strings_do_not_increase_length = (
+        p_token_does_not_increase_string_length**str_lengths
+    )  # pd.Series
+    p_strings_increase_length = 1 - p_strings_do_not_increase_length  # pd.Series
+    expected_changed_length = p_row_noise * p_strings_increase_length.mean()
+    actual_changed_length = (check_noised.str.len() != check_original.str.len()).mean()
+    assert np.isclose(expected_changed_length, actual_changed_length, rtol=0.06)
+
+    # Check that we did not touch the missing data
+    assert (
+        data.loc[~data.index.isin(not_missing_idx)]
+        == noised_data.loc[~noised_data.index.isin(not_missing_idx)]
+    ).all()
+
+
+####################
+# HELPER FUNCTIONS #
+####################
+
+
+# TODO: refactor this into its own test parameterized by noise functions
+def _validate_seed_and_noise_data(func, column, config):
+    """Confirms randomness stream behavior and returns the noised data"""
+    noised_data = func(column, config, RANDOMNESS0, f"test_{func.__name__}")
+    noised_data_same_seed = func(column, config, RANDOMNESS0, f"test_{func.__name__}")
+    noised_data_different_seed = func(column, config, RANDOMNESS1, f"test_{func.__name__}")
+
+    assert (noised_data != column).any()
+    assert (noised_data == noised_data_same_seed).all()
+    assert (noised_data != noised_data_different_seed).any()
+
+    return noised_data

From 926c1ea9655d335018d0f3b2734a9046a31e897d Mon Sep 17 00:00:00 2001
From: Matthew Kappel <mkappel@uw.edu>
Date: Wed, 29 Mar 2023 16:31:25 -0700
Subject: [PATCH 4/7] Change is_w2 to tax_form (#21)

- *Category*: bugfix
- *JIRA issue*: [MIC-3933](https://jira.ihme.washington.edu/browse/MIC-3933)

Simply changes is_w2 column to the new tax_form column in the default configuration. No existing tests are impacted.

Testing
Ran noising against data generated with modified PRL outputs. W2 data were noised as expected.
---
 src/pseudopeople/default_configuration.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index f3e81092..8afd8018 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -130,7 +130,7 @@ taxes_w2_and_1099:
     income:
         missing_data:
             row_noise_level: 0.01
-    is_w2:
+    tax_form:
         missing_data:
             row_noise_level: 0.01
     last_name:

From 190cae44c35644a7187945b089d435f3b2ffbc59 Mon Sep 17 00:00:00 2001
From: albrja <37345113+albrja@users.noreply.github.com>
Date: Wed, 29 Mar 2023 18:06:38 -0700
Subject: [PATCH 5/7] Incorrect select noise function (#18)

Implement incorrect select noise function

Adds generate_incorrect_selection to noise functions.
- *Category*: Feature
- *JIRA issue*: [MIC-3873](https://jira.ihme.washington.edu/browse/MIC-3873)

-Adds CSV containing possible values for incorrect selection by column
-Adds paths module
-Adds noise function and test for generate_incorrect_selection

Testing
-Test suites pass successfully and generated decennial census form.
---
 MANIFEST.in                                   |   2 +-
 src/pseudopeople/constants/paths.py           |   1 +
 .../data/incorrect_select_options.csv         |  52 ++++++++
 src/pseudopeople/default_configuration.yaml   |  18 ++-
 src/pseudopeople/entity_types.py              |  13 +-
 src/pseudopeople/interface.py                 |   2 +-
 src/pseudopeople/noise.py                     |   3 +-
 .../{entities.py => noise_entities.py}        |  25 ----
 src/pseudopeople/noise_functions.py           |  72 +++++------
 src/pseudopeople/schema_entities.py           |  27 ++++
 src/pseudopeople/utilities.py                 |  76 +++++++++++-
 tests/integration/test_interface.py           |   2 -
 tests/unit/test_column_noise.py               |  82 ++++++++-----
 tests/unit/test_noise_form.py                 | 116 +++++++++++++++++-
 14 files changed, 380 insertions(+), 111 deletions(-)
 create mode 100644 src/pseudopeople/data/incorrect_select_options.csv
 rename src/pseudopeople/{entities.py => noise_entities.py} (81%)
 create mode 100644 src/pseudopeople/schema_entities.py

diff --git a/MANIFEST.in b/MANIFEST.in
index 6c5b4a02..a02323e8 100644
--- a/MANIFEST.in
+++ b/MANIFEST.in
@@ -8,5 +8,5 @@ include README.rst
 recursive-include docs *
 prune docs/_build
 
-recursive-include src/pseudopeople *.py *.yaml
+recursive-include src/pseudopeople *.py *.yaml *.csv
 recursive-include tests *.py *txt *.yaml
diff --git a/src/pseudopeople/constants/paths.py b/src/pseudopeople/constants/paths.py
index da3b9fb3..81321e45 100644
--- a/src/pseudopeople/constants/paths.py
+++ b/src/pseudopeople/constants/paths.py
@@ -5,4 +5,5 @@
 BASE_DIR = Path(pseudopeople.__file__).resolve().parent
 DATA_ROOT = BASE_DIR / "data"
 
+INCORRECT_SELECT_NOISE_OPTIONS_DATA = DATA_ROOT / "incorrect_select_options.csv"
 QWERTY_ERRORS = DATA_ROOT / "qwerty_errors.yaml"
diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv
new file mode 100644
index 00000000..bb60c488
--- /dev/null
+++ b/src/pseudopeople/data/incorrect_select_options.csv
@@ -0,0 +1,52 @@
+state,relation_to_household_head,sex,race_ethnicity,is_w2,event_type
+AL,Reference person,Female,White,True,creation
+AK,Opp-sex spouse,Male,Black,False,death
+AZ,Opp-sex partner,,Asian,,
+AR,Same-sex spouse,,AIAN,,
+CA,Same-sex partner,,NHOPI,,
+CO,Biological child,,Multiracial or Other,,
+CT,Adopted child,,Latino,,
+DE,Stepchild,,,,
+FL,Sibling,,,,
+GA,Parent,,,,
+HI,Grandchild,,,,
+ID,Parent-in-law,,,,
+IL,Child-in-law,,,,
+IN,Other relative,,,,
+IA,Roommate,,,,
+KS,Foster child,,,,
+KY,Other nonrelative,,,,
+LA,Institutionalized GQ pop,,,,
+ME,Noninstitutionalized GQ pop,,,,
+MD,,,,,
+MA,,,,,
+MI,,,,,
+MN,,,,,
+MS,,,,,
+MO,,,,,
+MT,,,,,
+NE,,,,,
+NV,,,,,
+NH,,,,,
+NJ,,,,,
+NM,,,,,
+NY,,,,,
+NC,,,,,
+ND,,,,,
+OH,,,,,
+OK,,,,,
+OR,,,,,
+PA,,,,,
+RI,,,,,
+SC,,,,,
+SD,,,,,
+TN,,,,,
+TX,,,,,
+UT,,,,,
+VT,,,,,
+VA,,,,,
+WA,,,,,
+WV,,,,,
+WI,,,,,
+WY,,,,,
+DC,,,,,
\ No newline at end of file
diff --git a/src/pseudopeople/default_configuration.yaml b/src/pseudopeople/default_configuration.yaml
index 8afd8018..771ef3d7 100644
--- a/src/pseudopeople/default_configuration.yaml
+++ b/src/pseudopeople/default_configuration.yaml
@@ -72,6 +72,8 @@ decennial_census:
     state:
         missing_data:
             row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
     zipcode:
         missing_data:
             row_noise_level: 0.01
@@ -82,15 +84,19 @@ decennial_census:
     relation_to_household_head:
         missing_data:
             row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
     sex:
         missing_data:
             row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
     race_ethnicity:
         missing_data:
             row_noise_level: 0.01
-    housing_type:
-        missing_data:
+        incorrect_selection:
             row_noise_level: 0.01
+
 taxes_w2_and_1099:
     omission: 0.0145
     duplication: 0.05
@@ -112,6 +118,8 @@ taxes_w2_and_1099:
     employer_state:
         missing_data:
             row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
     employer_street_name:
         missing_data:
             row_noise_level: 0.01
@@ -133,6 +141,8 @@ taxes_w2_and_1099:
     tax_form:
         missing_data:
             row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
     last_name:
         missing_data:
             row_noise_level: 0.01
@@ -142,6 +152,8 @@ taxes_w2_and_1099:
     mailing_address_state:
         missing_data:
             row_noise_level: 0.01
+        incorrect_selection:
+            row_noise_level: 0.01
     mailing_address_street_name:
         missing_data:
             row_noise_level: 0.01
@@ -159,4 +171,4 @@ taxes_w2_and_1099:
             row_noise_level: 0.01
     ssn:
         missing_data:
-            row_noise_level: 0.01
\ No newline at end of file
+            row_noise_level: 0.01
diff --git a/src/pseudopeople/entity_types.py b/src/pseudopeople/entity_types.py
index f5557f49..500cf27c 100644
--- a/src/pseudopeople/entity_types.py
+++ b/src/pseudopeople/entity_types.py
@@ -5,6 +5,8 @@
 from vivarium import ConfigTree
 from vivarium.framework.randomness import RandomnessStream
 
+from pseudopeople.utilities import get_index_to_noise
+
 
 @dataclass
 class RowNoiseType:
@@ -56,4 +58,13 @@ def __call__(
         randomness_stream: RandomnessStream,
         additional_key: Any,
     ) -> pd.Series:
-        return self.noise_function(column, configuration, randomness_stream, additional_key)
+        column = column.copy()
+        noise_level = configuration.row_noise_level
+        to_noise_idx = get_index_to_noise(
+            column, noise_level, randomness_stream, f"{self.name}_{additional_key}"
+        )
+        column.loc[to_noise_idx] = self.noise_function(
+            column.loc[to_noise_idx], configuration, randomness_stream, additional_key
+        )
+
+        return column
diff --git a/src/pseudopeople/interface.py b/src/pseudopeople/interface.py
index 5c53b1ed..d767bc26 100644
--- a/src/pseudopeople/interface.py
+++ b/src/pseudopeople/interface.py
@@ -4,8 +4,8 @@
 
 import pandas as pd
 
-from pseudopeople.entities import Form
 from pseudopeople.noise import noise_form
+from pseudopeople.schema_entities import Form
 from pseudopeople.utilities import get_configuration
 
 
diff --git a/src/pseudopeople/noise.py b/src/pseudopeople/noise.py
index 03ffae68..e5e50bbf 100644
--- a/src/pseudopeople/noise.py
+++ b/src/pseudopeople/noise.py
@@ -14,8 +14,9 @@
 import pandas as pd
 from vivarium import ConfigTree
 
-from pseudopeople.entities import NOISE_TYPES, Form
 from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
+from pseudopeople.noise_entities import NOISE_TYPES
+from pseudopeople.schema_entities import Form
 from pseudopeople.utilities import get_randomness_stream
 
 
diff --git a/src/pseudopeople/entities.py b/src/pseudopeople/noise_entities.py
similarity index 81%
rename from src/pseudopeople/entities.py
rename to src/pseudopeople/noise_entities.py
index 1a41e264..93a367c6 100644
--- a/src/pseudopeople/entities.py
+++ b/src/pseudopeople/noise_entities.py
@@ -5,31 +5,6 @@
 from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
 
 
-# todo: is "form" the right word? Ask RT
-class Form(Enum):
-    CENSUS = "decennial_census"
-    ACS = "american_communities_survey"
-    CPS = "current_population_survey"
-    WIC = "women_infants_and_children"
-    SSA = "social_security"
-    TAX_W2_1099 = "taxes_w2_and_1099"
-    TAX_1040 = "taxes_1040"
-
-
-class __Columns(NamedTuple):
-    FIRST_NAME: str = "first_name"
-    MIDDLE_INITIAL: str = "middle_initial"
-    LAST_NAME: str = "last_name"
-    STREET_NAME: str = "street_name"
-    ZIP_CODE: str = "zipcode"
-    CITY: str = "city"
-    AGE: str = "age"
-    # todo finish filling in columns
-
-
-COLUMNS = __Columns()
-
-
 class __NoiseTypes(NamedTuple):
     """Container for all noise types in the order in which they should be applied:
     omissions, duplications, missing data, incorrect selection, copy from w/in
diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py
index fb4f30da..bfeb0acf 100644
--- a/src/pseudopeople/noise_functions.py
+++ b/src/pseudopeople/noise_functions.py
@@ -7,6 +7,7 @@
 from vivarium.framework.randomness import RandomnessStream
 
 from pseudopeople.constants import paths
+from pseudopeople.utilities import vectorized_choice
 
 
 def omit_rows(
@@ -42,21 +43,36 @@ def duplicate_rows(
 
 
 def generate_incorrect_selections(
-    form_data: pd.DataFrame,
-    configuration: float,
+    column: pd.Series,
+    _: ConfigTree,
     randomness_stream: RandomnessStream,
     additional_key: Any,
-) -> pd.DataFrame:
+) -> pd.Series:
     """
+    Function that takes a categorical series and applies noise so that some values are replaced with other options from
+    a list.
 
-    :param form_data:
-    :param configuration:
-    :param randomness_stream:
+    :param column:  A categorical pd.Series
+    :param _: ConfigTree for this noise type; unused by this function.
+    :param randomness_stream:  RandomnessStream to utilize Vivarium CRN.
     :param additional_key: Key for RandomnessStream
-    :return:
+    :returns: pd.Series where data has been noised with other values from a list of possibilities
     """
-    # todo actually duplicate rows
-    return form_data
+
+    col = column.name
+    selection_options = pd.read_csv(paths.INCORRECT_SELECT_NOISE_OPTIONS_DATA)
+
+    # Get possible noise values
+    # todo: Update with exclusive resampling when vectorized_choice is improved
+    options = selection_options.loc[selection_options[col].notna(), col]
+    new_values = vectorized_choice(
+        options=options,
+        n_to_choose=len(column),
+        randomness_stream=randomness_stream,
+        additional_key=f"{additional_key}_{col}_incorrect_select_choice",
+    ).to_numpy()
+
+    return pd.Series(new_values, index=column.index)
 
 
 def generate_within_household_copies(
@@ -203,34 +219,15 @@ def generate_phonetic_errors(
     return column
 
 
-def generate_missing_data(
-    column: pd.Series,
-    configuration: ConfigTree,
-    randomness_stream: RandomnessStream,
-    additional_key: Any,
-) -> pd.Series:
+def generate_missing_data(column: pd.Series, *_: Any) -> pd.Series:
     """
-    Function that takes a column and blanks out a configurable portion of its data to be missing.
+    Function that takes a column and blanks out all values.
 
     :param column:  pd.Series of data
-    :param configuration: ConfigTree with rate at which to blank the data in column.
-    :param randomness_stream:  RandomnessStream to utilize Vivarium CRN.
-    :param additional_key: Key for RandomnessStream
-    :returns: pd.Series of column with configured amount of data missing as an empty string.
+    :returns: pd.Series of empty strings with the index of column.
     """
 
-    # Avoid SettingWithCopyWarning
-    column = column.copy()
-    to_noise_idx = _get_to_noise_idx(
-        column,
-        configuration,
-        randomness_stream,
-        additional_key,
-        context_key="missing_data_filter",
-    )
-    column.loc[to_noise_idx] = ""
-
-    return column
+    return pd.Series("", index=column.index)
 
 
 def generate_typographical_errors(
@@ -248,8 +245,6 @@ def generate_typographical_errors(
     :param additional_key: Key for RandomnessStream
     :returns: pd.Series of column with noised data
     """
-    column = column.copy()
-    not_missing_idx = column.index[(column.notna()) & (column != "")]
 
     with open(paths.QWERTY_ERRORS) as f:
         qwerty_errors = yaml.full_load(f)
@@ -278,15 +273,8 @@ def keyboard_corrupt(truth, corrupted_pr, addl_pr, rng):
     token_noise_level = configuration.token_noise_level
     include_original_token_level = configuration.include_original_token_level
 
-    to_noise_idx = _get_to_noise_idx(
-        column.loc[not_missing_idx],
-        configuration,
-        randomness_stream,
-        additional_key,
-        context_key="typographical_noise_filter",
-    )
     rng = np.random.default_rng(seed=randomness_stream.seed)
-    for idx in to_noise_idx:
+    for idx in column.index:
         noised_value = keyboard_corrupt(
             column[idx],
             token_noise_level,
diff --git a/src/pseudopeople/schema_entities.py b/src/pseudopeople/schema_entities.py
new file mode 100644
index 00000000..a2e584cf
--- /dev/null
+++ b/src/pseudopeople/schema_entities.py
@@ -0,0 +1,27 @@
+from enum import Enum
+from typing import NamedTuple
+
+
+# todo: is "form" the right word? Ask RT
+class Form(Enum):
+    CENSUS = "decennial_census"
+    ACS = "american_communities_survey"
+    CPS = "current_population_survey"
+    WIC = "women_infants_and_children"
+    SSA = "social_security"
+    TAX_W2_1099 = "taxes_w2_and_1099"
+    TAX_1040 = "taxes_1040"
+
+
+class __Columns(NamedTuple):
+    FIRST_NAME: str = "first_name"
+    MIDDLE_INITIAL: str = "middle_initial"
+    LAST_NAME: str = "last_name"
+    STREET_NAME: str = "street_name"
+    ZIP_CODE: str = "zipcode"
+    CITY: str = "city"
+    AGE: str = "age"
+    # todo finish filling in columns
+
+
+COLUMNS = __Columns()
diff --git a/src/pseudopeople/utilities.py b/src/pseudopeople/utilities.py
index 9cb2acd3..8d779d62 100644
--- a/src/pseudopeople/utilities.py
+++ b/src/pseudopeople/utilities.py
@@ -1,11 +1,12 @@
 from pathlib import Path
-from typing import Union
+from typing import Any, Union
 
+import numpy as np
 import pandas as pd
 from vivarium.framework.configuration import ConfigTree
-from vivarium.framework.randomness import RandomnessStream
+from vivarium.framework.randomness import RandomnessStream, random
 
-from pseudopeople.entities import Form
+from pseudopeople.schema_entities import Form
 
 
 def get_randomness_stream(form: Form, seed: int) -> RandomnessStream:
@@ -32,3 +33,72 @@ def get_configuration(user_yaml_path: Union[Path, str] = None) -> ConfigTree:
     if user_yaml_path:
         noising_configuration.update(user_yaml_path, layer="user")
     return noising_configuration
+
+
+def vectorized_choice(
+    options: Union[list, pd.Series],
+    n_to_choose: int,
+    randomness_stream: RandomnessStream = None,
+    weights: Union[list, pd.Series] = None,
+    additional_key: Any = None,
+    random_seed: int = None,
+):
+    """
+    Function that takes a list of options and uses Vivarium's common random numbers framework to make a given number
+    of random choice selections.
+
+    :param options: List or series of possible values to choose from
+    :param n_to_choose: Number of choices to make, the length of the returned array of values
+    :param randomness_stream: RandomnessStream being used for Vivarium's CRN framework
+    :param weights: List or series containing a weight for each option
+    :param additional_key: Key to pass to randomness_stream
+    :param random_seed: Seed to pass to randomness_stream.
+    Note additional_key and random_seed are used to make calls using a RandomnessStream unique
+
+    returns: ndarray
+    """
+    if not randomness_stream and (additional_key == None and random_seed == None):
+        raise RuntimeError(
+            "An additional_key and a random_seed are required in 'vectorized_choice'"
+            + "if no RandomnessStream is passed in"
+        )
+    if weights is None:
+        n = len(options)
+        weights = np.ones(n) / n
+    # for each of n_to_choose, sample uniformly between 0 and 1
+    index = pd.Index(np.arange(n_to_choose))
+    if randomness_stream is None:
+        # Generate an additional_key on-the-fly and use that in randomness.random
+        additional_key = f"{additional_key}_{random_seed}"
+        probs = random(str(additional_key), index)
+    else:
+        probs = randomness_stream.get_draw(index, additional_key=additional_key)
+
+    # build cdf based on weights
+    pmf = weights / weights.sum()
+    cdf = np.cumsum(pmf)
+
+    # for each p_i in probs, count how many elements of cdf for which p_i >= cdf_i
+    chosen_indices = np.searchsorted(cdf, probs, side="right")
+    return np.take(options, chosen_indices)
+
+
+def get_index_to_noise(
+    column: pd.Series,
+    noise_level: float,
+    randomness_stream: RandomnessStream,
+    additional_key: Any,
+) -> pd.Index:
+    """
+    Function that takes a series and returns a pd.Index of the rows chosen by Vivarium Common Random Numbers to be noised.
+    """
+
+    # Get rows to noise
+    not_empty_idx = column.index[(column != "") & (column.notna())]
+    to_noise_idx = randomness_stream.filter_for_probability(
+        not_empty_idx,
+        probability=noise_level,
+        additional_key=additional_key,
+    )
+
+    return to_noise_idx
diff --git a/tests/integration/test_interface.py b/tests/integration/test_interface.py
index 1501aa8c..294f9d81 100644
--- a/tests/integration/test_interface.py
+++ b/tests/integration/test_interface.py
@@ -1,12 +1,10 @@
 from pathlib import Path
 from typing import Union
 
-import numpy as np
 import pandas as pd
 import pytest
 
 from pseudopeople.interface import generate_decennial_census
-from pseudopeople.utilities import get_configuration
 
 
 # TODO: possibly parametrize Forms?
diff --git a/tests/unit/test_column_noise.py b/tests/unit/test_column_noise.py
index 2ada0f32..9ee09288 100644
--- a/tests/unit/test_column_noise.py
+++ b/tests/unit/test_column_noise.py
@@ -6,20 +6,7 @@
 import pytest
 from vivarium.framework.randomness import RandomnessStream
 
-from pseudopeople.noise_functions import (
-    generate_fake_names,
-    generate_incorrect_selections,
-    generate_missing_data,
-    generate_nicknames,
-    generate_ocr_errors,
-    generate_phonetic_errors,
-    generate_typographical_errors,
-    generate_within_household_copies,
-    miswrite_ages,
-    miswrite_numerics,
-    miswrite_zipcodes,
-    swap_months_and_days,
-)
+from pseudopeople.noise_entities import NOISE_TYPES
 from pseudopeople.utilities import get_configuration
 
 RANDOMNESS0 = RandomnessStream(
@@ -39,7 +26,6 @@ def dummy_dataset():
     # Add missing data from `generate_missing_data` function
     missing_idx = pd.Index([x for x in dummy_idx if x % 3 == 0])
     integer_series.loc[missing_idx] = ""
-
     # Add a column of character strings
     str_length = 6
     character_series = pd.Series(
@@ -56,13 +42,36 @@ def dummy_dataset():
     return pd.DataFrame({"numbers": integer_series, "characters": character_series})
 
 
-def test_generate_missing_data(dummy_dataset, user_config_path):
-    config = get_configuration(user_config_path)["decennial_census"]["zipcode"][
-        "missing_data"
-    ]
+@pytest.fixture(scope="module")
+def categorical_series():
+    return pd.Series(
+        ["CA", "WA", "FL", "OR", "CO", "TX", "NY", "VA", "AZ", "''"] * 100_000, name="state"
+    )
+
+
+@pytest.fixture(scope="module")
+def default_configuration():
+    return get_configuration()
+
+
+def test_generate_missing_data(dummy_dataset):
+
+    config = get_configuration()
+    config.update(
+        {
+            "decennial_census": {
+                "zipcode": {
+                    "missing_data": {
+                        "row_noise_level": 0.25,
+                    },
+                },
+            },
+        }
+    )
+    config = config["decennial_census"]["zipcode"]["missing_data"]
     data = dummy_dataset["numbers"]
     noised_data = _validate_seed_and_noise_data(
-        func=generate_missing_data, column=data, config=config
+        noise_type=NOISE_TYPES.MISSING_DATA, column=data, config=config
     )
 
     # Calculate newly missing data, ie data that didn't come in as already missing
@@ -82,9 +91,24 @@ def test_generate_missing_data(dummy_dataset, user_config_path):
     assert (data[not_noised_idx] == noised_data[not_noised_idx]).all()
 
 
-@pytest.mark.skip(reason="TODO")
-def test_generate_incorrect_selections():
-    pass
+def test_incorrect_selection(categorical_series, default_configuration):
+    config = default_configuration["decennial_census"]["state"]["incorrect_selection"]
+    noised_data = _validate_seed_and_noise_data(
+        noise_type=NOISE_TYPES.INCORRECT_SELECTION, column=categorical_series, config=config
+    )
+
+    # Check for expected noise level
+    expected_noise = config["row_noise_level"]
+    # todo: Update when generate_incorrect_selection uses exclusive resampling
+    # Get real expected noise to account for possibility of noising with original value
+    # Here we have a possibility of choosing any of the 50 states for our categorical series fixture
+    expected_noise = expected_noise * (1 - 1 / 50)
+    actual_noise = (noised_data != categorical_series).mean()
+    assert np.isclose(expected_noise, actual_noise, rtol=0.02)
+
+    original_empty_idx = categorical_series.index[categorical_series == ""]
+    noised_empty_idx = noised_data.index[noised_data == ""]
+    pd.testing.assert_index_equal(original_empty_idx, noised_empty_idx)
 
 
 @pytest.mark.skip(reason="TODO")
@@ -157,7 +181,7 @@ def test_generate_typographical_errors(dummy_dataset, column):
     )
     config = config["decennial_census"][column]["typographic"]
     noised_data = _validate_seed_and_noise_data(
-        func=generate_typographical_errors, column=data, config=config
+        noise_type=NOISE_TYPES.TYPOGRAPHIC, column=data, config=config
     )
 
     not_missing_idx = data.index[(data.notna()) & (data != "")]
@@ -200,11 +224,13 @@ def test_generate_typographical_errors(dummy_dataset, column):
 
 
 # TODO: refactor this into its own test parameterized by noise functions
-def _validate_seed_and_noise_data(func, column, config):
+def _validate_seed_and_noise_data(noise_type, column, config):
     """Confirms randomness stream behavior and returns the noised data"""
-    noised_data = func(column, config, RANDOMNESS0, f"test_{func.__name__}")
-    noised_data_same_seed = func(column, config, RANDOMNESS0, f"test_{func.__name__}")
-    noised_data_different_seed = func(column, config, RANDOMNESS1, f"test_{func.__name__}")
+    noised_data = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}")
+    noised_data_same_seed = noise_type(column, config, RANDOMNESS0, f"test_{noise_type.name}")
+    noised_data_different_seed = noise_type(
+        column, config, RANDOMNESS1, f"test_{noise_type.name}"
+    )
 
     assert (noised_data != column).any()
     assert (noised_data == noised_data_same_seed).all()
diff --git a/tests/unit/test_noise_form.py b/tests/unit/test_noise_form.py
index 8bf58756..1cfb9388 100644
--- a/tests/unit/test_noise_form.py
+++ b/tests/unit/test_noise_form.py
@@ -1,13 +1,17 @@
 import random
 from string import ascii_lowercase
+from typing import NamedTuple
 
+import numpy as np
 import pandas as pd
 import pytest
 from vivarium.config_tree import ConfigTree
 
-from pseudopeople.entities import Form
+from pseudopeople.entity_types import ColumnNoiseType, RowNoiseType
 from pseudopeople.interface import generate_decennial_census
-from pseudopeople.noise import NOISE_TYPES, noise_form
+from pseudopeople.noise import noise_form
+from pseudopeople.noise_entities import NOISE_TYPES
+from pseudopeople.schema_entities import Form
 
 
 @pytest.fixture(scope="module")
@@ -87,18 +91,27 @@ def test_noise_order(mocker, dummy_data, dummy_config_noise_numbers):
     # Mock the noise_functions functions so that they are not actually called and
     # return the original one-column dataframe (so that it doesn't become a mock
     # object itself after the first mocked function is applied.)
+    mocker.patch(
+        "pseudopeople.entity_types.get_index_to_noise", return_value=dummy_data.index
+    )
     for field in NOISE_TYPES._fields:
+        mock_return = (
+            dummy_data[["numbers"]]
+            if field in ["OMISSION", "DUPLICATION"]
+            else dummy_data["numbers"]
+        )
         mock.attach_mock(
             mocker.patch(
                 f"pseudopeople.noise.NOISE_TYPES.{field}.noise_function",
-                return_value=dummy_data[["numbers"]],
+                return_value=mock_return,
             ),
             field,
         )
+
     # FIXME: would be better to mock the form instead of using census
     noise_form(Form.CENSUS, dummy_data, dummy_config_noise_numbers, 0)
 
-    call_order = [call[0] for call in mock.mock_calls]
+    call_order = [x[0] for x in mock.mock_calls if not x[0].startswith("__")]
     expected_call_order = [
         "OMISSION",
         "DUPLICATION",
@@ -160,3 +173,98 @@ def test_correct_forms_are_used(func, form, mocker):
     _ = func("dummy/path")
 
     assert mock.call_args[0][0] == form
+
+
+def test_two_noise_functions_are_independent(mocker):
+
+    # Make simple config tree to test 2 noise functions work together
+    config_tree = ConfigTree(
+        {
+            "decennial_census": {
+                "fake_column_one": {
+                    "alpha": {"row_noise_level": 0.20},
+                    "beta": {"row_noise_level": 0.30},
+                },
+                "fake_column_two": {
+                    "alpha": {"row_noise_level": 0.40},
+                    "beta": {"row_noise_level": 0.50},
+                },
+            }
+        }
+    )
+
+    # Mock objects for testing
+
+    class MockNoiseTypes(NamedTuple):
+        ALPHA: ColumnNoiseType = ColumnNoiseType(
+            "alpha", lambda column, *_: column.str.cat(pd.Series("abc", index=column.index))
+        )
+        BETA: ColumnNoiseType = ColumnNoiseType(
+            "beta", lambda column, *_: column.str.cat(pd.Series("123", index=column.index))
+        )
+
+    mock_noise_types = MockNoiseTypes()
+
+    mocker.patch("pseudopeople.noise.NOISE_TYPES", mock_noise_types)
+    dummy_form = pd.DataFrame(
+        {
+            "fake_column_one": ["cat", "dog", "bird", "bunny", "duck"] * 20_000,
+            "fake_column_two": ["shoe", "pants", "shirt", "hat", "sunglasses"] * 20_000,
+        }
+    )
+
+    noised_data = noise_form(
+        form=Form.CENSUS,
+        form_data=dummy_form,
+        seed=0,
+        configuration=config_tree,
+    )
+
+    # Get config values for testing
+    col1_expected_abc_proportion = config_tree["decennial_census"]["fake_column_one"][
+        "alpha"
+    ]["row_noise_level"]
+    col2_expected_abc_proportion = config_tree["decennial_census"]["fake_column_two"][
+        "alpha"
+    ]["row_noise_level"]
+    col1_expected_123_proportion = config_tree["decennial_census"]["fake_column_one"]["beta"][
+        "row_noise_level"
+    ]
+    col2_expected_123_proportion = config_tree["decennial_census"]["fake_column_two"]["beta"][
+        "row_noise_level"
+    ]
+
+    assert np.isclose(
+        noised_data["fake_column_one"].str.contains("abc").mean(),
+        col1_expected_abc_proportion,
+        rtol=0.01,
+    )
+    assert np.isclose(
+        noised_data["fake_column_two"].str.contains("abc").mean(),
+        col2_expected_abc_proportion,
+        rtol=0.01,
+    )
+    assert np.isclose(
+        noised_data["fake_column_one"].str.contains("123").mean(),
+        col1_expected_123_proportion,
+        rtol=0.01,
+    )
+    assert np.isclose(
+        noised_data["fake_column_two"].str.contains("123").mean(),
+        col2_expected_123_proportion,
+        rtol=0.01,
+    )
+
+    # Assert that columns can experience both noise types (alpha applied before beta)
+    assert np.isclose(
+        noised_data["fake_column_one"].str.contains("abc123").mean(),
+        col1_expected_abc_proportion * col1_expected_123_proportion,
+        rtol=0.01,
+    )
+    assert np.isclose(
+        noised_data["fake_column_two"].str.contains("abc123").mean(),
+        col2_expected_abc_proportion * col2_expected_123_proportion,
+        rtol=0.01,
+    )
+    assert noised_data["fake_column_one"].str.contains("123abc").sum() == 0
+    assert noised_data["fake_column_two"].str.contains("123abc").sum() == 0

From 86a5722fbfdce9e4be12a356d8ab17ad78f630b2 Mon Sep 17 00:00:00 2001
From: albrja <37345113+albrja@users.noreply.github.com>
Date: Thu, 30 Mar 2023 12:12:20 -0700
Subject: [PATCH 6/7] Change tax_form column name and data values. (#22)

Update to incorrect_select_options.csv

Updates data to align with changes to is_w2 (now tax_form) column in post-processing.
- *Category*: Other
- *JIRA issue*: [MIC-3937](https://jira.ihme.washington.edu/browse/MIC-3937)

-Changes is_w2 column to tax_form
-Changes data values from bool to "W2" and "1099"

Testing
All tests pass.
---
 .../data/incorrect_select_options.csv             |  6 +++---
 src/pseudopeople/noise_functions.py               | 15 ---------------
 2 files changed, 3 insertions(+), 18 deletions(-)

diff --git a/src/pseudopeople/data/incorrect_select_options.csv b/src/pseudopeople/data/incorrect_select_options.csv
index bb60c488..e4939387 100644
--- a/src/pseudopeople/data/incorrect_select_options.csv
+++ b/src/pseudopeople/data/incorrect_select_options.csv
@@ -1,6 +1,6 @@
-state,relation_to_household_head,sex,race_ethnicity,is_w2,event_type
-AL,Reference person,Female,White,True,creation
-AK,Opp-sex spouse,Male,Black,False,death
+state,relation_to_household_head,sex,race_ethnicity,tax_form,event_type
+AL,Reference person,Female,White,W2,creation
+AK,Opp-sex spouse,Male,Black,1099,death
 AZ,Opp-sex partner,,Asian,,
 AR,Same-sex spouse,,AIAN,,
 CA,Same-sex partne,,NHOPI,,
diff --git a/src/pseudopeople/noise_functions.py b/src/pseudopeople/noise_functions.py
index bfeb0acf..b4b8110a 100644
--- a/src/pseudopeople/noise_functions.py
+++ b/src/pseudopeople/noise_functions.py
@@ -302,18 +302,3 @@ def generate_ocr_errors(
     """
     # todo actually generate OCR errors
     return column
-
-
-####################
-# HELPER FUNCTIONS #
-####################
-def _get_to_noise_idx(column, configuration, randomness_stream, additional_key, context_key):
-    noise_level = configuration.row_noise_level
-    # Get rows to noise
-    to_noise_idx = randomness_stream.filter_for_probability(
-        column.index,
-        probability=noise_level,
-        additional_key=f"{additional_key}_{context_key}",
-    )
-
-    return to_noise_idx

From 9cdf118eaf6b19ddab9340a7ec11cd334c645528 Mon Sep 17 00:00:00 2001
From: Rajan Mudambi <11376379+rmudambi@users.noreply.github.com>
Date: Fri, 31 Mar 2023 13:24:48 -0700
Subject: [PATCH 7/7] release candidate v0.2.0 (#23)

---
 CHANGELOG.rst                 | 6 ++++++
 src/pseudopeople/__about__.py | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.rst b/CHANGELOG.rst
index 6a1e6fd4..806058a5 100644
--- a/CHANGELOG.rst
+++ b/CHANGELOG.rst
@@ -1,3 +1,9 @@
+**0.2.0 - 03/31/23**
+
+ - Implemented W2/1099 forms
+ - Implemented typographic noise function
+ - Implemented incorrect selection noise function
+
 **0.1.0 - 03/23/23**
 
  - Initial release
diff --git a/src/pseudopeople/__about__.py b/src/pseudopeople/__about__.py
index 7644dc29..c8ae9993 100644
--- a/src/pseudopeople/__about__.py
+++ b/src/pseudopeople/__about__.py
@@ -13,7 +13,7 @@
 __summary__ = "pseudopeople is package which adds noise to simulated census-scale data using standard scientific Python tools."
 __uri__ = "https://github.com/ihmeuw/pseudopeople"
 
-__version__ = "0.1.0"
+__version__ = "0.2.0"
 
 __author__ = "The pseudopeople developers"
 __email__ = "vivarium.dev@gmail.com"