Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Optimize noise functions #333

Merged
merged 13 commits into from
Oct 25, 2023
3 changes: 3 additions & 0 deletions CHANGELOG.rst
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
**0.8.0 - 10/24/23**
- Improve performance of dataset generation functions

**0.7.2 - 10/16/23**
- Drop support for python 3.8
- Fix bug in "Choose the wrong option" noise type
Expand Down
25 changes: 16 additions & 9 deletions src/pseudopeople/entity_types.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
from dataclasses import dataclass
from typing import Any, Callable, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional, Tuple

import pandas as pd
from loguru import logger
Expand Down Expand Up @@ -51,8 +51,12 @@ class ColumnNoiseType:
The noise function takes as input a DataFrame, the ConfigTree object for this
ColumnNoise operation, a RandomnessStream for controlling randomness, and
a column name, which is the column that will be noised and whose name will be used
as the additional key for the RandomnessStream. It applies the noising operation
to the Series and returns the modified Series.
as the additional key for the RandomnessStream.
Optionally, it can take a pre-existing DataFrame indicating where there is missingness
in the data (same index and columns as the main DataFrame, all boolean type) --
if this is not passed, it calculates it, which can be expensive for large data.
It applies the noising operation to the Series and returns both the modified Series
and an Index of which items in the Series were selected for noise.
"""

name: str
Expand All @@ -69,10 +73,11 @@ def __call__(
randomness_stream: RandomnessStream,
dataset_name: str,
column_name: str,
) -> pd.Series:
missingness: Optional[pd.DataFrame] = None,
) -> Tuple[pd.Series, pd.Index]:
zmbc marked this conversation as resolved.
Show resolved Hide resolved
if data[column_name].empty:
return data[column_name]
data = data.copy()
return data[column_name], pd.Index([])

noise_level = configuration[
Keys.CELL_PROBABILITY
] * self.noise_level_scaling_function(data, column_name)
Expand All @@ -85,13 +90,14 @@ def __call__(
randomness_stream,
f"{self.name}_{column_name}",
is_column_noise=True,
missingness=missingness,
)
if to_noise_idx.empty:
logger.debug(
f"No cells chosen to noise for noise function {self.name} on column {column_name}. "
"This is likely due to a combination of the configuration noise levels and the simulated population data."
)
return data[column_name]
return data[column_name], to_noise_idx
noised_data = self.noise_function(
data.loc[to_noise_idx],
configuration,
Expand All @@ -104,6 +110,7 @@ def __call__(
if noised_data.dtype.name != data[column_name].dtype.name:
noised_data = noised_data.astype(data[column_name].dtype)

data.loc[to_noise_idx, column_name] = noised_data
result = data[column_name].copy()
result.loc[to_noise_idx] = noised_data

return data[column_name]
return result, to_noise_idx
30 changes: 28 additions & 2 deletions src/pseudopeople/interface.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,11 @@
from pseudopeople import __version__ as psp_version
from pseudopeople.configuration import get_configuration
from pseudopeople.constants import paths
from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS, INT_COLUMNS
from pseudopeople.constants.metadata import (
COPY_HOUSEHOLD_MEMBER_COLS,
DATEFORMATS,
INT_COLUMNS,
)
from pseudopeople.exceptions import DataSourceError
from pseudopeople.loader import load_standard_dataset_file
from pseudopeople.noise import noise_dataset
Expand Down Expand Up @@ -162,11 +166,33 @@ def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
# to copy from a household member
for column in [date_column, COPY_HOUSEHOLD_MEMBER_COLS.get(date_column)]:
if column in data.columns:
data[column] = data[column].dt.strftime(dataset.date_format)
# Avoid running strftime on large data, since that will
# re-parse the format string for each row
# https://github.com/pandas-dev/pandas/issues/44764
# Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
year_string = data[column].dt.year.astype(str)
month_string = _zfill_fast(data[column].dt.month.astype(str), 2)
day_string = _zfill_fast(data[column].dt.day.astype(str), 2)
if dataset.date_format == DATEFORMATS.YYYYMMDD:
data[column] = year_string + month_string + day_string
elif dataset.date_format == DATEFORMATS.MM_DD_YYYY:
data[column] = month_string + "/" + day_string + "/" + year_string
elif dataset.date_format == DATEFORMATS.MMDDYYYY:
data[column] = month_string + day_string + year_string
else:
raise ValueError(f"Invalid date format in {dataset.name}.")

return data


def _zfill_fast(col: pd.Series, desired_length: int) -> pd.Series:
    """Left-pad each string in ``col`` with zeroes to at least ``desired_length``.

    Stand-in for ``col.str.zfill(desired_length)`` built from Series-level
    string concatenation and slicing. Values that are already
    ``desired_length`` characters or longer are returned unchanged rather than
    truncated. Unlike ``str.zfill``, a leading sign character gets no special
    treatment, so this is only meant for unsigned digit strings.

    :param col: Series of strings to pad.
    :param desired_length: Minimum length of each resulting string.
    :return: Series with the same index as ``col`` holding the padded strings.
    """
    # The most zeroes that could ever be needed would be desired_length
    maximum_padding = ("0" * desired_length) + col
    # Now trim to only the zeroes needed
    trimmed = maximum_padding.str[-desired_length:]
    # Slicing would cut leading characters off values longer than
    # desired_length (and off every value when desired_length is negative),
    # so only accept the trimmed result where the original value fits.
    fits = (col.str.len() <= desired_length).fillna(False)
    return trimmed.where(fits, col)


def _extract_columns(columns_to_keep, noised_dataset):
"""Helper function for test mocking purposes"""
if columns_to_keep:
Expand Down
12 changes: 11 additions & 1 deletion src/pseudopeople/noise.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,11 @@ def noise_dataset(
randomness = get_randomness_stream(dataset.name, seed, dataset_data.index)

noise_configuration = configuration[dataset.name]

# We only need to do this once, because noise does not introduce missingness,
# except for the leave_blank kind which is special-cased below
missingness = (dataset_data == "") | (dataset_data.isna())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

So the speedup is calculating missingness once for the entire dataframe instead of ad hoc as needed per column? It really sped things up that much?

Copy link
Collaborator Author

@zmbc zmbc Oct 24, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The old way was calculating it for each noise type for each column. Even with the profiling I did it's a bit hard to directly measure the impact of this change since it changes the structure of things, but the total time spent in the isna method was cut in half.


for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type", leave=False):
if isinstance(noise_type, RowNoiseType):
if (
Expand All @@ -64,6 +69,7 @@ def noise_dataset(
noise_configuration[Keys.ROW_NOISE][noise_type.name],
randomness,
)
missingness = missingness.loc[dataset_data.index].copy()

elif isinstance(noise_type, ColumnNoiseType):
if Keys.COLUMN_NOISE in noise_configuration:
Expand All @@ -76,13 +82,17 @@ def noise_dataset(
# Apply column noise to each column as appropriate
for column in columns_to_noise:
required_cols = [column] + noise_type.additional_column_getter(column)
dataset_data[column] = noise_type(
dataset_data[column], index_noised = noise_type(
dataset_data[required_cols],
noise_configuration.column_noise[column][noise_type.name],
randomness,
dataset.name,
column,
missingness=missingness[required_cols],
)
if noise_type == NOISE_TYPES.leave_blank:
# The only situation in which more missingness is introduced
missingness.loc[index_noised, column] = True
else:
raise TypeError(
f"Invalid noise type. Allowed types are {RowNoiseType} and "
Expand Down
Loading