ihmeuw · rmudambi · Oct 25, 2023 · Oct 24, 2023 · Oct 18, 2023 · Oct 18, 2023
@@ -1,3 +1,6 @@
+**0.8.0 - 10/24/23**
+ - Improve performance of dataset generation functions
+
 **0.7.2 - 10/16/23**
  - Drop support for python 3.8
  - Fix bug in "Choose the wrong option" noise type

@@ -1,5 +1,5 @@
 from dataclasses import dataclass
-from typing import Any, Callable, Dict, List, Optional
+from typing import Any, Callable, Dict, List, Optional, Tuple
 
 import pandas as pd
 from loguru import logger
@@ -51,8 +51,12 @@ class ColumnNoiseType:
     The noise function takes as input a DataFrame, the ConfigTree object for this
     ColumnNoise operation, a RandomnessStream for controlling randomness, and
     a column name, which is the column that will be noised and whose name will be used
-    as the additional key for the RandomnessStream. It applies the noising operation
-    to the Series and returns the modified Series.
+    as the additional key for the RandomnessStream.
+    Optionally, it can take a pre-existing DataFrame indicating where there is missingness
+    in the data (same index and columns as the main DataFrame, all boolean type) --
+    if this is not passed, it calculates it, which can be expensive for large data.
+    It applies the noising operation to the Series and returns both the modified Series
+    and an Index of which items in the Series were selected for noise.
     """
 
     name: str
@@ -69,10 +73,11 @@ def __call__(
         randomness_stream: RandomnessStream,
         dataset_name: str,
         column_name: str,
-    ) -> pd.Series:
+        missingness: Optional[pd.DataFrame] = None,
+    ) -> Tuple[pd.Series, pd.Index]:
         if data[column_name].empty:
-            return data[column_name]
-        data = data.copy()
+            return data[column_name], pd.Index([])
+
         noise_level = configuration[
             Keys.CELL_PROBABILITY
         ] * self.noise_level_scaling_function(data, column_name)
@@ -85,13 +90,14 @@ def __call__(
             randomness_stream,
             f"{self.name}_{column_name}",
             is_column_noise=True,
+            missingness=missingness,
         )
         if to_noise_idx.empty:
             logger.debug(
                 f"No cells chosen to noise for noise function {self.name} on column {column_name}. "
                 "This is likely due to a combination of the configuration noise levels and the simulated population data."
             )
-            return data[column_name]
+            return data[column_name], to_noise_idx
         noised_data = self.noise_function(
             data.loc[to_noise_idx],
             configuration,
@@ -104,6 +110,7 @@ def __call__(
         if noised_data.dtype.name != data[column_name].dtype.name:
             noised_data = noised_data.astype(data[column_name].dtype)
 
-        data.loc[to_noise_idx, column_name] = noised_data
+        result = data[column_name].copy()
+        result.loc[to_noise_idx] = noised_data
 
-        return data[column_name]
+        return result, to_noise_idx
@@ -9,7 +9,11 @@
 from pseudopeople import __version__ as psp_version
 from pseudopeople.configuration import get_configuration
 from pseudopeople.constants import paths
-from pseudopeople.constants.metadata import COPY_HOUSEHOLD_MEMBER_COLS, INT_COLUMNS
+from pseudopeople.constants.metadata import (
+    COPY_HOUSEHOLD_MEMBER_COLS,
+    DATEFORMATS,
+    INT_COLUMNS,
+)
 from pseudopeople.exceptions import DataSourceError
 from pseudopeople.loader import load_standard_dataset_file
 from pseudopeople.noise import noise_dataset
@@ -162,11 +166,33 @@ def _reformat_dates_for_noising(data: pd.DataFrame, dataset: Dataset):
         # to copy from a household member
         for column in [date_column, COPY_HOUSEHOLD_MEMBER_COLS.get(date_column)]:
             if column in data.columns:
-                data[column] = data[column].dt.strftime(dataset.date_format)
+                # Avoid running strftime on large data, since that will
+                # re-parse the format string for each row
+                # https://github.com/pandas-dev/pandas/issues/44764
+                # Year is already guaranteed to be 4-digit: https://pandas.pydata.org/pandas-docs/stable/user_guide/timeseries.html#timeseries-timestamp-limits
+                year_string = data[column].dt.year.astype(str)
+                month_string = _zfill_fast(data[column].dt.month.astype(str), 2)
+                day_string = _zfill_fast(data[column].dt.day.astype(str), 2)
+                if dataset.date_format == DATEFORMATS.YYYYMMDD:
+                    data[column] = year_string + month_string + day_string
+                elif dataset.date_format == DATEFORMATS.MM_DD_YYYY:
+                    data[column] = month_string + "/" + day_string + "/" + year_string
+                elif dataset.date_format == DATEFORMATS.MMDDYYYY:
+                    data[column] = month_string + day_string + year_string
+                else:
+                    raise ValueError(f"Invalid date format in {dataset.name}.")
 
     return data
 
 
+def _zfill_fast(col: pd.Series, desired_length: int) -> pd.Series:
+    """Zero-pads col to desired_length like col.str.zfill, but vectorized; longer values keep only their last desired_length characters."""
+    # The most zeroes that could ever be needed would be desired_length
+    maximum_padding = ("0" * desired_length) + col
+    # Now trim to only the zeroes needed
+    return maximum_padding.str[-desired_length:]
+
+
 def _extract_columns(columns_to_keep, noised_dataset):
     """Helper function for test mocking purposes"""
     if columns_to_keep:

@@ -51,6 +51,11 @@ def noise_dataset(
     randomness = get_randomness_stream(dataset.name, seed, dataset_data.index)
 
     noise_configuration = configuration[dataset.name]
+
+    # We only need to do this once, because noise does not introduce missingness,
+    # except for the leave_blank kind which is special-cased below
+    missingness = (dataset_data == "") | (dataset_data.isna())
+
     for noise_type in tqdm(NOISE_TYPES, desc="Applying noise", unit="type", leave=False):
         if isinstance(noise_type, RowNoiseType):
             if (
@@ -64,6 +69,7 @@ def noise_dataset(
                     noise_configuration[Keys.ROW_NOISE][noise_type.name],
                     randomness,
                 )
+                missingness = missingness.loc[dataset_data.index].copy()
 
         elif isinstance(noise_type, ColumnNoiseType):
             if Keys.COLUMN_NOISE in noise_configuration:
@@ -76,13 +82,17 @@ def noise_dataset(
                 # Apply column noise to each column as appropriate
                 for column in columns_to_noise:
                     required_cols = [column] + noise_type.additional_column_getter(column)
-                    dataset_data[column] = noise_type(
+                    dataset_data[column], index_noised = noise_type(
                         dataset_data[required_cols],
                         noise_configuration.column_noise[column][noise_type.name],
                         randomness,
                         dataset.name,
                         column,
+                        missingness=missingness[required_cols],
                     )
+                    if noise_type == NOISE_TYPES.leave_blank:
+                        # The only situation in which more missingness is introduced
+                        missingness.loc[index_noised, column] = True
         else:
             raise TypeError(
                 f"Invalid noise type. Allowed types are {RowNoiseType} and "