diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index c85b195a..1ca63090 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -2,7 +2,6 @@ from __future__ import annotations import math -import warnings from collections import defaultdict from operator import itemgetter from typing import cast @@ -10,10 +9,13 @@ import datasketches from pandas import DataFrame, Series +from .. import dp_logging from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import CategoricalOptions +logger = dp_logging.get_child_logger(__name__) + class CategoricalColumn(BaseColumnProfiler["CategoricalColumn"]): """ @@ -306,24 +308,27 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: other_profile._categories.items(), key=itemgetter(1), reverse=True ) ) - if cat_count1.keys() == cat_count2.keys(): - total_psi = 0.0 - for key in cat_count1.keys(): - perc_A = cat_count1[key] / self.sample_size - perc_B = cat_count2[key] / other_profile.sample_size - total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A) - differences["statistics"]["psi"] = total_psi - else: - warnings.warn( - "psi was not calculated due to the differences in categories " - "of the profiles. Differences:\n" - f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}", - RuntimeWarning, - ) + ( + self_cat_count, + other_cat_count, + ) = self._preprocess_for_categorical_psi_calculation( + self_cat_count=cat_count1, + other_cat_count=cat_count2, + ) + + total_psi = 0.0 + for iter_key in self_cat_count.keys(): + percent_self = self_cat_count[iter_key] / self.sample_size + percent_other = other_cat_count[iter_key] / other_profile.sample_size + if (percent_other != 0) and (percent_self != 0): + total_psi += (percent_other - percent_self) * math.log( + percent_other / percent_self + ) + differences["statistics"]["psi"] = total_psi differences["statistics"][ "categorical_count" - ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2) + ] = profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count) return differences @@ -431,6 +436,25 @@ def is_match(self) -> bool: is_match = True return is_match + def _preprocess_for_categorical_psi_calculation( + self, self_cat_count, other_cat_count + ): + super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys()) + if self_cat_count.keys() != other_cat_count.keys(): + logger.info( + f"""PSI data pre-processing found that categories between + the profiles were not equal. Both profiles do not contain + the following categories {super_set_categories}.""" + ) + + for iter_key in super_set_categories: + for iter_dictionary in [self_cat_count, other_cat_count]: + try: + iter_dictionary[iter_key] = iter_dictionary[iter_key] + except KeyError: + iter_dictionary[iter_key] = 0 + return self_cat_count, other_cat_count + def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float): """Return boolean given stop conditions. diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 5bdbbb83..2b751d57 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -701,6 +701,7 @@ def test_gini_impurity(self): self.assertEqual(profile.gini_impurity, None) def test_categorical_diff(self): + # test psi new category in another profile df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) @@ -720,21 +721,17 @@ def test_categorical_diff(self): "categories": [[], ["y", "n"], ["maybe"]], "gini_impurity": -0.16326530612244894, "unalikeability": -0.19047619047619047, - "categorical_count": {"y": 1, "n": 1, "maybe": [None, 2]}, + "categorical_count": {"y": 1, "n": 1, "maybe": -2}, "chi2-test": { "chi2-statistic": 82 / 35, "df": 2, "p-value": 0.3099238764710244, }, + "psi": 0.0990210257942779, }, } - with self.assertWarnsRegex( - RuntimeWarning, - "psi was not calculated due to the differences in categories " - "of the profiles. Differences:\n{'maybe'}", - ): - test_profile_diff = profile.diff(profile2) - self.assertDictEqual(expected_diff, test_profile_diff) + actual_diff = profile.diff(profile2) + self.assertDictEqual(expected_diff, actual_diff) # Test with one categorical column matching df_not_categorical = pd.Series( @@ -770,10 +767,6 @@ def test_categorical_diff(self): profile2 = CategoricalColumn(df_categorical.name) profile2.update(df_categorical) - # chi2-statistic = sum((observed-expected)^2/expected for each category in each column) - # df = categories - 1 - # psi = (% of records based on Sample (A) - % of records Sample (B)) * ln(A/ B) - # p-value found through using chi2 CDF expected_diff = { "categorical": "unchanged", "statistics": { diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 35617d1e..46b0212d 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -500,12 +500,13 @@ def test_column_stats_profile_compiler_stats_diff(self): "categories": [["1"], ["9"], ["10"]], "gini_impurity": 0.06944444444444448, "unalikeability": 0.16666666666666663, - "categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]}, + "categorical_count": {"9": -1, "1": 1, "10": -1}, "chi2-test": { "chi2-statistic": 2.1, "df": 2, "p-value": 0.3499377491111554, }, + "psi": 0.009815252971365292, }, } self.assertDictEqual(expected_diff, compiler1.diff(compiler2))