From b269fe700f2e18d927d7a053a66535a26ad61ff0 Mon Sep 17 00:00:00 2001 From: taylorfturner Date: Sat, 23 Sep 2023 01:30:55 -0400 Subject: [PATCH 1/7] fix bug --- .../profilers/categorical_column_profile.py | 41 +++++++++++-------- .../test_categorical_column_profile.py | 17 +++----- .../test_column_profile_compilers.py | 3 +- 3 files changed, 32 insertions(+), 29 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index c85b195a1..70186c654 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -2,7 +2,6 @@ from __future__ import annotations import math -import warnings from collections import defaultdict from operator import itemgetter from typing import cast @@ -306,24 +305,24 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: other_profile._categories.items(), key=itemgetter(1), reverse=True ) ) - if cat_count1.keys() == cat_count2.keys(): - total_psi = 0.0 - for key in cat_count1.keys(): - perc_A = cat_count1[key] / self.sample_size - perc_B = cat_count2[key] / other_profile.sample_size - total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A) - differences["statistics"]["psi"] = total_psi - else: - warnings.warn( - "psi was not calculated due to the differences in categories " - "of the profiles. Differences:\n" - f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}", - RuntimeWarning, - ) + self_cat_count, other_cat_count = self._preprocess_for_categorical_psi_calculation( + self_cat_count=cat_count1, + other_cat_count=cat_count2, + ) + + total_psi = 0.0 + for iter_key in self_cat_count.keys(): + percent_self = self_cat_count[iter_key] / self.sample_size + percent_other = other_cat_count[iter_key] / other_profile.sample_size + try: + total_psi += (percent_other - percent_self) * math.log(percent_other / percent_self) + except Exception: + total_psi += 0.0 + differences["statistics"]["psi"] = total_psi differences["statistics"][ "categorical_count" - ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2) + ] = profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count) return differences @@ -431,6 +430,16 @@ def is_match(self) -> bool: is_match = True return is_match + def _preprocess_for_categorical_psi_calculation(self, self_cat_count, other_cat_count): + super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys()) + for iter_key in super_set_categories: + for iter_dictionary in [self_cat_count, other_cat_count]: + try: + iter_dictionary[iter_key] = iter_dictionary[iter_key] + except KeyError: + iter_dictionary[iter_key] = 0 + return self_cat_count, other_cat_count + def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float): """Return boolean given stop conditions. diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py index 5bdbbb83c..2b751d573 100644 --- a/dataprofiler/tests/profilers/test_categorical_column_profile.py +++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py @@ -701,6 +701,7 @@ def test_gini_impurity(self): self.assertEqual(profile.gini_impurity, None) def test_categorical_diff(self): + # test psi new category in another profile df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"]) profile = CategoricalColumn(df_categorical.name) profile.update(df_categorical) @@ -720,21 +721,17 @@ def test_categorical_diff(self): "categories": [[], ["y", "n"], ["maybe"]], "gini_impurity": -0.16326530612244894, "unalikeability": -0.19047619047619047, - "categorical_count": {"y": 1, "n": 1, "maybe": [None, 2]}, + "categorical_count": {"y": 1, "n": 1, "maybe": -2}, "chi2-test": { "chi2-statistic": 82 / 35, "df": 2, "p-value": 0.3099238764710244, }, + "psi": 0.0990210257942779, }, } - with self.assertWarnsRegex( - RuntimeWarning, - "psi was not calculated due to the differences in categories " - "of the profiles. Differences:\n{'maybe'}", - ): - test_profile_diff = profile.diff(profile2) - self.assertDictEqual(expected_diff, test_profile_diff) + actual_diff = profile.diff(profile2) + self.assertDictEqual(expected_diff, actual_diff) # Test with one categorical column matching df_not_categorical = pd.Series( @@ -770,10 +767,6 @@ def test_categorical_diff(self): profile2 = CategoricalColumn(df_categorical.name) profile2.update(df_categorical) - # chi2-statistic = sum((observed-expected)^2/expected for each category in each column) - # df = categories - 1 - # psi = (% of records based on Sample (A) - % of records Sample (B)) * ln(A/ B) - # p-value found through using chi2 CDF expected_diff = { "categorical": "unchanged", "statistics": { diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 35617d1e8..6561b4988 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -500,12 +500,13 @@ def test_column_stats_profile_compiler_stats_diff(self): "categories": [["1"], ["9"], ["10"]], "gini_impurity": 0.06944444444444448, "unalikeability": 0.16666666666666663, - "categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]}, + "categorical_count": {"9": -1, "1": 1, "10": -1}, "chi2-test": { "chi2-statistic": 2.1, "df": 2, "p-value": 0.3499377491111554, }, + "psi": 0.009815252971365292 }, } self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) From bd9f11c999515ee7f178da092ef458dbd6403a56 Mon Sep 17 00:00:00 2001 From: taylorfturner Date: Sat, 23 Sep 2023 01:12:55 -0400 Subject: [PATCH 2/7] reformatting pre-commit --- .../profilers/categorical_column_profile.py | 13 ++++++++++--- .../profilers/test_column_profile_compilers.py | 2 +- 2 files changed, 11 insertions(+), 4 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 70186c654..1148e26d7 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -305,7 +305,10 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: other_profile._categories.items(), key=itemgetter(1), reverse=True ) ) - self_cat_count, other_cat_count = self._preprocess_for_categorical_psi_calculation( + ( + self_cat_count, + other_cat_count, + ) = self._preprocess_for_categorical_psi_calculation( self_cat_count=cat_count1, other_cat_count=cat_count2, ) @@ -315,7 +318,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: percent_self = self_cat_count[iter_key] / self.sample_size percent_other = other_cat_count[iter_key] / other_profile.sample_size try: - total_psi += (percent_other - percent_self) * math.log(percent_other / percent_self) + total_psi += (percent_other - percent_self) * math.log( + percent_other / percent_self + ) except Exception: total_psi += 0.0 differences["statistics"]["psi"] = total_psi @@ -430,7 +435,9 @@ def is_match(self) -> bool: is_match = True return is_match - def _preprocess_for_categorical_psi_calculation(self, self_cat_count, other_cat_count): + def _preprocess_for_categorical_psi_calculation( + self, self_cat_count, other_cat_count + ): super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys()) for iter_key in super_set_categories: for iter_dictionary in [self_cat_count, other_cat_count]: diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py index 6561b4988..46b0212d3 100644 --- a/dataprofiler/tests/profilers/test_column_profile_compilers.py +++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py @@ -506,7 +506,7 @@ def test_column_stats_profile_compiler_stats_diff(self): "df": 2, "p-value": 0.3499377491111554, }, - "psi": 0.009815252971365292 + "psi": 0.009815252971365292, }, } self.assertDictEqual(expected_diff, compiler1.diff(compiler2)) From 9aed67c158bd222b1e99d62563a40ec6ac7a6bb8 Mon Sep 17 00:00:00 2001 From: taylorfturner Date: Sat, 23 Sep 2023 09:41:14 -0400 Subject: [PATCH 3/7] clean up and remove try/except --- dataprofiler/profilers/categorical_column_profile.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 1148e26d7..32b31828c 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -317,12 +317,12 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: for iter_key in self_cat_count.keys(): percent_self = self_cat_count[iter_key] / self.sample_size percent_other = other_cat_count[iter_key] / other_profile.sample_size - try: + if (percent_other == 0) or (percent_self == 0): + total_psi += 0.0 + else: total_psi += (percent_other - percent_self) * math.log( percent_other / percent_self ) - except Exception: - total_psi += 0.0 differences["statistics"]["psi"] = total_psi differences["statistics"][ From dd1aa6fd8b190549cfcd154565fda14972701a54 Mon Sep 17 00:00:00 2001 From: taylorfturner Date: Sat, 23 Sep 2023 10:31:26 -0400 Subject: [PATCH 4/7] pre-commit fix --- dataprofiler/profilers/categorical_column_profile.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 32b31828c..c2aecebda 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -9,10 +9,13 @@ import datasketches from pandas import DataFrame, Series +from .. import dp_logging from . import profiler_utils from .base_column_profilers import BaseColumnProfiler from .profiler_options import CategoricalOptions +logger = dp_logging.get_child_logger(__name__) + class CategoricalColumn(BaseColumnProfiler["CategoricalColumn"]): """ @@ -439,6 +442,15 @@ def _preprocess_for_categorical_psi_calculation( self, self_cat_count, other_cat_count ): super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys()) + if (super_set_categories != self_cat_count.keys()) or ( + super_set_categories != other_cat_count.keys() + ): + logger.info( + f"""PSI data pre-processing found that categories between + the profiles were not equal. Both profiles not contain + the following categories {super_set_categories}.""" + ) + for iter_key in super_set_categories: for iter_dictionary in [self_cat_count, other_cat_count]: try: From 1b42df0b257c5464b118731eac260bba5b8530d0 Mon Sep 17 00:00:00 2001 From: taylorfturner Date: Sat, 23 Sep 2023 11:11:57 -0400 Subject: [PATCH 5/7] typo fix --- dataprofiler/profilers/categorical_column_profile.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index c2aecebda..ca0e6a4b9 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -447,7 +447,7 @@ def _preprocess_for_categorical_psi_calculation( ): logger.info( f"""PSI data pre-processing found that categories between - the profiles were not equal. Both profiles not contain + the profiles were not equal. Both profiles do not contain the following categories {super_set_categories}.""" ) From 0b47910e479be0f552f698e4f5677549e3d23826 Mon Sep 17 00:00:00 2001 From: Taylor Turner Date: Mon, 25 Sep 2023 09:46:37 -0400 Subject: [PATCH 6/7] Update dataprofiler/profilers/categorical_column_profile.py Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com> --- dataprofiler/profilers/categorical_column_profile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index ca0e6a4b9..6c6f8a314 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -320,9 +320,7 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict: for iter_key in self_cat_count.keys(): percent_self = self_cat_count[iter_key] / self.sample_size percent_other = other_cat_count[iter_key] / other_profile.sample_size - if (percent_other == 0) or (percent_self == 0): - total_psi += 0.0 - else: + if (percent_other != 0) and (percent_self != 0): total_psi += (percent_other - percent_self) * math.log( percent_other / percent_self ) From bd770ea6563bd90124ac10b524aa9e6814062275 Mon Sep 17 00:00:00 2001 From: Taylor Turner Date: Mon, 25 Sep 2023 09:46:44 -0400 Subject: [PATCH 7/7] Update dataprofiler/profilers/categorical_column_profile.py Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com> --- dataprofiler/profilers/categorical_column_profile.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py index 6c6f8a314..1ca630900 100644 --- a/dataprofiler/profilers/categorical_column_profile.py +++ b/dataprofiler/profilers/categorical_column_profile.py @@ -440,9 +440,7 @@ def _preprocess_for_categorical_psi_calculation( self, self_cat_count, other_cat_count ): super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys()) - if (super_set_categories != self_cat_count.keys()) or ( - super_set_categories != other_cat_count.keys() - ): + if self_cat_count.keys() != other_cat_count.keys(): logger.info( f"""PSI data pre-processing found that categories between the profiles were not equal. Both profiles do not contain