From b269fe700f2e18d927d7a053a66535a26ad61ff0 Mon Sep 17 00:00:00 2001
From: taylorfturner <taylorfturner@gmail.com>
Date: Sat, 23 Sep 2023 01:30:55 -0400
Subject: [PATCH 1/7] fix bug: calculate PSI when profile categories differ

---
 .../profilers/categorical_column_profile.py   | 41 +++++++++++--------
 .../test_categorical_column_profile.py        | 17 +++-----
 .../test_column_profile_compilers.py          |  3 +-
 3 files changed, 32 insertions(+), 29 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index c85b195a1..70186c654 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -2,7 +2,6 @@
 from __future__ import annotations
 
 import math
-import warnings
 from collections import defaultdict
 from operator import itemgetter
 from typing import cast
@@ -306,24 +305,24 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
                     other_profile._categories.items(), key=itemgetter(1), reverse=True
                 )
             )
-            if cat_count1.keys() == cat_count2.keys():
-                total_psi = 0.0
-                for key in cat_count1.keys():
-                    perc_A = cat_count1[key] / self.sample_size
-                    perc_B = cat_count2[key] / other_profile.sample_size
-                    total_psi += (perc_B - perc_A) * math.log(perc_B / perc_A)
-                    differences["statistics"]["psi"] = total_psi
-            else:
-                warnings.warn(
-                    "psi was not calculated due to the differences in categories "
-                    "of the profiles. Differences:\n"
-                    f"{set(cat_count1.keys()) ^ set(cat_count2.keys())}",
-                    RuntimeWarning,
-                )
+            self_cat_count, other_cat_count = self._preprocess_for_categorical_psi_calculation(
+                self_cat_count=cat_count1,
+                other_cat_count=cat_count2,
+            )
+
+            total_psi = 0.0
+            for iter_key in self_cat_count.keys():
+                percent_self = self_cat_count[iter_key] / self.sample_size
+                percent_other = other_cat_count[iter_key] / other_profile.sample_size
+                try:
+                    total_psi += (percent_other - percent_self) * math.log(percent_other / percent_self)
+                except Exception:
+                    total_psi += 0.0
+                differences["statistics"]["psi"] = total_psi
 
             differences["statistics"][
                 "categorical_count"
-            ] = profiler_utils.find_diff_of_dicts(cat_count1, cat_count2)
+            ] = profiler_utils.find_diff_of_dicts(self_cat_count, other_cat_count)
 
         return differences
 
@@ -431,6 +430,16 @@ def is_match(self) -> bool:
             is_match = True
         return is_match
 
+    def _preprocess_for_categorical_psi_calculation(self, self_cat_count, other_cat_count):
+        super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys())
+        for iter_key in super_set_categories:
+            for iter_dictionary in [self_cat_count, other_cat_count]:
+                try:
+                    iter_dictionary[iter_key] = iter_dictionary[iter_key]
+                except KeyError:
+                    iter_dictionary[iter_key] = 0
+        return self_cat_count, other_cat_count
+
     def _check_stop_condition_is_met(self, sample_size: int, unqiue_ratio: float):
         """Return boolean given stop conditions.
 
diff --git a/dataprofiler/tests/profilers/test_categorical_column_profile.py b/dataprofiler/tests/profilers/test_categorical_column_profile.py
index 5bdbbb83c..2b751d573 100644
--- a/dataprofiler/tests/profilers/test_categorical_column_profile.py
+++ b/dataprofiler/tests/profilers/test_categorical_column_profile.py
@@ -701,6 +701,7 @@ def test_gini_impurity(self):
         self.assertEqual(profile.gini_impurity, None)
 
     def test_categorical_diff(self):
+        # Test PSI when the other profile contains a category this one lacks
         df_categorical = pd.Series(["y", "y", "y", "y", "n", "n", "n"])
         profile = CategoricalColumn(df_categorical.name)
         profile.update(df_categorical)
@@ -720,21 +721,17 @@ def test_categorical_diff(self):
                 "categories": [[], ["y", "n"], ["maybe"]],
                 "gini_impurity": -0.16326530612244894,
                 "unalikeability": -0.19047619047619047,
-                "categorical_count": {"y": 1, "n": 1, "maybe": [None, 2]},
+                "categorical_count": {"y": 1, "n": 1, "maybe": -2},
                 "chi2-test": {
                     "chi2-statistic": 82 / 35,
                     "df": 2,
                     "p-value": 0.3099238764710244,
                 },
+                "psi": 0.0990210257942779,
             },
         }
-        with self.assertWarnsRegex(
-            RuntimeWarning,
-            "psi was not calculated due to the differences in categories "
-            "of the profiles. Differences:\n{'maybe'}",
-        ):
-            test_profile_diff = profile.diff(profile2)
-        self.assertDictEqual(expected_diff, test_profile_diff)
+        actual_diff = profile.diff(profile2)
+        self.assertDictEqual(expected_diff, actual_diff)
 
         # Test with one categorical column matching
         df_not_categorical = pd.Series(
@@ -770,10 +767,6 @@ def test_categorical_diff(self):
         profile2 = CategoricalColumn(df_categorical.name)
         profile2.update(df_categorical)
 
-        # chi2-statistic = sum((observed-expected)^2/expected for each category in each column)
-        # df = categories - 1
-        # psi = (% of records based on Sample (A) - % of records  Sample (B)) * ln(A/ B)
-        # p-value found through using chi2 CDF
         expected_diff = {
             "categorical": "unchanged",
             "statistics": {
diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py
index 35617d1e8..6561b4988 100644
--- a/dataprofiler/tests/profilers/test_column_profile_compilers.py
+++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py
@@ -500,12 +500,13 @@ def test_column_stats_profile_compiler_stats_diff(self):
                 "categories": [["1"], ["9"], ["10"]],
                 "gini_impurity": 0.06944444444444448,
                 "unalikeability": 0.16666666666666663,
-                "categorical_count": {"9": -1, "1": [1, None], "10": [None, 1]},
+                "categorical_count": {"9": -1, "1": 1, "10": -1},
                 "chi2-test": {
                     "chi2-statistic": 2.1,
                     "df": 2,
                     "p-value": 0.3499377491111554,
                 },
+                "psi": 0.009815252971365292
             },
         }
         self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

From bd9f11c999515ee7f178da092ef458dbd6403a56 Mon Sep 17 00:00:00 2001
From: taylorfturner <taylorfturner@gmail.com>
Date: Sat, 23 Sep 2023 01:12:55 -0400
Subject: [PATCH 2/7] reformatting pre-commit

---
 .../profilers/categorical_column_profile.py         | 13 ++++++++++---
 .../profilers/test_column_profile_compilers.py      |  2 +-
 2 files changed, 11 insertions(+), 4 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 70186c654..1148e26d7 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -305,7 +305,10 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
                     other_profile._categories.items(), key=itemgetter(1), reverse=True
                 )
             )
-            self_cat_count, other_cat_count = self._preprocess_for_categorical_psi_calculation(
+            (
+                self_cat_count,
+                other_cat_count,
+            ) = self._preprocess_for_categorical_psi_calculation(
                 self_cat_count=cat_count1,
                 other_cat_count=cat_count2,
             )
@@ -315,7 +318,9 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
                 percent_self = self_cat_count[iter_key] / self.sample_size
                 percent_other = other_cat_count[iter_key] / other_profile.sample_size
                 try:
-                    total_psi += (percent_other - percent_self) * math.log(percent_other / percent_self)
+                    total_psi += (percent_other - percent_self) * math.log(
+                        percent_other / percent_self
+                    )
                 except Exception:
                     total_psi += 0.0
                 differences["statistics"]["psi"] = total_psi
@@ -430,7 +435,9 @@ def is_match(self) -> bool:
             is_match = True
         return is_match
 
-    def _preprocess_for_categorical_psi_calculation(self, self_cat_count, other_cat_count):
+    def _preprocess_for_categorical_psi_calculation(
+        self, self_cat_count, other_cat_count
+    ):
         super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys())
         for iter_key in super_set_categories:
             for iter_dictionary in [self_cat_count, other_cat_count]:
diff --git a/dataprofiler/tests/profilers/test_column_profile_compilers.py b/dataprofiler/tests/profilers/test_column_profile_compilers.py
index 6561b4988..46b0212d3 100644
--- a/dataprofiler/tests/profilers/test_column_profile_compilers.py
+++ b/dataprofiler/tests/profilers/test_column_profile_compilers.py
@@ -506,7 +506,7 @@ def test_column_stats_profile_compiler_stats_diff(self):
                     "df": 2,
                     "p-value": 0.3499377491111554,
                 },
-                "psi": 0.009815252971365292
+                "psi": 0.009815252971365292,
             },
         }
         self.assertDictEqual(expected_diff, compiler1.diff(compiler2))

From 9aed67c158bd222b1e99d62563a40ec6ac7a6bb8 Mon Sep 17 00:00:00 2001
From: taylorfturner <taylorfturner@gmail.com>
Date: Sat, 23 Sep 2023 09:41:14 -0400
Subject: [PATCH 3/7] clean up and remove try/except

---
 dataprofiler/profilers/categorical_column_profile.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 1148e26d7..32b31828c 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -317,12 +317,12 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
             for iter_key in self_cat_count.keys():
                 percent_self = self_cat_count[iter_key] / self.sample_size
                 percent_other = other_cat_count[iter_key] / other_profile.sample_size
-                try:
+                if (percent_other == 0) or (percent_self == 0):
+                    total_psi += 0.0
+                else:
                     total_psi += (percent_other - percent_self) * math.log(
                         percent_other / percent_self
                     )
-                except Exception:
-                    total_psi += 0.0
                 differences["statistics"]["psi"] = total_psi
 
             differences["statistics"][

From dd1aa6fd8b190549cfcd154565fda14972701a54 Mon Sep 17 00:00:00 2001
From: taylorfturner <taylorfturner@gmail.com>
Date: Sat, 23 Sep 2023 10:31:26 -0400
Subject: [PATCH 4/7] pre-commit fix; log when PSI category sets differ

---
 dataprofiler/profilers/categorical_column_profile.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 32b31828c..c2aecebda 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -9,10 +9,13 @@
 import datasketches
 from pandas import DataFrame, Series
 
+from .. import dp_logging
 from . import profiler_utils
 from .base_column_profilers import BaseColumnProfiler
 from .profiler_options import CategoricalOptions
 
+logger = dp_logging.get_child_logger(__name__)
+
 
 class CategoricalColumn(BaseColumnProfiler["CategoricalColumn"]):
     """
@@ -439,6 +442,15 @@ def _preprocess_for_categorical_psi_calculation(
         self, self_cat_count, other_cat_count
     ):
         super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys())
+        if (super_set_categories != self_cat_count.keys()) or (
+            super_set_categories != other_cat_count.keys()
+        ):
+            logger.info(
+                f"""PSI data pre-processing found that categories between
+                    the profiles were not equal. Both profiles not contain
+                    the following categories {super_set_categories}."""
+            )
+
         for iter_key in super_set_categories:
             for iter_dictionary in [self_cat_count, other_cat_count]:
                 try:

From 1b42df0b257c5464b118731eac260bba5b8530d0 Mon Sep 17 00:00:00 2001
From: taylorfturner <taylorfturner@gmail.com>
Date: Sat, 23 Sep 2023 11:11:57 -0400
Subject: [PATCH 5/7] typo fix

---
 dataprofiler/profilers/categorical_column_profile.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index c2aecebda..ca0e6a4b9 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -447,7 +447,7 @@ def _preprocess_for_categorical_psi_calculation(
         ):
             logger.info(
                 f"""PSI data pre-processing found that categories between
-                    the profiles were not equal. Both profiles not contain
+                    the profiles were not equal. Both profiles do not contain
                     the following categories {super_set_categories}."""
             )
 

From 0b47910e479be0f552f698e4f5677549e3d23826 Mon Sep 17 00:00:00 2001
From: Taylor Turner <taylorfturner@gmail.com>
Date: Mon, 25 Sep 2023 09:46:37 -0400
Subject: [PATCH 6/7] Update
 dataprofiler/profilers/categorical_column_profile.py

Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com>
---
 dataprofiler/profilers/categorical_column_profile.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index ca0e6a4b9..6c6f8a314 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -320,9 +320,7 @@ def diff(self, other_profile: CategoricalColumn, options: dict = None) -> dict:
             for iter_key in self_cat_count.keys():
                 percent_self = self_cat_count[iter_key] / self.sample_size
                 percent_other = other_cat_count[iter_key] / other_profile.sample_size
-                if (percent_other == 0) or (percent_self == 0):
-                    total_psi += 0.0
-                else:
+                if (percent_other != 0) and (percent_self != 0):
                     total_psi += (percent_other - percent_self) * math.log(
                         percent_other / percent_self
                     )

From bd770ea6563bd90124ac10b524aa9e6814062275 Mon Sep 17 00:00:00 2001
From: Taylor Turner <taylorfturner@gmail.com>
Date: Mon, 25 Sep 2023 09:46:44 -0400
Subject: [PATCH 7/7] Update
 dataprofiler/profilers/categorical_column_profile.py

Co-authored-by: Michael Davis <36012613+micdavis@users.noreply.github.com>
---
 dataprofiler/profilers/categorical_column_profile.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
index 6c6f8a314..1ca630900 100644
--- a/dataprofiler/profilers/categorical_column_profile.py
+++ b/dataprofiler/profilers/categorical_column_profile.py
@@ -440,9 +440,7 @@ def _preprocess_for_categorical_psi_calculation(
         self, self_cat_count, other_cat_count
     ):
         super_set_categories = set(self_cat_count.keys()) | set(other_cat_count.keys())
-        if (super_set_categories != self_cat_count.keys()) or (
-            super_set_categories != other_cat_count.keys()
-        ):
+        if self_cat_count.keys() != other_cat_count.keys():
             logger.info(
                 f"""PSI data pre-processing found that categories between
                     the profiles were not equal. Both profiles do not contain