Skip to content

Commit

Permalink
Adding PSI to diff report (#688)
Browse files Browse the repository at this point in the history
* placeholder

* adding placeholders

* DRY-er code

* logic placeholder

* reformat

* end of day catch up

* catch up

* catch up

* conditionally regen

* conditionally regen

* black reformatting

* black reformatting

* commit

* commit

* commit

* commit

* typo

* typo in diff()

* typo in diff()

* fix logic

* fix logic in

* formatting

* catch up

* catch up

* cond None

* conditionality

* outside try/except

* formatting

* clean up

* take out config condition on diff psi

* test runs

* revert

* clean up

* remove unneeded changes in expected values

* only do 10 bins for PSI

* suggested to 9 so ultimately truly 10 bins

* fix values expected

* fix values expected

* pre-commit

* comment resolution

* adding options for num_psi_bins

* clean up conditional logic

* clean up white space

* clean up logic in second case

* remove lingering maxDiff = None

* format

* docstring and .match_count

* psi clean up bin edges and max/min

* psi values based on manual calc

* psi values based on manual calc

* clean up logic on regen_hist

* clean up logic on regen_hist

* clean up variable

* clean up ifs

* classmethod

* classmethod

* fix debugging remnant
  • Loading branch information
taylorfturner authored Nov 2, 2022
1 parent da43bbf commit 6a407af
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 43 deletions.
228 changes: 193 additions & 35 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,12 @@ def diff(self, other_profile: NumericStatsMixin, options: Dict = None) -> Dict:
other_profile.variance,
other_profile.match_count,
),
"psi": self._calculate_psi(
self.match_count,
self._stored_histogram["histogram"],
other_profile.match_count,
other_profile._stored_histogram["histogram"],
),
}
return differences

Expand Down Expand Up @@ -516,6 +522,144 @@ def _perform_t_test(
results["welch"]["p-value"] = float(welch_p_val)
return results

def _preprocess_for_calculate_psi(
self,
self_histogram,
other_histogram,
):
new_self_histogram = {"bin_counts": None, "bin_edges": None}
new_other_histogram = {"bin_counts": None, "bin_edges": None}
regenerate_histogram = False
num_psi_bins = 10

if (
isinstance(self_histogram["bin_counts"], np.ndarray)
and isinstance(self_histogram["bin_edges"], np.ndarray)
and isinstance(other_histogram["bin_counts"], np.ndarray)
and isinstance(other_histogram["bin_edges"], np.ndarray)
):
regenerate_histogram = True
min_min_edge = min(
self_histogram["bin_edges"][0],
other_histogram["bin_edges"][0],
)
max_max_edge = max(
self_histogram["bin_edges"][-1],
other_histogram["bin_edges"][-1],
)

if regenerate_histogram:
new_self_histogram["bin_counts"] = self_histogram["bin_counts"]
new_self_histogram["bin_edges"] = self_histogram["bin_edges"]
new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
new_other_histogram["bin_counts"] = other_histogram["bin_counts"]

len_self_bin_counts = 0
if len(self_histogram["bin_counts"]) > 0:
len_self_bin_counts = len(self_histogram["bin_counts"])

# re-calculate `self` histogram
if not len_self_bin_counts == num_psi_bins:
histogram, hist_loss = self._regenerate_histogram(
bin_counts=self_histogram["bin_counts"],
bin_edges=self_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
},
)
new_self_histogram["bin_counts"] = histogram["bin_counts"]
new_self_histogram["bin_edges"] = histogram["bin_edges"]

# re-calculate `other_profile` histogram
histogram_edges_not_equal = False
all_array_values_equal = (
other_histogram["bin_edges"] == self_histogram["bin_edges"]
).all()
if not all_array_values_equal:
histogram_edges_not_equal = True

if histogram_edges_not_equal:
histogram, hist_loss = self._regenerate_histogram(
bin_counts=other_histogram["bin_counts"],
bin_edges=other_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
},
)

new_other_histogram["bin_edges"] = histogram["bin_edges"]
new_other_histogram["bin_counts"] = histogram["bin_counts"]

return new_self_histogram, new_other_histogram

def _calculate_psi(
    self,
    self_match_count: int,
    self_histogram: np.ndarray,
    other_match_count: int,
    other_histogram: np.ndarray,
) -> Optional[float]:
    """
    Calculate PSI (Population Stability Index).

    ```
    PSI = SUM((other_pcnt - self_pcnt) * ln(other_pcnt / self_pcnt))
    ```

    PSI Breakpoint Thresholds:
    - PSI < 0.1: no significant population change
    - 0.1 < PSI < 0.2: moderate population change
    - PSI >= 0.2: significant population change

    :param self_match_count: self.match_count
    :type self_match_count: int
    :param self_histogram: self._stored_histogram["histogram"]
    :type self_histogram: np.ndarray
    :param other_match_count: other_profile.match_count
    :type other_match_count: int
    :param other_histogram: other_profile._stored_histogram["histogram"]
    :type other_histogram: np.ndarray
    :return: psi_value
    :rtype: Optional[float]
    """
    psi_value = 0

    # Rebin both histograms onto a shared support so bins align one-to-one.
    new_self_histogram, new_other_histogram = self._preprocess_for_calculate_psi(
        self_histogram=self_histogram,
        other_histogram=other_histogram,
    )

    # Preprocessing leaves edges as None when either profile lacks a usable
    # histogram; PSI cannot be computed in that case.
    if (
        new_other_histogram["bin_edges"] is None
        or new_self_histogram["bin_edges"] is None
    ):
        warnings.warn(
            "No edges available in at least one histogram for calculating `PSI`",
            RuntimeWarning,
        )
        return None

    bin_count: int = 0  # required typing by mypy
    for iter_value, bin_count in enumerate(new_self_histogram["bin_counts"]):

        self_percent = bin_count / self_match_count
        other_percent = (
            new_other_histogram["bin_counts"][iter_value] / other_match_count
        )
        # Both bins empty: contributes nothing and ln(0/0) is undefined.
        if (self_percent == other_percent) and self_percent == 0:
            continue

        iter_psi = (other_percent - self_percent) * np.log(
            other_percent / self_percent
        )
        # Skip zero terms and the infinite term produced when exactly one
        # side of a bin is empty.
        if iter_psi and iter_psi != float("inf"):
            psi_value += iter_psi

    return psi_value

def _update_variance(
self, batch_mean: float, batch_var: float, batch_count: int
) -> float:
Expand Down Expand Up @@ -1059,48 +1203,19 @@ def _update_histogram(self, df_series: pd.Series) -> None:
self._stored_histogram["current_loss"] = histogram_loss
self._stored_histogram["total_loss"] += histogram_loss

def _histogram_for_profile(
self, histogram_method: str
def _regenerate_histogram(
self, bin_counts, bin_edges, suggested_bin_count, options=None
) -> Tuple[Dict[str, np.ndarray], float]:
"""
Convert the stored histogram into the presentable state.
Based on the suggested histogram bin count from numpy.histograms.
The bin count used is stored in 'suggested_bin_count' for each method.
:param histogram_method: method to use for determining the histogram
profile
:type histogram_method: str
:return: histogram bin edges and bin counts
:rtype: dict
"""
bin_counts, bin_edges = (
self._stored_histogram["histogram"]["bin_counts"],
self._stored_histogram["histogram"]["bin_edges"],
)

current_bin_counts, suggested_bin_count = (
self.histogram_methods[histogram_method]["histogram"]["bin_counts"],
self.histogram_methods[histogram_method]["suggested_bin_count"],
)

# base case, no need to change if it is already correct
if not self._has_histogram or current_bin_counts is not None:
return (
self.histogram_methods[histogram_method]["histogram"],
self.histogram_methods[histogram_method]["total_loss"],
)
elif len(bin_counts) == suggested_bin_count:
return (
self._stored_histogram["histogram"],
self._stored_histogram["total_loss"],
)

# create proper binning
new_bin_counts = np.zeros((suggested_bin_count,))
new_bin_edges = np.linspace(
bin_edges[0], bin_edges[-1], suggested_bin_count + 1
)
if options:
new_bin_edges = np.linspace(
options["min_edge"], options["max_edge"], suggested_bin_count + 1
)

# allocate bin_counts
new_bin_id = 0
Expand Down Expand Up @@ -1159,6 +1274,49 @@ def _histogram_for_profile(

return ({"bin_edges": new_bin_edges, "bin_counts": new_bin_counts}, hist_loss)

def _histogram_for_profile(
self, histogram_method: str
) -> Tuple[Dict[str, np.ndarray], float]:
"""
Convert the stored histogram into the presentable state.
Based on the suggested histogram bin count from numpy.histograms.
The bin count used is stored in 'suggested_bin_count' for each method.
:param histogram_method: method to use for determining the histogram
profile
:type histogram_method: str
:return: histogram bin edges and bin counts
:rtype: dict
"""
bin_counts, bin_edges = (
self._stored_histogram["histogram"]["bin_counts"],
self._stored_histogram["histogram"]["bin_edges"],
)

current_bin_counts, suggested_bin_count = (
self.histogram_methods[histogram_method]["histogram"]["bin_counts"],
self.histogram_methods[histogram_method]["suggested_bin_count"],
)

# base case, no need to change if it is already correct
if not self._has_histogram or current_bin_counts is not None:
return (
self.histogram_methods[histogram_method]["histogram"],
self.histogram_methods[histogram_method]["total_loss"],
)
elif len(bin_counts) == suggested_bin_count:
return (
self._stored_histogram["histogram"],
self._stored_histogram["total_loss"],
)

return self._regenerate_histogram(
bin_counts=bin_counts,
bin_edges=bin_edges,
suggested_bin_count=suggested_bin_count,
)

def _get_best_histogram_for_profile(self) -> Dict:
"""
Convert the stored histogram into the presentable state.
Expand Down
8 changes: 5 additions & 3 deletions dataprofiler/tests/profilers/test_column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,10 @@ def test_diff_primitive_compilers(self):
"stddev": 3.285085839971525,
"t-test": {
"t-statistic": 0.4155260166386663,
"conservative": {"df": 1, "p-value": 0.749287157907667},
"conservative": {"df": 1.0, "p-value": 0.749287157907667},
"welch": {"df": 3.6288111187629117, "p-value": 0.7011367179395704},
},
"psi": 0.17328679513998632,
},
}
profile_diff = compiler1.diff(compiler2)
Expand Down Expand Up @@ -247,7 +248,7 @@ def test_disabling_columns_during_primitive_diff(self):
"sum": -20.0,
"mean": -10.0,
"median": -10,
"mode": [[-2, -1, 1, 2], [], [5, 15]],
"mode": [[-2.0, -1.0, 1.0, 2.0], [], [5, 15]],
"median_absolute_deviation": -3.5,
"variance": -46.666666666666664,
"stddev": data1.astype(int).std() - data2.astype(int).std(),
Expand All @@ -262,9 +263,10 @@ def test_disabling_columns_during_primitive_diff(self):
},
"t-test": {
"t-statistic": -1.9674775073518591,
"conservative": {"df": 1, "p-value": 0.29936264581081673},
"conservative": {"df": 1.0, "p-value": 0.29936264581081673},
"welch": {"df": 1.0673824509440946, "p-value": 0.28696889329266506},
},
"psi": 0,
},
}
profile_diff = compiler1.diff(compiler2)
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ def test_diff(self):
"precision": {
"min": 1,
"max": 1,
"mean": 1,
"mean": 1.0,
"var": profile1["precision"]["var"] - profile2["precision"]["var"],
"std": profile1["precision"]["std"] - profile2["precision"]["std"],
"sample_size": -1,
Expand All @@ -1698,9 +1698,10 @@ def test_diff(self):
},
"t-test": {
"t-statistic": 0.5393164101529813,
"conservative": {"df": 2, "p-value": 0.643676756587475},
"conservative": {"df": 2.0, "p-value": 0.643676756587475},
"welch": {"df": 4.999127432888682, "p-value": 0.6128117908944144},
},
"psi": 0,
}
profile_diff = profiler1.diff(profiler2)
try:
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,14 +1053,15 @@ def test_diff(self):
"stddev": -7.899494936611665,
"sum": -4.0,
"variance": -94.0,
"median": -4,
"median": -4.0,
"mode": [[2, 6, 4], [], [1, 15]],
"median_absolute_deviation": -5,
"t-test": {
"t-statistic": -0.5638091828819275,
"conservative": {"df": 1, "p-value": 0.6731699660830497},
"conservative": {"df": 1.0, "p-value": 0.6731699660830497},
"welch": {"df": 1.0547717074524683, "p-value": 0.6691886269547123},
},
"psi": 0.0675775180180274,
}
profile_diff = profiler1.diff(profiler2)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ def test_diff(self):
"conservative": {"df": 9, "p-value": 0.7039643545772609},
"welch": {"df": 25.945257024943864, "p-value": 0.6980401261750298},
},
"psi": None,
}

difference = other1.diff(other2)
Expand Down Expand Up @@ -836,6 +837,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -885,6 +887,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -933,6 +936,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -980,6 +984,7 @@ def test_diff(self):
"conservative": {"df": 9, "p-value": 0.011958658754358975},
"welch": {"df": 25.945257024943864, "p-value": 0.004201616692122823},
},
"psi": None,
}
difference = other1.diff(other2)
self.assertDictEqual(expected_diff, difference)
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,9 +587,10 @@ def test_diff(self):
),
"t-test": {
"t-statistic": -1.9339958714826413,
"conservative": {"df": 8, "p-value": 0.08916903961929257},
"conservative": {"df": 8.0, "p-value": 0.08916903961929257},
"welch": {"df": 15.761400272034564, "p-value": 0.07127621949432528},
},
"psi": 0.7211391539728152,
}

profile_diff = profiler1.diff(profiler2)
Expand Down

0 comments on commit 6a407af

Please sign in to comment.