Skip to content

Commit

Permalink
Adding PSI to diff report (#688)
Browse files Browse the repository at this point in the history
* placeholder

* adding placeholders

* DRY-er code

* logic placeholder

* reformat

* end of day catch up

* catch up

* catch up

* conditionally regen

* conditionally regen

* black reformatting

* black reformatting

* commit

* commit

* commit

* commit

* typo

* typo in diff()

* typo in diff()

* fix logic

* fix logic in

* formatting

* catch up

* catch up

* cond None

* conditionality

* outside try/except

* formatting

* clean up

* take out config condition on diff psi

* test runs

* revert

* clean up

* remove unneeded changes in expected values

* only do 10 bins for PSI

* suggested to 9 so ultimately truly 10 bins

* fix values expected

* fix values expected

* pre-commit

* comment resolution

* adding options for num_psi_bins

* clean up conditional logic

* clean up white space

* clean up logic in second case

* remove lingering maxDiff = None

* format

* docstring and .match_count

* psi clean up bin edges and max/min

* psi values based on manual calc

* psi values based on manual calc

* clean up logic on regen_hist

* clean up logic on regen_hist

* clean up variable

* clean up ifs

* classmethod

* classmethod

* fix debugging remnant
  • Loading branch information
taylorfturner authored Nov 2, 2022
1 parent da43bbf commit 6a407af
Show file tree
Hide file tree
Showing 6 changed files with 211 additions and 43 deletions.
228 changes: 193 additions & 35 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,12 @@ def diff(self, other_profile: NumericStatsMixin, options: Dict = None) -> Dict:
other_profile.variance,
other_profile.match_count,
),
"psi": self._calculate_psi(
self.match_count,
self._stored_histogram["histogram"],
other_profile.match_count,
other_profile._stored_histogram["histogram"],
),
}
return differences

Expand Down Expand Up @@ -516,6 +522,144 @@ def _perform_t_test(
results["welch"]["p-value"] = float(welch_p_val)
return results

def _preprocess_for_calculate_psi(
self,
self_histogram,
other_histogram,
):
new_self_histogram = {"bin_counts": None, "bin_edges": None}
new_other_histogram = {"bin_counts": None, "bin_edges": None}
regenerate_histogram = False
num_psi_bins = 10

if (
isinstance(self_histogram["bin_counts"], np.ndarray)
and isinstance(self_histogram["bin_edges"], np.ndarray)
and isinstance(other_histogram["bin_counts"], np.ndarray)
and isinstance(other_histogram["bin_edges"], np.ndarray)
):
regenerate_histogram = True
min_min_edge = min(
self_histogram["bin_edges"][0],
other_histogram["bin_edges"][0],
)
max_max_edge = max(
self_histogram["bin_edges"][-1],
other_histogram["bin_edges"][-1],
)

if regenerate_histogram:
new_self_histogram["bin_counts"] = self_histogram["bin_counts"]
new_self_histogram["bin_edges"] = self_histogram["bin_edges"]
new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
new_other_histogram["bin_counts"] = other_histogram["bin_counts"]

len_self_bin_counts = 0
if len(self_histogram["bin_counts"]) > 0:
len_self_bin_counts = len(self_histogram["bin_counts"])

# re-calculate `self` histogram
if not len_self_bin_counts == num_psi_bins:
histogram, hist_loss = self._regenerate_histogram(
bin_counts=self_histogram["bin_counts"],
bin_edges=self_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
},
)
new_self_histogram["bin_counts"] = histogram["bin_counts"]
new_self_histogram["bin_edges"] = histogram["bin_edges"]

# re-calculate `other_profile` histogram
histogram_edges_not_equal = False
all_array_values_equal = (
other_histogram["bin_edges"] == self_histogram["bin_edges"]
).all()
if not all_array_values_equal:
histogram_edges_not_equal = True

if histogram_edges_not_equal:
histogram, hist_loss = self._regenerate_histogram(
bin_counts=other_histogram["bin_counts"],
bin_edges=other_histogram["bin_edges"],
suggested_bin_count=num_psi_bins,
options={
"min_edge": min_min_edge,
"max_edge": max_max_edge,
},
)

new_other_histogram["bin_edges"] = histogram["bin_edges"]
new_other_histogram["bin_counts"] = histogram["bin_counts"]

return new_self_histogram, new_other_histogram

def _calculate_psi(
    self,
    self_match_count: int,
    self_histogram: np.ndarray,
    other_match_count: int,
    other_histogram: np.ndarray,
) -> Optional[float]:
    """
    Calculate PSI (Population Stability Index).

    ```
    PSI = SUM((other_pcnt - self_pcnt) * ln(other_pcnt / self_pcnt))
    ```

    PSI Breakpoint Thresholds:
    - PSI < 0.1: no significant population change
    - 0.1 < PSI < 0.2: moderate population change
    - PSI >= 0.2: significant population change

    :param self_match_count: self.match_count
    :type self_match_count: int
    :param self_histogram: self._stored_histogram["histogram"]
    :type self_histogram: np.ndarray
    :param other_match_count: other_profile.match_count
    :type other_match_count: int
    :param other_histogram: other_profile._stored_histogram["histogram"]
    :type other_histogram: np.ndarray
    :return: psi_value
    :rtype: Optional[float]
    """
    psi_value = 0

    # Rebin both histograms onto a shared support so bins align one-to-one.
    new_self_histogram, new_other_histogram = self._preprocess_for_calculate_psi(
        self_histogram=self_histogram,
        other_histogram=other_histogram,
    )

    # Preprocessing leaves edges as None when either profile lacks a usable
    # histogram; PSI cannot be computed in that case.
    if (
        new_other_histogram["bin_edges"] is None
        or new_self_histogram["bin_edges"] is None
    ):
        warnings.warn(
            "No edges available in at least one histogram for calculating `PSI`",
            RuntimeWarning,
        )
        return None

    bin_count: int = 0  # required typing by mypy
    for iter_value, bin_count in enumerate(new_self_histogram["bin_counts"]):

        self_percent = bin_count / self_match_count
        other_percent = (
            new_other_histogram["bin_counts"][iter_value] / other_match_count
        )
        # Both bins empty: contributes nothing and ln(0/0) is undefined.
        if (self_percent == other_percent) and self_percent == 0:
            continue

        iter_psi = (other_percent - self_percent) * np.log(
            other_percent / self_percent
        )
        # Skip zero terms and the infinite term produced when exactly one
        # side of a bin is empty.
        if iter_psi and iter_psi != float("inf"):
            psi_value += iter_psi

    return psi_value

def _update_variance(
self, batch_mean: float, batch_var: float, batch_count: int
) -> float:
Expand Down Expand Up @@ -1059,48 +1203,19 @@ def _update_histogram(self, df_series: pd.Series) -> None:
self._stored_histogram["current_loss"] = histogram_loss
self._stored_histogram["total_loss"] += histogram_loss

def _histogram_for_profile(
self, histogram_method: str
def _regenerate_histogram(
self, bin_counts, bin_edges, suggested_bin_count, options=None
) -> Tuple[Dict[str, np.ndarray], float]:
"""
Convert the stored histogram into the presentable state.
Based on the suggested histogram bin count from numpy.histograms.
The bin count used is stored in 'suggested_bin_count' for each method.
:param histogram_method: method to use for determining the histogram
profile
:type histogram_method: str
:return: histogram bin edges and bin counts
:rtype: dict
"""
bin_counts, bin_edges = (
self._stored_histogram["histogram"]["bin_counts"],
self._stored_histogram["histogram"]["bin_edges"],
)

current_bin_counts, suggested_bin_count = (
self.histogram_methods[histogram_method]["histogram"]["bin_counts"],
self.histogram_methods[histogram_method]["suggested_bin_count"],
)

# base case, no need to change if it is already correct
if not self._has_histogram or current_bin_counts is not None:
return (
self.histogram_methods[histogram_method]["histogram"],
self.histogram_methods[histogram_method]["total_loss"],
)
elif len(bin_counts) == suggested_bin_count:
return (
self._stored_histogram["histogram"],
self._stored_histogram["total_loss"],
)

# create proper binning
new_bin_counts = np.zeros((suggested_bin_count,))
new_bin_edges = np.linspace(
bin_edges[0], bin_edges[-1], suggested_bin_count + 1
)
if options:
new_bin_edges = np.linspace(
options["min_edge"], options["max_edge"], suggested_bin_count + 1
)

# allocate bin_counts
new_bin_id = 0
Expand Down Expand Up @@ -1159,6 +1274,49 @@ def _histogram_for_profile(

return ({"bin_edges": new_bin_edges, "bin_counts": new_bin_counts}, hist_loss)

def _histogram_for_profile(
self, histogram_method: str
) -> Tuple[Dict[str, np.ndarray], float]:
"""
Convert the stored histogram into the presentable state.
Based on the suggested histogram bin count from numpy.histograms.
The bin count used is stored in 'suggested_bin_count' for each method.
:param histogram_method: method to use for determining the histogram
profile
:type histogram_method: str
:return: histogram bin edges and bin counts
:rtype: dict
"""
bin_counts, bin_edges = (
self._stored_histogram["histogram"]["bin_counts"],
self._stored_histogram["histogram"]["bin_edges"],
)

current_bin_counts, suggested_bin_count = (
self.histogram_methods[histogram_method]["histogram"]["bin_counts"],
self.histogram_methods[histogram_method]["suggested_bin_count"],
)

# base case, no need to change if it is already correct
if not self._has_histogram or current_bin_counts is not None:
return (
self.histogram_methods[histogram_method]["histogram"],
self.histogram_methods[histogram_method]["total_loss"],
)
elif len(bin_counts) == suggested_bin_count:
return (
self._stored_histogram["histogram"],
self._stored_histogram["total_loss"],
)

return self._regenerate_histogram(
bin_counts=bin_counts,
bin_edges=bin_edges,
suggested_bin_count=suggested_bin_count,
)

def _get_best_histogram_for_profile(self) -> Dict:
"""
Convert the stored histogram into the presentable state.
Expand Down
8 changes: 5 additions & 3 deletions dataprofiler/tests/profilers/test_column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,10 @@ def test_diff_primitive_compilers(self):
"stddev": 3.285085839971525,
"t-test": {
"t-statistic": 0.4155260166386663,
"conservative": {"df": 1, "p-value": 0.749287157907667},
"conservative": {"df": 1.0, "p-value": 0.749287157907667},
"welch": {"df": 3.6288111187629117, "p-value": 0.7011367179395704},
},
"psi": 0.17328679513998632,
},
}
profile_diff = compiler1.diff(compiler2)
Expand Down Expand Up @@ -247,7 +248,7 @@ def test_disabling_columns_during_primitive_diff(self):
"sum": -20.0,
"mean": -10.0,
"median": -10,
"mode": [[-2, -1, 1, 2], [], [5, 15]],
"mode": [[-2.0, -1.0, 1.0, 2.0], [], [5, 15]],
"median_absolute_deviation": -3.5,
"variance": -46.666666666666664,
"stddev": data1.astype(int).std() - data2.astype(int).std(),
Expand All @@ -262,9 +263,10 @@ def test_disabling_columns_during_primitive_diff(self):
},
"t-test": {
"t-statistic": -1.9674775073518591,
"conservative": {"df": 1, "p-value": 0.29936264581081673},
"conservative": {"df": 1.0, "p-value": 0.29936264581081673},
"welch": {"df": 1.0673824509440946, "p-value": 0.28696889329266506},
},
"psi": 0,
},
}
profile_diff = compiler1.diff(compiler2)
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ def test_diff(self):
"precision": {
"min": 1,
"max": 1,
"mean": 1,
"mean": 1.0,
"var": profile1["precision"]["var"] - profile2["precision"]["var"],
"std": profile1["precision"]["std"] - profile2["precision"]["std"],
"sample_size": -1,
Expand All @@ -1698,9 +1698,10 @@ def test_diff(self):
},
"t-test": {
"t-statistic": 0.5393164101529813,
"conservative": {"df": 2, "p-value": 0.643676756587475},
"conservative": {"df": 2.0, "p-value": 0.643676756587475},
"welch": {"df": 4.999127432888682, "p-value": 0.6128117908944144},
},
"psi": 0,
}
profile_diff = profiler1.diff(profiler2)
try:
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,14 +1053,15 @@ def test_diff(self):
"stddev": -7.899494936611665,
"sum": -4.0,
"variance": -94.0,
"median": -4,
"median": -4.0,
"mode": [[2, 6, 4], [], [1, 15]],
"median_absolute_deviation": -5,
"t-test": {
"t-statistic": -0.5638091828819275,
"conservative": {"df": 1, "p-value": 0.6731699660830497},
"conservative": {"df": 1.0, "p-value": 0.6731699660830497},
"welch": {"df": 1.0547717074524683, "p-value": 0.6691886269547123},
},
"psi": 0.0675775180180274,
}
profile_diff = profiler1.diff(profiler2)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ def test_diff(self):
"conservative": {"df": 9, "p-value": 0.7039643545772609},
"welch": {"df": 25.945257024943864, "p-value": 0.6980401261750298},
},
"psi": None,
}

difference = other1.diff(other2)
Expand Down Expand Up @@ -836,6 +837,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -885,6 +887,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -933,6 +936,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -980,6 +984,7 @@ def test_diff(self):
"conservative": {"df": 9, "p-value": 0.011958658754358975},
"welch": {"df": 25.945257024943864, "p-value": 0.004201616692122823},
},
"psi": None,
}
difference = other1.diff(other2)
self.assertDictEqual(expected_diff, difference)
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,9 +587,10 @@ def test_diff(self):
),
"t-test": {
"t-statistic": -1.9339958714826413,
"conservative": {"df": 8, "p-value": 0.08916903961929257},
"conservative": {"df": 8.0, "p-value": 0.08916903961929257},
"welch": {"df": 15.761400272034564, "p-value": 0.07127621949432528},
},
"psi": 0.7211391539728152,
}

profile_diff = profiler1.diff(profiler2)
Expand Down

0 comments on commit 6a407af

Please sign in to comment.