Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adding PSI to diff report #688

Merged
merged 62 commits into from
Nov 2, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
62 commits
Select commit Hold shift + click to select a range
f6be48b
placeholder
taylorfturner Oct 13, 2022
336a8a2
adding placeholders
taylorfturner Oct 13, 2022
7584838
DRY-er code
taylorfturner Oct 13, 2022
3394a0f
logic placeholder
taylorfturner Oct 13, 2022
4b4f165
reformat
taylorfturner Oct 13, 2022
0358ecc
end of day catch up
taylorfturner Oct 13, 2022
eae37b5
catch up
taylorfturner Oct 14, 2022
2a19fc1
Merge branch 'main' into feature/psi
taylorfturner Oct 17, 2022
3ce26cf
Merge branch 'main' into feature/psi
taylorfturner Oct 19, 2022
c311ebc
catch up
taylorfturner Oct 20, 2022
b92793b
conditionally regen
taylorfturner Oct 20, 2022
4a3f34d
conditionally regen
taylorfturner Oct 20, 2022
f418d5f
black reformatting
taylorfturner Oct 20, 2022
66c4122
black reformatting
taylorfturner Oct 20, 2022
6a534aa
commit
taylorfturner Oct 21, 2022
943def3
commit
taylorfturner Oct 21, 2022
fd2e425
commit
taylorfturner Oct 21, 2022
b439472
commit
taylorfturner Oct 21, 2022
be4ed9a
typo
taylorfturner Oct 21, 2022
5a7397b
typo in diff()
taylorfturner Oct 21, 2022
ba325cf
typo in diff()
taylorfturner Oct 21, 2022
c004a6f
fix logic
taylorfturner Oct 21, 2022
e1b62e4
fix logi in
taylorfturner Oct 21, 2022
bfa0042
Merge branch 'main' into feature/psi
taylorfturner Oct 21, 2022
007be39
formatting
taylorfturner Oct 21, 2022
9a7fc67
catch up
taylorfturner Oct 24, 2022
db590af
catch up
taylorfturner Oct 24, 2022
a989132
Merge branch 'main' into feature/psi
taylorfturner Oct 27, 2022
42f0cd0
Merge branch 'main' into feature/psi
taylorfturner Oct 27, 2022
ec219f2
cond None
taylorfturner Oct 28, 2022
90bd46f
conditionality
taylorfturner Oct 28, 2022
254a4ef
outside try/except
taylorfturner Oct 28, 2022
0579092
formatting
taylorfturner Oct 28, 2022
9fdb9b4
clean up
taylorfturner Oct 28, 2022
aab0ea1
take out config condition on diff psi
taylorfturner Oct 28, 2022
b75f140
test runs
taylorfturner Oct 28, 2022
a2353fb
revert
taylorfturner Oct 28, 2022
b107534
clean up
taylorfturner Oct 28, 2022
b5d14f8
remove unneeded changes in expected values
taylorfturner Oct 31, 2022
6ec3dc4
only do 10 bins for PSI
taylorfturner Oct 31, 2022
a74b357
suggested to 9 so ultimately truly 10 bins
taylorfturner Oct 31, 2022
abaf3fc
fix values expected
taylorfturner Oct 31, 2022
8218bbd
fix values expected
taylorfturner Oct 31, 2022
df1e81a
pre-commit
taylorfturner Oct 31, 2022
02f1271
comment resolution
taylorfturner Nov 1, 2022
6fd8865
adding options for num_psi_bins
taylorfturner Nov 1, 2022
10db92f
clean up conditional logic
taylorfturner Nov 1, 2022
ef6cd36
clean up white space
taylorfturner Nov 1, 2022
7da765f
clean up logic in second case
taylorfturner Nov 1, 2022
4a1514e
remove lingering maxDiff = None
taylorfturner Nov 1, 2022
040ce6d
format
taylorfturner Nov 1, 2022
1a2db04
docstring and .match_count
taylorfturner Nov 1, 2022
e1d367e
psi clean up bin edges and max/min
taylorfturner Nov 1, 2022
251dff7
psi values based on manual calc
taylorfturner Nov 1, 2022
53e7355
psi values based on manual calc
taylorfturner Nov 1, 2022
54ca76f
clean up logic on regen_hist
taylorfturner Nov 1, 2022
2730d0b
clean up logic on regen_hist
taylorfturner Nov 1, 2022
a7b0d46
clean up variable
taylorfturner Nov 1, 2022
33b1710
clean up ifs
taylorfturner Nov 1, 2022
19cdd35
classmethod
taylorfturner Nov 2, 2022
b3672aa
classmethod
taylorfturner Nov 2, 2022
e2fba6d
fix debugging remnant
taylorfturner Nov 2, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
228 changes: 193 additions & 35 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -391,6 +391,12 @@ def diff(self, other_profile: NumericStatsMixin, options: Dict = None) -> Dict:
other_profile.variance,
other_profile.match_count,
),
"psi": self._calculate_psi(
self.match_count,
self._stored_histogram["histogram"],
other_profile.match_count,
other_profile._stored_histogram["histogram"],
),
}
return differences

Expand Down Expand Up @@ -516,6 +522,144 @@ def _perform_t_test(
results["welch"]["p-value"] = float(welch_p_val)
return results

def _preprocess_for_calculate_psi(
    self,
    self_histogram,
    other_histogram,
    num_psi_bins: int = 10,
):
    """
    Put both histograms on a shared set of bin edges for the PSI calculation.

    PSI compares the two populations bin by bin, so both histograms must
    end up with the same edges. The shared layout is ``num_psi_bins``
    bins spanning the smallest first edge through the largest last edge
    of the two inputs. A histogram that is already in that layout is passed
    through untouched; otherwise it is re-binned with
    ``self._regenerate_histogram``.

    :param self_histogram: histogram of this profile, a dict with the keys
        "bin_counts" and "bin_edges"
    :type self_histogram: dict
    :param other_histogram: histogram of the profile being compared, a dict
        with the keys "bin_counts" and "bin_edges"
    :type other_histogram: dict
    :param num_psi_bins: number of bins to use for the PSI calculation
    :type num_psi_bins: int
    :return: the (self, other) histograms; every value is None when either
        input histogram is missing or empty
    :rtype: Tuple[dict, dict]
    """
    new_self_histogram = {"bin_counts": None, "bin_edges": None}
    new_other_histogram = {"bin_counts": None, "bin_edges": None}

    histograms_available = all(
        isinstance(histogram[key], np.ndarray)
        for histogram in (self_histogram, other_histogram)
        for key in ("bin_counts", "bin_edges")
    )
    # Without edges there is no range to bin over; the caller treats the
    # None placeholders as "PSI unavailable".
    if (
        not histograms_available
        or len(self_histogram["bin_edges"]) == 0
        or len(other_histogram["bin_edges"]) == 0
    ):
        return new_self_histogram, new_other_histogram

    min_min_edge = min(
        self_histogram["bin_edges"][0],
        other_histogram["bin_edges"][0],
    )
    max_max_edge = max(
        self_histogram["bin_edges"][-1],
        other_histogram["bin_edges"][-1],
    )

    def _rebin(histogram):
        # Re-bin onto `num_psi_bins` bins over the shared range. The
        # histogram loss returned alongside is not needed for PSI.
        regenerated, _ = self._regenerate_histogram(
            bin_counts=histogram["bin_counts"],
            bin_edges=histogram["bin_edges"],
            suggested_bin_count=num_psi_bins,
            options={
                "min_edge": min_min_edge,
                "max_edge": max_max_edge,
            },
        )
        return {
            "bin_counts": regenerated["bin_counts"],
            "bin_edges": regenerated["bin_edges"],
        }

    # `self` can be reused only if it already has the requested number of
    # bins AND covers the shared range; otherwise `other` would be
    # re-binned onto edges that `self` does not have.
    self_is_aligned = (
        len(self_histogram["bin_counts"]) == num_psi_bins
        and self_histogram["bin_edges"][0] == min_min_edge
        and self_histogram["bin_edges"][-1] == max_max_edge
    )
    if self_is_aligned:
        new_self_histogram["bin_counts"] = self_histogram["bin_counts"]
        new_self_histogram["bin_edges"] = self_histogram["bin_edges"]
    else:
        new_self_histogram = _rebin(self_histogram)

    # Compare against the edges `self` ended up with, not its original
    # ones. `np.array_equal` also copes with edge arrays of different
    # length, where `(a == b).all()` is not a valid element-wise test.
    if np.array_equal(
        other_histogram["bin_edges"], new_self_histogram["bin_edges"]
    ):
        new_other_histogram["bin_counts"] = other_histogram["bin_counts"]
        new_other_histogram["bin_edges"] = other_histogram["bin_edges"]
    else:
        new_other_histogram = _rebin(other_histogram)

    return new_self_histogram, new_other_histogram

def _calculate_psi(
    self,
    self_match_count: int,
    self_histogram: Dict,
    other_match_count: int,
    other_histogram: Dict,
) -> Optional[float]:
    """
    Calculate PSI (Population Stability Index).

    ```
    PSI = SUM((other_pcnt - self_pcnt) * ln(other_pcnt / self_pcnt))
    ```

    PSI Breakpoint Thresholds:
        - PSI < 0.1: no significant population change
        - 0.1 <= PSI < 0.2: moderate population change
        - PSI >= 0.2: significant population change

    :param self_match_count: self.match_count
    :type self_match_count: int
    :param self_histogram: self._stored_histogram["histogram"], a dict with
        the keys "bin_counts" and "bin_edges"
    :type self_histogram: dict
    :param other_match_count: other_profile.match_count
    :type other_match_count: int
    :param other_histogram: other_profile._stored_histogram["histogram"],
        a dict with the keys "bin_counts" and "bin_edges"
    :type other_histogram: dict
    :return: psi_value, or None (with a RuntimeWarning) when either
        histogram has no bin edges
    :rtype: optional[float]
    """
    psi_value = 0

    new_self_histogram, new_other_histogram = self._preprocess_for_calculate_psi(
        self_histogram=self_histogram,
        other_histogram=other_histogram,
    )

    if (
        new_other_histogram["bin_edges"] is None
        or new_self_histogram["bin_edges"] is None
    ):
        warnings.warn(
            "No edges available in at least one histogram for calculating `PSI`",
            RuntimeWarning,
        )
        return None

    other_bin_counts = new_other_histogram["bin_counts"]
    for bin_index, self_bin_count in enumerate(new_self_histogram["bin_counts"]):
        self_percent = self_bin_count / self_match_count
        other_percent = other_bin_counts[bin_index] / other_match_count

        # A bin that is empty on either side has an undefined (infinite)
        # log-ratio term. It contributes nothing to the sum, so skip it
        # up front rather than dividing by zero and filtering afterwards.
        if self_percent == 0 or other_percent == 0:
            continue

        iter_psi = (other_percent - self_percent) * np.log(
            other_percent / self_percent
        )
        # Only finite, non-zero terms are accumulated (guards inf and NaN).
        if iter_psi and np.isfinite(iter_psi):
            psi_value += iter_psi

    return psi_value

def _update_variance(
self, batch_mean: float, batch_var: float, batch_count: int
) -> float:
Expand Down Expand Up @@ -1059,48 +1203,19 @@ def _update_histogram(self, df_series: pd.Series) -> None:
self._stored_histogram["current_loss"] = histogram_loss
self._stored_histogram["total_loss"] += histogram_loss

def _histogram_for_profile(
self, histogram_method: str
def _regenerate_histogram(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

broke out _histogram_for_profile() function into two functions in the code so that we could call _regenerate_histogram on either self or other_profile.

The idea here is that, to calculate PSI, the edges in the histogram need to be the same between self and other_profile. To accomplish this, one or both obj._stored_histograms may need to be updated prior to calling _calculate_psi.

This is the purpose of the refactor and breaking out into _regenerate_histogram.

self, bin_counts, bin_edges, suggested_bin_count, options=None
) -> Tuple[Dict[str, np.ndarray], float]:
"""
Convert the stored histogram into the presentable state.
Based on the suggested histogram bin count from numpy.histograms.
The bin count used is stored in 'suggested_bin_count' for each method.
:param histogram_method: method to use for determining the histogram
profile
:type histogram_method: str
:return: histogram bin edges and bin counts
:rtype: dict
"""
bin_counts, bin_edges = (
self._stored_histogram["histogram"]["bin_counts"],
self._stored_histogram["histogram"]["bin_edges"],
)

current_bin_counts, suggested_bin_count = (
self.histogram_methods[histogram_method]["histogram"]["bin_counts"],
self.histogram_methods[histogram_method]["suggested_bin_count"],
)

# base case, no need to change if it is already correct
if not self._has_histogram or current_bin_counts is not None:
return (
self.histogram_methods[histogram_method]["histogram"],
self.histogram_methods[histogram_method]["total_loss"],
)
elif len(bin_counts) == suggested_bin_count:
return (
self._stored_histogram["histogram"],
self._stored_histogram["total_loss"],
)

# create proper binning
new_bin_counts = np.zeros((suggested_bin_count,))
new_bin_edges = np.linspace(
bin_edges[0], bin_edges[-1], suggested_bin_count + 1
)
if options:
new_bin_edges = np.linspace(
options["min_edge"], options["max_edge"], suggested_bin_count + 1
)

# allocate bin_counts
new_bin_id = 0
Expand Down Expand Up @@ -1159,6 +1274,49 @@ def _histogram_for_profile(

return ({"bin_edges": new_bin_edges, "bin_counts": new_bin_counts}, hist_loss)

def _histogram_for_profile(
    self, histogram_method: str
) -> Tuple[Dict[str, np.ndarray], float]:
    """
    Return the histogram to present for the given histogram method.

    Resolution order:
        1. the method's own histogram, when no histogram has been built
           yet or the method already holds bin counts;
        2. the stored histogram as-is, when its bin count already equals
           the method's 'suggested_bin_count';
        3. otherwise the stored histogram re-binned to the suggested
           bin count via ``_regenerate_histogram``.

    :param histogram_method: method to use for determining the histogram
        profile
    :type histogram_method: str
    :return: histogram (bin edges and bin counts) and its loss
    :rtype: Tuple[dict, float]
    """
    stored_counts = self._stored_histogram["histogram"]["bin_counts"]
    stored_edges = self._stored_histogram["histogram"]["bin_edges"]

    method_counts = self.histogram_methods[histogram_method]["histogram"][
        "bin_counts"
    ]
    target_bin_count = self.histogram_methods[histogram_method][
        "suggested_bin_count"
    ]

    # Nothing to rebuild: either there is no data yet or this method
    # already carries a finished histogram.
    if not self._has_histogram or method_counts is not None:
        method_entry = self.histogram_methods[histogram_method]
        return method_entry["histogram"], method_entry["total_loss"]

    # The stored histogram already has the requested resolution.
    if len(stored_counts) == target_bin_count:
        return (
            self._stored_histogram["histogram"],
            self._stored_histogram["total_loss"],
        )

    return self._regenerate_histogram(
        bin_counts=stored_counts,
        bin_edges=stored_edges,
        suggested_bin_count=target_bin_count,
    )
Comment on lines +1314 to +1318
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Calling _regenerate_histogram here now. The old code in the function is now in def _regenerate_histogram().


def _get_best_histogram_for_profile(self) -> Dict:
"""
Convert the stored histogram into the presentable state.
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -171,9 +171,10 @@ def test_diff_primitive_compilers(self):
"stddev": 3.285085839971525,
"t-test": {
"t-statistic": 0.4155260166386663,
"conservative": {"df": 1, "p-value": 0.749287157907667},
"conservative": {"df": 1.0, "p-value": 0.749287157907667},
JGSweets marked this conversation as resolved.
Show resolved Hide resolved
"welch": {"df": 3.6288111187629117, "p-value": 0.7011367179395704},
},
"psi": 0.17328679513998632,
},
}
profile_diff = compiler1.diff(compiler2)
Expand Down Expand Up @@ -247,7 +248,7 @@ def test_disabling_columns_during_primitive_diff(self):
"sum": -20.0,
"mean": -10.0,
"median": -10,
"mode": [[-2, -1, 1, 2], [], [5, 15]],
"mode": [[-2.0, -1.0, 1.0, 2.0], [], [5, 15]],
taylorfturner marked this conversation as resolved.
Show resolved Hide resolved
"median_absolute_deviation": -3.5,
"variance": -46.666666666666664,
"stddev": data1.astype(int).std() - data2.astype(int).std(),
Expand All @@ -262,9 +263,10 @@ def test_disabling_columns_during_primitive_diff(self):
},
"t-test": {
"t-statistic": -1.9674775073518591,
"conservative": {"df": 1, "p-value": 0.29936264581081673},
"conservative": {"df": 1.0, "p-value": 0.29936264581081673},
JGSweets marked this conversation as resolved.
Show resolved Hide resolved
"welch": {"df": 1.0673824509440946, "p-value": 0.28696889329266506},
},
"psi": 0,
},
}
profile_diff = compiler1.diff(compiler2)
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/profilers/test_float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1689,7 +1689,7 @@ def test_diff(self):
"precision": {
"min": 1,
"max": 1,
"mean": 1,
"mean": 1.0,
"var": profile1["precision"]["var"] - profile2["precision"]["var"],
"std": profile1["precision"]["std"] - profile2["precision"]["std"],
"sample_size": -1,
Expand All @@ -1698,9 +1698,10 @@ def test_diff(self):
},
"t-test": {
"t-statistic": 0.5393164101529813,
"conservative": {"df": 2, "p-value": 0.643676756587475},
"conservative": {"df": 2.0, "p-value": 0.643676756587475},
JGSweets marked this conversation as resolved.
Show resolved Hide resolved
"welch": {"df": 4.999127432888682, "p-value": 0.6128117908944144},
},
"psi": 0,
}
profile_diff = profiler1.diff(profiler2)
try:
Expand Down
5 changes: 3 additions & 2 deletions dataprofiler/tests/profilers/test_int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -1053,14 +1053,15 @@ def test_diff(self):
"stddev": -7.899494936611665,
"sum": -4.0,
"variance": -94.0,
"median": -4,
"median": -4.0,
"mode": [[2, 6, 4], [], [1, 15]],
"median_absolute_deviation": -5,
"t-test": {
"t-statistic": -0.5638091828819275,
"conservative": {"df": 1, "p-value": 0.6731699660830497},
"conservative": {"df": 1.0, "p-value": 0.6731699660830497},
JGSweets marked this conversation as resolved.
Show resolved Hide resolved
"welch": {"df": 1.0547717074524683, "p-value": 0.6691886269547123},
},
"psi": 0.0675775180180274,
}
profile_diff = profiler1.diff(profiler2)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -796,6 +796,7 @@ def test_diff(self):
"conservative": {"df": 9, "p-value": 0.7039643545772609},
"welch": {"df": 25.945257024943864, "p-value": 0.6980401261750298},
},
"psi": None,
}

difference = other1.diff(other2)
Expand Down Expand Up @@ -836,6 +837,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -885,6 +887,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -933,6 +936,7 @@ def test_diff(self):
"conservative": {"df": None, "p-value": None},
"welch": {"df": None, "p-value": None},
},
"psi": None,
}
expected_var = expected_diff.pop("variance")
expected_stddev = expected_diff.pop("stddev")
Expand Down Expand Up @@ -980,6 +984,7 @@ def test_diff(self):
"conservative": {"df": 9, "p-value": 0.011958658754358975},
"welch": {"df": 25.945257024943864, "p-value": 0.004201616692122823},
},
"psi": None,
}
difference = other1.diff(other2)
self.assertDictEqual(expected_diff, difference)
Expand Down
3 changes: 2 additions & 1 deletion dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -587,9 +587,10 @@ def test_diff(self):
),
"t-test": {
"t-statistic": -1.9339958714826413,
"conservative": {"df": 8, "p-value": 0.08916903961929257},
"conservative": {"df": 8.0, "p-value": 0.08916903961929257},
"welch": {"df": 15.761400272034564, "p-value": 0.07127621949432528},
},
"psi": 0.7211391539728152,
}

profile_diff = profiler1.diff(profiler2)
Expand Down