Skip to content

Commit

Permalink
Fix bug with null replication metrics (#702)
Browse files (browse the repository at this point in the history)
* Fix bug

* Change 0 to np.nan

* Add tests

* Change test

* Simplify tests

* Change mean_not_null to nan

* Fix

* Use nan for sum and mean with no values

* Reorder code
  • Loading branch information
tonywu315 authored Nov 3, 2022
1 parent 948816f commit d4b5860
Show file tree
Hide file tree
Showing 2 changed files with 63 additions and 5 deletions.
35 changes: 30 additions & 5 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2244,7 +2244,12 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None:
]._profiles[get_data_type(profile)]

total_row_sum = np.asarray(
- [get_data_type_profiler(profile).sum for profile in self._profile]
+ [
+ get_data_type_profiler(profile).sum
+ if get_data_type(profile)
+ else np.nan
+ for profile in self._profile
+ ]
)

if not isinstance(self._null_replication_metrics, dict):
Expand Down Expand Up @@ -2290,7 +2295,12 @@ def _update_null_replication_metrics(self, clean_samples: Dict) -> None:
sum_not_null = np.delete(total_row_sum, col_id) - sum_null

mean_null = sum_null / null_count
- mean_not_null = sum_not_null / true_count

+ mean_not_null = np.full(len(self._profile) - 1, np.nan)
+ if not true_count:
+ sum_not_null = np.full(len(self._profile) - 1, np.nan)
+ else:
+ mean_not_null = sum_not_null / true_count

# Convert numpy arrays to lists (serializable)
sum_null = sum_null.tolist()
Expand Down Expand Up @@ -2324,10 +2334,20 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> Dict:
]._profiles[get_data_type(profile)]

self_row_sum = np.asarray(
- [get_data_type_profiler(profile).sum for profile in self._profile]
+ [
+ get_data_type_profiler(profile).sum
+ if get_data_type(profile)
+ else np.nan
+ for profile in self._profile
+ ]
)
other_row_sum = np.asarray(
- [get_data_type_profiler(profile).sum for profile in other._profile]
+ [
+ get_data_type_profiler(profile).sum
+ if get_data_type(profile)
+ else np.nan
+ for profile in other._profile
+ ]
)
total_row_sum: np.ndarray = self_row_sum + other_row_sum
merged_properties: Dict = defaultdict(dict)
Expand Down Expand Up @@ -2373,7 +2393,12 @@ def _merge_null_replication_metrics(self, other: StructuredProfiler) -> Dict:
sum_not_null = np.delete(total_row_sum, col_id) - sum_null

mean_null = sum_null / null_count
- mean_not_null = sum_not_null / true_count

+ mean_not_null = np.full(len(self._profile) - 1, np.nan)
+ if not true_count:
+ sum_not_null = np.full(len(self._profile) - 1, np.nan)
+ else:
+ mean_not_null = sum_not_null / true_count

# Convert numpy arrays to lists (serializable)
sum_null = sum_null.tolist()
Expand Down
33 changes: 33 additions & 0 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2048,6 +2048,39 @@ def test_null_replication_metrics_calculation(self):
np.testing.assert_array_almost_equal([17 / 8, 48 / 8], column["class_mean"][0])
np.testing.assert_array_almost_equal([12 / 2, 6 / 2], column["class_mean"][1])

# Test with all null in a column
data_3 = pd.DataFrame([[9999999, 9], [9999999, 9]])

NO_FLAG = 0
profile_options = dp.ProfilerOptions()
profile_options.set(
{
"*.null_values": {
"": NO_FLAG,
"nan": re.IGNORECASE,
"none": re.IGNORECASE,
"null": re.IGNORECASE,
" *": NO_FLAG,
"--*": NO_FLAG,
"__*": NO_FLAG,
"9" * 7: NO_FLAG,
},
"*.null_replication_metrics.is_enabled": True,
"data_labeler.is_enabled": False,
"multiprocess.is_enabled": False,
}
)

profiler = dp.StructuredProfiler(data_3, options=profile_options)
report = profiler.report()

self.assertTrue("null_replication_metrics" in report["data_stats"][0])
column = report["data_stats"][0]["null_replication_metrics"]

np.testing.assert_array_almost_equal([0, 1], column["class_prior"])
np.testing.assert_array_almost_equal([[np.nan], [18]], column["class_sum"])
np.testing.assert_array_almost_equal([[np.nan], [9]], column["class_mean"])


class TestStructuredColProfilerClass(unittest.TestCase):
def setUp(self):
Expand Down

0 comments on commit d4b5860

Please sign in to comment.