Skip to content

Commit

Permalink
RowStatisticsOptions: Add implementation
Browse files Browse the repository at this point in the history
  • Loading branch information
drahc1R committed Jun 14, 2023
1 parent 3b68e2e commit 2da6a93
Show file tree
Hide file tree
Showing 2 changed files with 17 additions and 4 deletions.
10 changes: 6 additions & 4 deletions dataprofiler/profilers/profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -2088,11 +2088,13 @@ def _update_row_statistics(

# If sample_ids is provided, increment the counts, since that means only new data was read
if sample_ids is not None:
self.row_has_null_count += len(null_in_row_count)
self.row_is_null_count += len(null_rows)
if self.options.row_statistics.null_count.is_enabled:
self.row_has_null_count += len(null_in_row_count)
self.row_is_null_count += len(null_rows)
else:
self.row_has_null_count = len(null_in_row_count)
self.row_is_null_count = len(null_rows)
if self.options.row_statistics.null_count.is_enabled:
self.row_has_null_count = len(null_in_row_count)
self.row_is_null_count = len(null_rows)

def _get_correlation(
self, clean_samples: dict, batch_properties: dict
Expand Down
11 changes: 11 additions & 0 deletions dataprofiler/tests/profilers/test_profile_builder.py
Original file line number Diff line number Diff line change
Expand Up @@ -233,6 +233,7 @@ def test_add_profilers(self, *mocks):
self.assertEqual(
"<class 'pandas.core.frame.DataFrame'>", merged_profile.file_type
)
self.assertTrue(merged_profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(2, merged_profile.row_has_null_count)
self.assertEqual(2, merged_profile.row_is_null_count)
self.assertEqual(7, merged_profile.total_samples)
Expand Down Expand Up @@ -3602,6 +3603,7 @@ def test_correct_null_row_counts(self):
}
)
profile = dp.StructuredProfiler(data, options=profiler_options)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(2, profile.row_has_null_count)
self.assertEqual(0.25, profile._get_row_has_null_ratio())
self.assertEqual(2, profile.row_is_null_count)
Expand All @@ -3610,6 +3612,7 @@ def test_correct_null_row_counts(self):
file_path = os.path.join(test_root_path, "data", "csv/iris-with-null-rows.csv")
data = pd.read_csv(file_path)
profile = dp.StructuredProfiler(data, options=profiler_options)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(13, profile.row_has_null_count)
self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
self.assertEqual(3, profile.row_is_null_count)
Expand Down Expand Up @@ -3701,6 +3704,7 @@ def test_correct_total_sample_size_and_counts_and_mutability(self):
# rows sampled are [5, 6], [13, 14] (0 index)
self.assertEqual(16, profile.total_samples)
self.assertEqual(4, profile._max_col_samples_used)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(2, profile.row_has_null_count)
self.assertEqual(0.5, profile._get_row_has_null_ratio())
self.assertEqual(2, profile.row_is_null_count)
Expand Down Expand Up @@ -3733,6 +3737,7 @@ def test_null_calculation_with_differently_sampled_cols(self):
# The only nulls in the second column within that subset of rows are at 5, 7
# Therefore only 2 rows have null according to row_has_null_count
self.assertEqual(0, profile.row_is_null_count)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(2, profile.row_has_null_count)
# Accordingly, make sure the ratio of null rows accounts for the fact that
# only 5 total rows were sampled (5 in col 1, 9 in col 2)
Expand Down Expand Up @@ -3773,6 +3778,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
profile = dp.StructuredProfiler(
data1, min_true_samples=2, samples_per_update=2, options=opts
)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(3, profile.row_has_null_count)
self.assertEqual(1, profile.row_is_null_count)
self.assertEqual(0.75, profile._get_row_has_null_ratio())
Expand All @@ -3782,6 +3788,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
self.assertSetEqual({0, 2}, profile._profile[1].null_types_index["nan"])

profile.update_profile(data2, min_true_samples=2, sample_size=2)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(7, profile.row_has_null_count)
self.assertEqual(3, profile.row_is_null_count)
self.assertEqual(0.875, profile._get_row_has_null_ratio())
Expand All @@ -3803,6 +3810,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
}
)
profile = dp.StructuredProfiler(data1, options=opts)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(3, profile.row_has_null_count)
self.assertEqual(1, profile.row_is_null_count)
self.assertEqual(0.75, profile._get_row_has_null_ratio())
Expand All @@ -3812,6 +3820,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
self.assertSetEqual({0, 2}, profile._profile[1].null_types_index["nan"])

profile.update_profile(data2)
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(7, profile.row_has_null_count)
self.assertEqual(3, profile.row_is_null_count)
self.assertEqual(0.875, profile._get_row_has_null_ratio())
Expand All @@ -3826,6 +3835,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):

# Test that update with empty data doesn't change stats
profile.update_profile(pd.DataFrame([]))
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(7, profile.row_has_null_count)
self.assertEqual(3, profile.row_is_null_count)
self.assertEqual(0.875, profile._get_row_has_null_ratio())
Expand All @@ -3840,6 +3850,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):

# Test one row update
profile.update_profile(pd.DataFrame([[1, None]]))
self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
self.assertEqual(8, profile.row_has_null_count)
self.assertEqual(3, profile.row_is_null_count)
self.assertEqual(8 / 9, profile._get_row_has_null_ratio())
Expand Down

0 comments on commit 2da6a93

Please sign in to comment.