RowStatisticsOptions: Add implementation

capitalone · Jun 14, 2023 · 2da6a93 · 2da6a93
1 parent 3b68e2e
commit 2da6a93
Show file tree

Hide file tree

Showing 2 changed files with 17 additions and 4 deletions.
diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
@@ -2088,11 +2088,13 @@ def _update_row_statistics(
 
         # If sample_ids provided, increment since that means only new data read
         if sample_ids is not None:
-            self.row_has_null_count += len(null_in_row_count)
-            self.row_is_null_count += len(null_rows)
+            if self.options.row_statistics.null_count.is_enabled:
+                self.row_has_null_count += len(null_in_row_count)
+                self.row_is_null_count += len(null_rows)
         else:
-            self.row_has_null_count = len(null_in_row_count)
-            self.row_is_null_count = len(null_rows)
+            if self.options.row_statistics.null_count.is_enabled:
+                self.row_has_null_count = len(null_in_row_count)
+                self.row_is_null_count = len(null_rows)
 
     def _get_correlation(
         self, clean_samples: dict, batch_properties: dict

diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py
@@ -233,6 +233,7 @@ def test_add_profilers(self, *mocks):
         self.assertEqual(
             "<class 'pandas.core.frame.DataFrame'>", merged_profile.file_type
         )
+        self.assertTrue(merged_profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(2, merged_profile.row_has_null_count)
         self.assertEqual(2, merged_profile.row_is_null_count)
         self.assertEqual(7, merged_profile.total_samples)
@@ -3602,6 +3603,7 @@ def test_correct_null_row_counts(self):
             }
         )
         profile = dp.StructuredProfiler(data, options=profiler_options)
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(2, profile.row_has_null_count)
         self.assertEqual(0.25, profile._get_row_has_null_ratio())
         self.assertEqual(2, profile.row_is_null_count)
@@ -3610,6 +3612,7 @@ def test_correct_null_row_counts(self):
         file_path = os.path.join(test_root_path, "data", "csv/iris-with-null-rows.csv")
         data = pd.read_csv(file_path)
         profile = dp.StructuredProfiler(data, options=profiler_options)
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(13, profile.row_has_null_count)
         self.assertEqual(13 / 24, profile._get_row_has_null_ratio())
         self.assertEqual(3, profile.row_is_null_count)
@@ -3701,6 +3704,7 @@ def test_correct_total_sample_size_and_counts_and_mutability(self):
             # rows sampled are [5, 6], [13, 14] (0 index)
             self.assertEqual(16, profile.total_samples)
             self.assertEqual(4, profile._max_col_samples_used)
+            self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
             self.assertEqual(2, profile.row_has_null_count)
             self.assertEqual(0.5, profile._get_row_has_null_ratio())
             self.assertEqual(2, profile.row_is_null_count)
@@ -3733,6 +3737,7 @@ def test_null_calculation_with_differently_sampled_cols(self):
         # The only null in those rows in second column in that subset are 5, 7
         # Therefore only 2 rows have null according to row_has_null_count
         self.assertEqual(0, profile.row_is_null_count)
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(2, profile.row_has_null_count)
         # Accordingly, make sure ratio of null rows accounts for the fact that
         # Only 5 total rows were sampled (5 in col 1, 9 in col 2)
@@ -3773,6 +3778,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         profile = dp.StructuredProfiler(
             data1, min_true_samples=2, samples_per_update=2, options=opts
         )
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(3, profile.row_has_null_count)
         self.assertEqual(1, profile.row_is_null_count)
         self.assertEqual(0.75, profile._get_row_has_null_ratio())
@@ -3782,6 +3788,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertSetEqual({0, 2}, profile._profile[1].null_types_index["nan"])
 
         profile.update_profile(data2, min_true_samples=2, sample_size=2)
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(7, profile.row_has_null_count)
         self.assertEqual(3, profile.row_is_null_count)
         self.assertEqual(0.875, profile._get_row_has_null_ratio())
@@ -3803,6 +3810,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
             }
         )
         profile = dp.StructuredProfiler(data1, options=opts)
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(3, profile.row_has_null_count)
         self.assertEqual(1, profile.row_is_null_count)
         self.assertEqual(0.75, profile._get_row_has_null_ratio())
@@ -3812,6 +3820,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
         self.assertSetEqual({0, 2}, profile._profile[1].null_types_index["nan"])
 
         profile.update_profile(data2)
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(7, profile.row_has_null_count)
         self.assertEqual(3, profile.row_is_null_count)
         self.assertEqual(0.875, profile._get_row_has_null_ratio())
@@ -3826,6 +3835,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
 
         # Test that update with empty data doesn't change stats
         profile.update_profile(pd.DataFrame([]))
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(7, profile.row_has_null_count)
         self.assertEqual(3, profile.row_is_null_count)
         self.assertEqual(0.875, profile._get_row_has_null_ratio())
@@ -3840,6 +3850,7 @@ def test_null_row_stats_correct_after_updates(self, *mocks):
 
         # Test one row update
         profile.update_profile(pd.DataFrame([[1, None]]))
+        self.assertTrue(profile.options.row_statistics.null_count.is_enabled)
         self.assertEqual(8, profile.row_has_null_count)
         self.assertEqual(3, profile.row_is_null_count)
         self.assertEqual(8 / 9, profile._get_row_has_null_ratio())