diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index 113d19ef..6e512658 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -94,6 +94,7 @@ def __init__( self.sample_size: int = 0 self.sample: list[str] = list() self.null_count: int = 0 + self.null_ratio: float | None = None self.null_types: list[str] = list() self.null_types_index: dict = {} self._min_id: int | None = None @@ -292,6 +293,9 @@ def diff(self, other_profile: StructuredColProfiler, options: dict = None) -> di "null_count": profiler_utils.find_diff_of_numbers( self.null_count, other_profile.null_count ), + "null_ratio": profiler_utils.find_diff_of_numbers( + self.null_ratio, other_profile.null_ratio + ), "null_types": profiler_utils.find_diff_of_lists_and_sets( self.null_types, other_profile.null_types ), @@ -428,6 +432,11 @@ def _update_base_stats(self, base_stats: dict) -> None: self._last_batch_size = base_stats["sample_size"] self.sample = base_stats["sample"] self.null_count += base_stats["null_count"] + self.null_ratio = ( + base_stats["null_count"] / base_stats["sample_size"] + if base_stats["sample_size"] + else None + ) self.null_types = profiler_utils._combine_unique_sets( self.null_types, list(base_stats["null_types"].keys()) ) @@ -570,6 +579,7 @@ def clean_data_and_get_base_stats( { "sample_size": 0, "null_count": 0, + "null_ratio": None, "null_types": dict(), "sample": [], "min_id": None, @@ -658,6 +668,7 @@ def clean_data_and_get_base_stats( base_stats = { "sample_size": total_sample_size, "null_count": total_na, + "null_ratio": total_na / total_sample_size, "null_types": na_columns, "sample": rng.choice( list(df_series.values), (min(len(df_series), 5),), replace=False diff --git a/dataprofiler/tests/profilers/test_profile_builder.py b/dataprofiler/tests/profilers/test_profile_builder.py index 9507fe20..3ebcd504 100644 --- a/dataprofiler/tests/profilers/test_profile_builder.py +++ b/dataprofiler/tests/profilers/test_profile_builder.py 
@@ -2043,6 +2043,7 @@ def test_diff(self, *mocks): "label_representation": {"a": "unchanged"}, "sample_size": -2, "null_count": -1, + "null_ratio": -0.25, "null_types": [[], [], ["nan"]], "null_types_index": [{}, {}, {"nan": {2}}], "data_type_representation": {"all_data_types": "unchanged"}, @@ -2061,6 +2062,7 @@ def test_diff(self, *mocks): "label_representation": {"a": "unchanged"}, "sample_size": -2, "null_count": -1, + "null_ratio": -0.25, "null_types": [[], [], ["nan"]], "null_types_index": [{}, {}, {"nan": {2}}], "data_type_representation": {"all_data_types": "unchanged"}, @@ -2780,6 +2782,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): "sample": ["6.0", "3.0", "4.0"], "sample_size": 5, "null_count": 2, + "null_ratio": 2 / 5, "null_types": dict(nan=["e", "b"]), "min_id": None, "max_id": None, @@ -2797,6 +2800,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): "sample": ["6.0", "nan", "nan", "4.0"], "sample_size": 6, "null_count": 2, + "null_ratio": 2 / 6, "null_types": {"1.0": ["a"], "3.0": ["c"]}, "min_id": None, "max_id": None, @@ -2814,6 +2818,7 @@ def test_clean_data_and_get_base_stats(self, *mocks): "sample": ["3.0", "4.0", "nan", "6.0", "nan"], "sample_size": 6, "null_count": 0, + "null_ratio": 0 / 6, "null_types": {}, "min_id": None, "max_id": None, @@ -3091,6 +3096,7 @@ def test_diff(self, *mocks): "label_representation": {"a": "unchanged"}, "sample_size": 3, "null_count": 2, + "null_ratio": 2 / 7, "null_types": [["nan"], [], []], "null_types_index": [{"nan": {1, 5}}, {}, {}], "data_type_representation": {"all_data_types": "unchanged"}, @@ -3119,6 +3125,7 @@ def test_json_encode(self, mocked_datalabeler, *mocks): "sample_size": 0, "sample": [], "null_count": 0, + "null_ratio": None, "null_types": [], "null_types_index": {}, "_min_id": None, @@ -3170,6 +3177,7 @@ def test_json_encode_after_update(self, mock_DataLabeler, *mocks): "sample_size": 4, "sample": ["2", "-2", "1"], "null_count": 1, + "null_ratio": 1 / 4, "null_types": 
["Nan"], "null_types_index": { "Nan": [