Skip to content

Commit

Permalink
Encode categorical column (#770)
Browse files Browse the repository at this point in the history
* Test ProfileEncoder on CategoricalColumn

* Fix variable name typo

* Compare serialized strings without calling json.loads

* Remove sort keys
  • Loading branch information
kshitijavis authored Mar 20, 2023
1 parent a9f6ee8 commit 3db5836
Showing 1 changed file with 72 additions and 12 deletions.
84 changes: 72 additions & 12 deletions dataprofiler/tests/profilers/test_json_encoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,10 @@
from unittest.mock import patch

import numpy as np
import pandas as pd

from dataprofiler.profilers.base_column_profilers import BaseColumnProfiler
from dataprofiler.profilers.categorical_column_profile import CategoricalColumn
from dataprofiler.profilers.json_encoder import ProfileEncoder


Expand All @@ -15,17 +17,75 @@ def test_encode_base_column_profiler(self):
profile = BaseColumnProfiler(name="0")

serialized = json.dumps(profile, cls=ProfileEncoder)
exepcted = json.loads(
json.dumps(
{
"name": "0",
"col_index": np.nan,
"sample_size": 0,
"metadata": dict(),
"times": defaultdict(),
"thread_safe": True,
}
)
expected = json.dumps(
{
"name": "0",
"col_index": np.nan,
"sample_size": 0,
"metadata": dict(),
"times": defaultdict(),
"thread_safe": True,
}
)

self.assertEqual(json.loads(serialized), exepcted)
self.assertEqual(serialized, expected)

def test_encode_categorical_column_profiler(self):
    """Check the JSON encoding of a freshly constructed CategoricalColumn.

    The profile is serialized with ProfileEncoder and compared, as a raw
    string, against ``json.dumps`` of the expected attribute mapping, so
    the order of the keys is part of what is asserted.
    """
    column_profile = CategoricalColumn("0")

    expected_attributes = {
        "name": "0",
        "col_index": np.nan,
        "sample_size": 0,
        "metadata": {},
        "times": defaultdict(),
        "thread_safe": True,
        "_categories": defaultdict(int),
        "_CategoricalColumn__calculations": {},
        "_top_k_categories": None,
    }

    actual_json = json.dumps(column_profile, cls=ProfileEncoder)
    expected_json = json.dumps(expected_attributes)

    self.assertEqual(actual_json, expected_json)

def test_encode_categorical_column_profiler_after_update(self):
    """Check the JSON encoding of a CategoricalColumn after one update.

    The profile is updated with twelve values (three "a", four "b",
    five "c") and then serialized with ProfileEncoder. The result is
    compared as a raw string against ``json.dumps`` of the expected
    attribute mapping, so key order is part of what is asserted.
    """
    category_values = pd.Series(["a"] * 3 + ["b"] * 4 + ["c"] * 5)
    column_profile = CategoricalColumn(category_values.name)

    # time.time is patched to always return 0.0 so that the timing
    # entry in the expected output below is a fixed value.
    with patch("time.time", side_effect=lambda: 0.0):
        column_profile.update(category_values)

    expected_attributes = {
        "name": None,
        "col_index": np.nan,
        "sample_size": 12,
        "metadata": {},
        "times": {"categories": 0.0},
        "thread_safe": True,
        "_categories": {"c": 5, "b": 4, "a": 3},
        "_CategoricalColumn__calculations": {},
        "_top_k_categories": None,
    }

    actual_json = json.dumps(column_profile, cls=ProfileEncoder)
    expected_json = json.dumps(expected_attributes)

    self.assertEqual(actual_json, expected_json)

0 comments on commit 3db5836

Please sign in to comment.