Skip to content

Commit

Permalink
Deserialization of datalabeler (capitalone#891)
Browse files Browse the repository at this point in the history
* Added initial profiler decoding for datalabeler column (WIP)

* Initial implementation for deserialization of datalabelercolumn

* Fix LSP violations (capitalone#840)

* Make profiler superclasses generic

Makes the superclasses BaseColumnProfiler, NumericStatsMixin, and
BaseCompiler generic, to avoid casting in subclass diff() methods and
violating LSP in principle.

* Add needed cast import

---------

Co-authored-by: Junho Lee <[email protected]>
  • Loading branch information
2 people authored and JGSweets committed Jun 29, 2023
1 parent 2ad1de4 commit c13c318
Show file tree
Hide file tree
Showing 4 changed files with 94 additions and 8 deletions.
19 changes: 14 additions & 5 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,8 @@
import numpy as np
import pandas as pd

from dataprofiler.profilers.profiler_options import BaseInspectorOptions

from . import utils
from .profiler_options import BaseInspectorOptions, BaseOption

BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")

Expand All @@ -30,7 +29,7 @@ class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta):
_SAMPLING_RATIO = 0.20
_MIN_SAMPLING_COUNT = 500

def __init__(self, name: str | None) -> None:
def __init__(self, name: str | None, options: BaseOption | None = None):
"""
Initialize base class properties for the subclass.
Expand Down Expand Up @@ -250,17 +249,27 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
raise NotImplementedError()

@classmethod
def load_from_dict(cls, data) -> BaseColumnProfiler:
def load_from_dict(
cls: type[BaseColumnProfilerT],
data: dict[str, Any],
options: dict | None = None,
) -> BaseColumnProfilerT:
"""
Parse attribute from json dictionary into self.
:param data: dictionary with attributes and values.
:type data: dict[string, Any]
:param options: options for loading column profiler params from dictionary
:type options: Dict | None
:return: Profiler with attributes populated.
:rtype: BaseColumnProfiler
"""
profile = cls(data["name"])
if options is None:
options = {}

class_options = options.get(cls.__name__)
profile: BaseColumnProfilerT = cls(data["name"], class_options)

time_vals = data.pop("times")
setattr(profile, "times", defaultdict(float, time_vals))
Expand Down
42 changes: 42 additions & 0 deletions dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -307,6 +307,48 @@ def profile(self) -> dict:
}
return profile

@classmethod
def load_from_dict(cls, data, options: dict | None = None) -> DataLabelerColumn:
    """
    Rebuild a DataLabelerColumn from its serialized dictionary form.

    The "data_labeler" entry is removed from ``data`` and used to restore
    the labeler object; the remaining entries are passed on to the
    superclass loader. The ``options`` argument is not forwarded: the
    superclass is always given a new mapping holding the options built here.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from dictionary
    :type options: Dict | None
    :raises NotImplementedError: if the labeler entry only has "from_disk"
    :raises ValueError: if the labeler entry has neither "from_library"
        nor "from_disk"
    :return: Profiler with attributes populated.
    :rtype: DataLabelerColumn
    """
    labeler_options = DataLabelerOptions()
    labeler_source = data.pop("data_labeler")

    # Only labelers that were loaded from the library can be restored.
    if "from_library" not in labeler_source:
        if "from_disk" in labeler_source:
            raise NotImplementedError(
                "Models intialized from disk have not yet been made deserializable"
            )
        raise ValueError(
            "Deserialization cannot be done on labelers without "
            "_default_model_loc set to known value."
        )

    library_name = labeler_source["from_library"]
    labeler_options.data_labeler_object = DataLabeler.load_from_library(
        library_name
    )

    # This is an ambiguous call to super classes.
    # If load_from_dict is part of both super classes there may be issues
    loaded = super().load_from_dict(
        data, options={cls.__name__: labeler_options}
    )

    # Presumably the mapping keys arrive as strings after a JSON round trip;
    # convert them back to integers.
    label_mapping = loaded._reverse_label_mapping
    if label_mapping is not None:
        loaded._reverse_label_mapping = {
            int(key): label for key, label in label_mapping.items()
        }

    # Turn the stored prediction sums back into a numpy array.
    summed_predictions = loaded._sum_predictions
    if summed_predictions is not None:
        loaded._sum_predictions = np.array(summed_predictions)

    return loaded

def report(self, remove_disabled_flag: bool = False) -> dict:
"""
Return report.
Expand Down
6 changes: 3 additions & 3 deletions dataprofiler/profilers/json_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def load_column_profile(serialized_json: dict) -> "BaseColumnProfiler":
JSON
"""
column_profiler_cls: Type["BaseColumnProfiler"] = get_column_profiler_class(
serialized_json["class"]
)
column_profiler_cls: Type[
"BaseColumnProfiler[BaseColumnProfiler]"
] = get_column_profiler_class(serialized_json["class"])
return column_profiler_cls.load_from_dict(serialized_json["data"])


Expand Down
35 changes: 35 additions & 0 deletions dataprofiler/tests/profilers/test_data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from dataprofiler.labelers import BaseDataLabeler
from dataprofiler.profilers import utils
from dataprofiler.profilers.data_labeler_column_profile import DataLabelerColumn
from dataprofiler.profilers.json_decoder import load_column_profile
from dataprofiler.profilers.json_encoder import ProfileEncoder
from dataprofiler.profilers.profiler_options import DataLabelerOptions

Expand All @@ -27,6 +28,7 @@ def _setup_data_labeler_mock(mock_instance):
mock_DataLabeler.reverse_label_mapping = {0: "a", 1: "b"}
mock_DataLabeler.model.num_labels = 2
mock_DataLabeler.model.requires_zero_mapping = False
mock_instance.load_from_library.side_effect = mock_instance

def mock_predict(data, *args, **kwargs):
len_data = len(data)
Expand Down Expand Up @@ -483,3 +485,36 @@ def test_json_encode_after_update(self, mock_instance):
)

self.assertEqual(expected, serialized)

def test_json_decode(self, mock_instance):
    """A freshly created profile survives an encode/decode round trip."""
    self._setup_data_labeler_mock(mock_instance)

    column = pd.Series(["1", "2", "3", "4"], dtype=object)
    original_profile = DataLabelerColumn(column.name)
    # The labeler needs a known model location for the round trip.
    original_profile.data_labeler._default_model_loc = "structured_model"

    encoded = json.dumps(original_profile, cls=ProfileEncoder)
    decoded_profile = load_column_profile(json.loads(encoded))

    test_utils.assert_profiles_equal(decoded_profile, original_profile)

def test_json_decode_after_update(self, mock_instance):
    """An updated profile round-trips and can still be updated afterwards."""
    self._setup_data_labeler_mock(mock_instance)

    first_batch = pd.Series(["1", "2", "3", "4"], dtype=object)
    original_profile = DataLabelerColumn(first_batch.name)
    original_profile.data_labeler._default_model_loc = "structured_model"
    with test_utils.mock_timeit():
        original_profile.update(first_batch)

    encoded = json.dumps(original_profile, cls=ProfileEncoder)
    decoded_profile = load_column_profile(json.loads(encoded))
    test_utils.assert_profiles_equal(decoded_profile, original_profile)

    # Feed a second batch to the decoded profile and check the totals
    # cover both batches.
    second_batch = pd.Series(["4", "5", "6", "7"], dtype=object)
    decoded_profile.update(second_batch)

    assert decoded_profile.sample_size == 8
    self.assertDictEqual({"a": 4, "b": 4}, decoded_profile.rank_distribution)
    expected_sums = np.array([4.0, 4.0])
    np.testing.assert_array_equal(expected_sums, decoded_profile.sum_predictions)

0 comments on commit c13c318

Please sign in to comment.