Skip to content

Commit

Permalink
Deserialization of datalabeler (#891)
Browse files Browse the repository at this point in the history
* Added initial profiler decoding for datalabeler column (WIP)

* Initial implementation for deserialization of DataLabelerColumn

* Fix LSP violations (#840)

* Make profiler superclasses generic

Makes the superclasses BaseColumnProfiler, NumericStatsMixin, and
BaseCompiler generic, to avoid casting in subclass diff() methods and
violating LSP in principle.

* Add needed cast import

---------

Co-authored-by: Junho Lee <[email protected]>
  • Loading branch information
ksneab7 and junholee6a authored Jun 21, 2023
1 parent f46b8a9 commit 3bb1127
Show file tree
Hide file tree
Showing 12 changed files with 145 additions and 37 deletions.
48 changes: 32 additions & 16 deletions dataprofiler/profilers/base_column_profilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,17 +6,18 @@
import abc
import warnings
from collections import defaultdict
from typing import Any, Callable
from typing import Any, Callable, Generic, TypeVar

import numpy as np
import pandas as pd

from dataprofiler.profilers.profiler_options import BaseInspectorOptions

from . import utils
from .profiler_options import BaseInspectorOptions, BaseOption

BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")


class BaseColumnProfiler(metaclass=abc.ABCMeta): # type: ignore
class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta):
"""Abstract class for profiling a column of data."""

col_type = None
Expand All @@ -28,7 +29,7 @@ class BaseColumnProfiler(metaclass=abc.ABCMeta): # type: ignore
_SAMPLING_RATIO = 0.20
_MIN_SAMPLING_COUNT = 500

def __init__(self, name: str | None) -> None:
def __init__(self, name: str | None, options: BaseOption | None = None):
"""
Initialize base class properties for the subclass.
Expand Down Expand Up @@ -147,7 +148,7 @@ def _merge_calculations(
)

def _add_helper(
self, other1: BaseColumnProfiler, other2: BaseColumnProfiler
self, other1: BaseColumnProfilerT, other2: BaseColumnProfilerT
) -> None:
"""
Merge the properties of two BaseColumnProfile objects.
Expand Down Expand Up @@ -176,7 +177,7 @@ def _add_helper(

self.sample_size = other1.sample_size + other2.sample_size

def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
def diff(self, other_profile: BaseColumnProfilerT, options: dict = None) -> dict:
"""
Find the differences for columns.
Expand Down Expand Up @@ -248,17 +249,27 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
raise NotImplementedError()

@classmethod
def load_from_dict(cls, data) -> BaseColumnProfiler:
def load_from_dict(
cls: type[BaseColumnProfilerT],
data: dict[str, Any],
options: dict | None = None,
) -> BaseColumnProfilerT:
"""
Parse attribute from json dictionary into self.
:param data: dictionary with attributes and values.
:type data: dict[string, Any]
:param options: options for loading column profiler params from dictionary
:type options: Dict | None
:return: Profiler with attributes populated.
:rtype: BaseColumnProfiler
"""
profile = cls(data["name"])
if options is None:
options = {}

class_options = options.get(cls.__name__)
profile: BaseColumnProfilerT = cls(data["name"], class_options)

time_vals = data.pop("times")
setattr(profile, "times", defaultdict(float, time_vals))
Expand All @@ -276,9 +287,14 @@ def load_from_dict(cls, data) -> BaseColumnProfiler:
return profile


BaseColumnPrimitiveTypeProfilerT = TypeVar(
"BaseColumnPrimitiveTypeProfilerT", bound="BaseColumnPrimitiveTypeProfiler"
)


class BaseColumnPrimitiveTypeProfiler(
BaseColumnProfiler,
metaclass=abc.ABCMeta, # type: ignore
BaseColumnProfiler[BaseColumnPrimitiveTypeProfilerT],
metaclass=abc.ABCMeta,
):
"""Abstract class for profiling primitive data type for col of data."""

Expand Down Expand Up @@ -306,10 +322,10 @@ def _update_column_base_properties(self, profile: dict) -> None:
self.match_count += int(profile.pop("match_count"))
BaseColumnProfiler._update_column_base_properties(self, profile)

def _add_helper( # type: ignore[override]
def _add_helper(
self,
other1: BaseColumnPrimitiveTypeProfiler,
other2: BaseColumnPrimitiveTypeProfiler,
other1: BaseColumnPrimitiveTypeProfilerT,
other2: BaseColumnPrimitiveTypeProfilerT,
) -> None:
"""
Merge the properties of two objects inputted.
Expand All @@ -319,5 +335,5 @@ def _add_helper( # type: ignore[override]
:type other1: BaseColumnPrimitiveTypeProfiler
:type other2: BaseColumnPrimitiveTypeProfiler
"""
BaseColumnProfiler._add_helper(self, other1, other2)
self.match_count = int(other1.match_count + other2.match_count)
super()._add_helper(other1, other2)
self.match_count = other1.match_count + other2.match_count
2 changes: 1 addition & 1 deletion dataprofiler/profilers/categorical_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
from .profiler_options import CategoricalOptions


class CategoricalColumn(BaseColumnProfiler):
class CategoricalColumn(BaseColumnProfiler["CategoricalColumn"]):
"""
Categorical column profile subclass of BaseColumnProfiler.
Expand Down
17 changes: 11 additions & 6 deletions dataprofiler/profilers/column_profile_compilers.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
import abc
from collections import OrderedDict
from multiprocessing.pool import Pool
from typing import Generic, TypeVar

from pandas import Series

Expand All @@ -20,8 +21,10 @@
from .unstructured_labeler_profile import UnstructuredLabelerProfile
from .unstructured_text_profile import TextProfiler

BaseCompilerT = TypeVar("BaseCompilerT", bound="BaseCompiler")

class BaseCompiler(metaclass=abc.ABCMeta): # type: ignore

class BaseCompiler(Generic[BaseCompilerT], metaclass=abc.ABCMeta):
"""Abstract class for generating a report."""

# NOTE: these profilers are ordered. Test functionality if changed.
Expand Down Expand Up @@ -141,7 +144,7 @@ def __add__(self, other: BaseCompiler) -> BaseCompiler:
)
return merged_profile_compiler

def diff(self, other: BaseCompiler, options: dict = None) -> dict:
def diff(self, other: BaseCompilerT, options: dict = None) -> dict:
"""
Find the difference between 2 compilers and returns the report.
Expand Down Expand Up @@ -247,7 +250,9 @@ def load_from_dict(cls, data) -> BaseCompiler:
return compiler


class ColumnPrimitiveTypeProfileCompiler(BaseCompiler):
class ColumnPrimitiveTypeProfileCompiler(
BaseCompiler["ColumnPrimitiveTypeProfileCompiler"]
):
"""For generating ordered column profile reports."""

# NOTE: these profilers are ordered. Test functionality if changed.
Expand Down Expand Up @@ -361,7 +366,7 @@ def diff(
return diff_profile


class ColumnStatsProfileCompiler(BaseCompiler):
class ColumnStatsProfileCompiler(BaseCompiler["ColumnStatsProfileCompiler"]):
"""For generating OrderColumn and CategoricalColumn reports."""

# NOTE: these profilers are ordered. Test functionality if changed.
Expand Down Expand Up @@ -406,7 +411,7 @@ def diff(self, other: ColumnStatsProfileCompiler, options: dict = None) -> dict:
return diff_profile


class ColumnDataLabelerCompiler(BaseCompiler):
class ColumnDataLabelerCompiler(BaseCompiler["ColumnDataLabelerCompiler"]):
"""For generating DataLabelerColumn report."""

# NOTE: these profilers are ordered. Test functionality if changed.
Expand Down Expand Up @@ -457,7 +462,7 @@ def diff(self, other: ColumnDataLabelerCompiler, options: dict = None) -> dict:
return diff_profile


class UnstructuredCompiler(BaseCompiler):
class UnstructuredCompiler(BaseCompiler["UnstructuredCompiler"]):
"""For generating TextProfiler and UnstructuredLabelerProfile reports."""

# NOTE: these profilers are ordered. Test functionality if changed.
Expand Down
44 changes: 43 additions & 1 deletion dataprofiler/profilers/data_labeler_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
from .profiler_options import DataLabelerOptions


class DataLabelerColumn(BaseColumnProfiler):
class DataLabelerColumn(BaseColumnProfiler["DataLabelerColumn"]):
"""Subclass of BaseColumnProfiler for profiling data labeler col."""

type = "data_labeler"
Expand Down Expand Up @@ -307,6 +307,48 @@ def profile(self) -> dict:
}
return profile

@classmethod
def load_from_dict(cls, data, options: dict | None = None) -> DataLabelerColumn:
    """
    Rebuild a DataLabelerColumn from its deserialized json dictionary.

    The "data_labeler" entry is popped from ``data`` and used to restore the
    labeler object; the remaining entries are loaded by the parent class.

    :param data: dictionary with attributes and values.
    :type data: dict[string, Any]
    :param options: options for loading column profiler params from dictionary.
        Not read by this method: the options handed to the parent class are
        built here from the restored labeler.
    :type options: Dict | None
    :raises NotImplementedError: if the labeler entry is marked "from_disk"
    :raises ValueError: if the labeler entry has neither "from_library" nor
        "from_disk"
    :return: Profiler with attributes populated.
    :rtype: DataLabelerColumn
    """
    labeler_options = DataLabelerOptions()
    labeler_source = data.pop("data_labeler")

    # Only labelers that came from the library can be restored; reject the
    # other cases up front.
    if "from_library" not in labeler_source:
        if "from_disk" in labeler_source:
            raise NotImplementedError(
                "Models intialized from disk have not yet been made deserializable"
            )
        raise ValueError(
            "Deserialization cannot be done on labelers without "
            "_default_model_loc set to known value."
        )

    labeler_options.data_labeler_object = DataLabeler.load_from_library(
        labeler_source["from_library"]
    )

    # NOTE: this super() call is ambiguous under multiple inheritance; if more
    # than one parent class defines load_from_dict there may be issues.
    loaded = super().load_from_dict(
        data, options={cls.__name__: labeler_options}
    )

    # Cast the mapping keys back to int and the summed predictions back to a
    # numpy array after loading.
    reverse_mapping = loaded._reverse_label_mapping
    if reverse_mapping is not None:
        loaded._reverse_label_mapping = {
            int(key): label for key, label in reverse_mapping.items()
        }

    summed_predictions = loaded._sum_predictions
    if summed_predictions is not None:
        loaded._sum_predictions = np.array(summed_predictions)

    return loaded

def report(self, remove_disabled_flag: bool = False) -> dict:
"""
Return report.
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/profilers/datetime_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from .profiler_options import DateTimeOptions


class DateTimeColumn(BaseColumnPrimitiveTypeProfiler):
class DateTimeColumn(BaseColumnPrimitiveTypeProfiler["DateTimeColumn"]):
"""
Datetime column profile subclass of BaseColumnProfiler.
Expand Down
4 changes: 3 additions & 1 deletion dataprofiler/profilers/float_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,9 @@
from .profiler_options import FloatOptions


class FloatColumn(NumericStatsMixin, BaseColumnPrimitiveTypeProfiler): # type: ignore
class FloatColumn(
NumericStatsMixin["FloatColumn"], BaseColumnPrimitiveTypeProfiler["FloatColumn"]
):
"""
Float column profile mixin with numerical stats.
Expand Down
4 changes: 3 additions & 1 deletion dataprofiler/profilers/int_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
from .profiler_options import IntOptions


class IntColumn(NumericStatsMixin, BaseColumnPrimitiveTypeProfiler): # type: ignore
class IntColumn(
NumericStatsMixin["IntColumn"], BaseColumnPrimitiveTypeProfiler["IntColumn"]
):
"""
Integer column profile mixin with of numerical stats.
Expand Down
6 changes: 3 additions & 3 deletions dataprofiler/profilers/json_decoder.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,9 +69,9 @@ def load_column_profile(serialized_json: dict) -> "BaseColumnProfiler":
JSON
"""
column_profiler_cls: Type["BaseColumnProfiler"] = get_column_profiler_class(
serialized_json["class"]
)
column_profiler_cls: Type[
"BaseColumnProfiler[BaseColumnProfiler]"
] = get_column_profiler_class(serialized_json["class"])
return column_profiler_cls.load_from_dict(serialized_json["data"])


Expand Down
13 changes: 8 additions & 5 deletions dataprofiler/profilers/numerical_column_stats.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
import copy
import itertools
import warnings
from typing import Any, Callable, Dict, List, cast
from typing import Any, Callable, Dict, List, TypeVar, cast

import numpy as np
import numpy.typing as npt
Expand All @@ -31,7 +31,10 @@ def __init__(self, function: Callable) -> None:
__isabstractmethod__ = True


class NumericStatsMixin(metaclass=abc.ABCMeta): # type: ignore
NumericStatsMixinT = TypeVar("NumericStatsMixinT", bound="NumericStatsMixin")


class NumericStatsMixin(BaseColumnProfiler[NumericStatsMixinT], metaclass=abc.ABCMeta):
"""
Abstract numerical column profile subclass of BaseColumnProfiler.
Expand Down Expand Up @@ -200,8 +203,8 @@ def _add_helper_merge_profile_histograms(

def _add_helper(
self,
other1: NumericStatsMixin,
other2: NumericStatsMixin,
other1: NumericStatsMixinT,
other2: NumericStatsMixinT,
) -> None:
"""
Help merge profiles.
Expand Down Expand Up @@ -413,7 +416,7 @@ def convert_histogram_key_types_to_np(histogram_info: dict):

def diff(
self,
other_profile: NumericStatsMixin,
other_profile: NumericStatsMixinT,
options: dict = None,
) -> dict:
"""
Expand Down
2 changes: 1 addition & 1 deletion dataprofiler/profilers/order_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def __lt__(self: CT, other: CT) -> bool:
CT = TypeVar("CT", bound=Comparable)


class OrderColumn(BaseColumnProfiler):
class OrderColumn(BaseColumnProfiler["OrderColumn"]):
"""
Index column profile subclass of BaseColumnProfiler.
Expand Down
5 changes: 4 additions & 1 deletion dataprofiler/profilers/text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,9 @@
from .profiler_options import TextOptions


class TextColumn(NumericStatsMixin, BaseColumnPrimitiveTypeProfiler): # type: ignore
class TextColumn(
NumericStatsMixin["TextColumn"], BaseColumnPrimitiveTypeProfiler["TextColumn"]
):
"""
Text column profile subclass of BaseColumnProfiler.
Expand Down Expand Up @@ -106,6 +108,7 @@ def diff(self, other_profile: TextColumn, options: dict = None) -> dict:
:rtype: dict
"""
differences = NumericStatsMixin.diff(self, other_profile, options)

del differences["psi"]
vocab_diff = utils.find_diff_of_lists_and_sets(self.vocab, other_profile.vocab)
differences["vocab"] = vocab_diff
Expand Down
Loading

0 comments on commit 3bb1127

Please sign in to comment.