Deserialization of datalabeler (#891)

* Added initial profiler decoding for DataLabeler column (WIP) * Initial implementation for deserialization of DataLabelerColumn * Fix LSP violations (#840) * Make profiler superclasses generic: makes the superclasses BaseColumnProfiler, NumericStatsMixin, and BaseCompiler generic, to avoid casting in subclass diff() methods and violating LSP in principle. * Add needed cast import --------- Co-authored-by: Junho Lee <[email protected]>
capitalone · Jun 21, 2023 · 3bb1127 · 3bb1127
1 parent f46b8a9
commit 3bb1127
Show file tree

Hide file tree

Showing 12 changed files with 145 additions and 37 deletions.
diff --git a/dataprofiler/profilers/base_column_profilers.py b/dataprofiler/profilers/base_column_profilers.py
@@ -6,17 +6,18 @@
 import abc
 import warnings
 from collections import defaultdict
-from typing import Any, Callable
+from typing import Any, Callable, Generic, TypeVar
 
 import numpy as np
 import pandas as pd
 
-from dataprofiler.profilers.profiler_options import BaseInspectorOptions
-
 from . import utils
+from .profiler_options import BaseInspectorOptions, BaseOption
+
+BaseColumnProfilerT = TypeVar("BaseColumnProfilerT", bound="BaseColumnProfiler")
 
 
-class BaseColumnProfiler(metaclass=abc.ABCMeta):  # type: ignore
+class BaseColumnProfiler(Generic[BaseColumnProfilerT], metaclass=abc.ABCMeta):
     """Abstract class for profiling a column of data."""
 
     col_type = None
@@ -28,7 +29,7 @@ class BaseColumnProfiler(metaclass=abc.ABCMeta):  # type: ignore
     _SAMPLING_RATIO = 0.20
     _MIN_SAMPLING_COUNT = 500
 
-    def __init__(self, name: str | None) -> None:
+    def __init__(self, name: str | None, options: BaseOption | None = None):
         """
         Initialize base class properties for the subclass.
 
@@ -147,7 +148,7 @@ def _merge_calculations(
                     )
 
     def _add_helper(
-        self, other1: BaseColumnProfiler, other2: BaseColumnProfiler
+        self, other1: BaseColumnProfilerT, other2: BaseColumnProfilerT
     ) -> None:
         """
         Merge the properties of two BaseColumnProfile objects.
@@ -176,7 +177,7 @@ def _add_helper(
 
         self.sample_size = other1.sample_size + other2.sample_size
 
-    def diff(self, other_profile: BaseColumnProfiler, options: dict = None) -> dict:
+    def diff(self, other_profile: BaseColumnProfilerT, options: dict = None) -> dict:
         """
         Find the differences for columns.
 
@@ -248,17 +249,27 @@ def report(self, remove_disabled_flag: bool = False) -> dict:
         raise NotImplementedError()
 
     @classmethod
-    def load_from_dict(cls, data) -> BaseColumnProfiler:
+    def load_from_dict(
+        cls: type[BaseColumnProfilerT],
+        data: dict[str, Any],
+        options: dict | None = None,
+    ) -> BaseColumnProfilerT:
         """
         Parse attribute from json dictionary into self.
 
         :param data: dictionary with attributes and values.
         :type data: dict[string, Any]
+        :param options: options for loading column profiler params from dictionary
+        :type options: Dict | None
 
         :return: Profiler with attributes populated.
         :rtype: BaseColumnProfiler
         """
-        profile = cls(data["name"])
+        if options is None:
+            options = {}
+
+        class_options = options.get(cls.__name__)
+        profile: BaseColumnProfilerT = cls(data["name"], class_options)
 
         time_vals = data.pop("times")
         setattr(profile, "times", defaultdict(float, time_vals))
@@ -276,9 +287,14 @@ def load_from_dict(cls, data) -> BaseColumnProfiler:
         return profile
 
 
+BaseColumnPrimitiveTypeProfilerT = TypeVar(
+    "BaseColumnPrimitiveTypeProfilerT", bound="BaseColumnPrimitiveTypeProfiler"
+)
+
+
 class BaseColumnPrimitiveTypeProfiler(
-    BaseColumnProfiler,
-    metaclass=abc.ABCMeta,  # type: ignore
+    BaseColumnProfiler[BaseColumnPrimitiveTypeProfilerT],
+    metaclass=abc.ABCMeta,
 ):
     """Abstract class for profiling primative data type for col of data."""
 
@@ -306,10 +322,10 @@ def _update_column_base_properties(self, profile: dict) -> None:
         self.match_count += int(profile.pop("match_count"))
         BaseColumnProfiler._update_column_base_properties(self, profile)
 
-    def _add_helper(  # type: ignore[override]
+    def _add_helper(
         self,
-        other1: BaseColumnPrimitiveTypeProfiler,
-        other2: BaseColumnPrimitiveTypeProfiler,
+        other1: BaseColumnPrimitiveTypeProfilerT,
+        other2: BaseColumnPrimitiveTypeProfilerT,
     ) -> None:
         """
         Merge the properties of two objects inputted.
@@ -319,5 +335,5 @@ def _add_helper(  # type: ignore[override]
         :type other1: BaseColumnPrimitiveTypeProfiler
         :type other2: BaseColumnPrimitiveTypeProfiler
         """
-        BaseColumnProfiler._add_helper(self, other1, other2)
-        self.match_count = int(other1.match_count + other2.match_count)
+        super()._add_helper(other1, other2)
+        self.match_count = other1.match_count + other2.match_count
diff --git a/dataprofiler/profilers/categorical_column_profile.py b/dataprofiler/profilers/categorical_column_profile.py
@@ -11,7 +11,7 @@
 from .profiler_options import CategoricalOptions
 
 
-class CategoricalColumn(BaseColumnProfiler):
+class CategoricalColumn(BaseColumnProfiler["CategoricalColumn"]):
     """
     Categorical column profile subclass of BaseColumnProfiler.
 

diff --git a/dataprofiler/profilers/column_profile_compilers.py b/dataprofiler/profilers/column_profile_compilers.py
@@ -4,6 +4,7 @@
 import abc
 from collections import OrderedDict
 from multiprocessing.pool import Pool
+from typing import Generic, TypeVar
 
 from pandas import Series
 
@@ -20,8 +21,10 @@
 from .unstructured_labeler_profile import UnstructuredLabelerProfile
 from .unstructured_text_profile import TextProfiler
 
+BaseCompilerT = TypeVar("BaseCompilerT", bound="BaseCompiler")
 
-class BaseCompiler(metaclass=abc.ABCMeta):  # type: ignore
+
+class BaseCompiler(Generic[BaseCompilerT], metaclass=abc.ABCMeta):
     """Abstract class for generating a report."""
 
     # NOTE: these profilers are ordered. Test functionality if changed.
@@ -141,7 +144,7 @@ def __add__(self, other: BaseCompiler) -> BaseCompiler:
             )
         return merged_profile_compiler
 
-    def diff(self, other: BaseCompiler, options: dict = None) -> dict:
+    def diff(self, other: BaseCompilerT, options: dict = None) -> dict:
         """
         Find the difference between 2 compilers and returns the report.
 
@@ -247,7 +250,9 @@ def load_from_dict(cls, data) -> BaseCompiler:
         return compiler
 
 
-class ColumnPrimitiveTypeProfileCompiler(BaseCompiler):
+class ColumnPrimitiveTypeProfileCompiler(
+    BaseCompiler["ColumnPrimitiveTypeProfileCompiler"]
+):
     """For generating ordered column profile reports."""
 
     # NOTE: these profilers are ordered. Test functionality if changed.
@@ -361,7 +366,7 @@ def diff(
         return diff_profile
 
 
-class ColumnStatsProfileCompiler(BaseCompiler):
+class ColumnStatsProfileCompiler(BaseCompiler["ColumnStatsProfileCompiler"]):
     """For generating OrderColumn and CategoricalColumn reports."""
 
     # NOTE: these profilers are ordered. Test functionality if changed.
@@ -406,7 +411,7 @@ def diff(self, other: ColumnStatsProfileCompiler, options: dict = None) -> dict:
         return diff_profile
 
 
-class ColumnDataLabelerCompiler(BaseCompiler):
+class ColumnDataLabelerCompiler(BaseCompiler["ColumnDataLabelerCompiler"]):
     """For generating DataLabelerColumn report."""
 
     # NOTE: these profilers are ordered. Test functionality if changed.
@@ -457,7 +462,7 @@ def diff(self, other: ColumnDataLabelerCompiler, options: dict = None) -> dict:
         return diff_profile
 
 
-class UnstructuredCompiler(BaseCompiler):
+class UnstructuredCompiler(BaseCompiler["UnstructuredCompiler"]):
     """For generating TextProfiler and UnstructuredLabelerProfile reports."""
 
     # NOTE: these profilers are ordered. Test functionality if changed.

diff --git a/dataprofiler/profilers/data_labeler_column_profile.py b/dataprofiler/profilers/data_labeler_column_profile.py
@@ -14,7 +14,7 @@
 from .profiler_options import DataLabelerOptions
 
 
-class DataLabelerColumn(BaseColumnProfiler):
+class DataLabelerColumn(BaseColumnProfiler["DataLabelerColumn"]):
     """Sublass of BaseColumnProfiler for profiling data labeler col."""
 
     type = "data_labeler"
@@ -307,6 +307,48 @@ def profile(self) -> dict:
         }
         return profile
 
+    @classmethod
+    def load_from_dict(cls, data, options: dict | None = None) -> DataLabelerColumn:
+        """
+        Parse attribute from json dictionary into self.
+
+        :param data: dictionary with attributes and values.
+        :type data: dict[string, Any]
+        :param options: options for loading column profiler params from dictionary
+        :type options: Dict | None
+
+        :return: Profiler with attributes populated.
+        :rtype: DataLabelerColumn
+        """
+        opt = DataLabelerOptions()
+        data_labeler_load_attr = data.pop("data_labeler")
+        if "from_library" in data_labeler_load_attr:
+            opt.data_labeler_object = DataLabeler.load_from_library(
+                data_labeler_load_attr["from_library"]
+            )
+        elif "from_disk" in data_labeler_load_attr:
+            raise NotImplementedError(
+                "Models intialized from disk have not yet been made deserializable"
+            )
+        else:
+            raise ValueError(
+                "Deserialization cannot be done on labelers without "
+                "_default_model_loc set to known value."
+            )
+
+        # This is an ambiguous call to super classes.
+        # If load_from_dict is part of both super classes there may be issues
+        profile = super().load_from_dict(data, options={cls.__name__: opt})
+
+        if profile._reverse_label_mapping is not None:
+            profile._reverse_label_mapping = {
+                int(k): v for k, v in profile._reverse_label_mapping.items()
+            }
+        if profile._sum_predictions is not None:
+            profile._sum_predictions = np.array(profile._sum_predictions)
+
+        return profile
+
     def report(self, remove_disabled_flag: bool = False) -> dict:
         """
         Return report.

diff --git a/dataprofiler/profilers/datetime_column_profile.py b/dataprofiler/profilers/datetime_column_profile.py
@@ -13,7 +13,7 @@
 from .profiler_options import DateTimeOptions
 
 
-class DateTimeColumn(BaseColumnPrimitiveTypeProfiler):
+class DateTimeColumn(BaseColumnPrimitiveTypeProfiler["DateTimeColumn"]):
     """
     Datetime column profile subclass of BaseColumnProfiler.
 

diff --git a/dataprofiler/profilers/float_column_profile.py b/dataprofiler/profilers/float_column_profile.py
@@ -13,7 +13,9 @@
 from .profiler_options import FloatOptions
 
 
-class FloatColumn(NumericStatsMixin, BaseColumnPrimitiveTypeProfiler):  # type: ignore
+class FloatColumn(
+    NumericStatsMixin["FloatColumn"], BaseColumnPrimitiveTypeProfiler["FloatColumn"]
+):
     """
     Float column profile mixin with numerical stats.
 

diff --git a/dataprofiler/profilers/int_column_profile.py b/dataprofiler/profilers/int_column_profile.py
@@ -9,7 +9,9 @@
 from .profiler_options import IntOptions
 
 
-class IntColumn(NumericStatsMixin, BaseColumnPrimitiveTypeProfiler):  # type: ignore
+class IntColumn(
+    NumericStatsMixin["IntColumn"], BaseColumnPrimitiveTypeProfiler["IntColumn"]
+):
     """
     Integer column profile mixin with of numerical stats.
 

diff --git a/dataprofiler/profilers/json_decoder.py b/dataprofiler/profilers/json_decoder.py
@@ -69,9 +69,9 @@ def load_column_profile(serialized_json: dict) -> "BaseColumnProfiler":
         JSON
 
     """
-    column_profiler_cls: Type["BaseColumnProfiler"] = get_column_profiler_class(
-        serialized_json["class"]
-    )
+    column_profiler_cls: Type[
+        "BaseColumnProfiler[BaseColumnProfiler]"
+    ] = get_column_profiler_class(serialized_json["class"])
     return column_profiler_cls.load_from_dict(serialized_json["data"])
 
 

diff --git a/dataprofiler/profilers/numerical_column_stats.py b/dataprofiler/profilers/numerical_column_stats.py
@@ -6,7 +6,7 @@
 import copy
 import itertools
 import warnings
-from typing import Any, Callable, Dict, List, cast
+from typing import Any, Callable, Dict, List, TypeVar, cast
 
 import numpy as np
 import numpy.typing as npt
@@ -31,7 +31,10 @@ def __init__(self, function: Callable) -> None:
     __isabstractmethod__ = True
 
 
-class NumericStatsMixin(metaclass=abc.ABCMeta):  # type: ignore
+NumericStatsMixinT = TypeVar("NumericStatsMixinT", bound="NumericStatsMixin")
+
+
+class NumericStatsMixin(BaseColumnProfiler[NumericStatsMixinT], metaclass=abc.ABCMeta):
     """
     Abstract numerical column profile subclass of BaseColumnProfiler.
 
@@ -200,8 +203,8 @@ def _add_helper_merge_profile_histograms(
 
     def _add_helper(
         self,
-        other1: NumericStatsMixin,
-        other2: NumericStatsMixin,
+        other1: NumericStatsMixinT,
+        other2: NumericStatsMixinT,
     ) -> None:
         """
         Help merge profiles.
@@ -413,7 +416,7 @@ def convert_histogram_key_types_to_np(histogram_info: dict):
 
     def diff(
         self,
-        other_profile: NumericStatsMixin,
+        other_profile: NumericStatsMixinT,
         options: dict = None,
     ) -> dict:
         """

diff --git a/dataprofiler/profilers/order_column_profile.py b/dataprofiler/profilers/order_column_profile.py
@@ -24,7 +24,7 @@ def __lt__(self: CT, other: CT) -> bool:
 CT = TypeVar("CT", bound=Comparable)
 
 
-class OrderColumn(BaseColumnProfiler):
+class OrderColumn(BaseColumnProfiler["OrderColumn"]):
     """
     Index column profile subclass of BaseColumnProfiler.
 

diff --git a/dataprofiler/profilers/text_column_profile.py b/dataprofiler/profilers/text_column_profile.py
@@ -12,7 +12,9 @@
 from .profiler_options import TextOptions
 
 
-class TextColumn(NumericStatsMixin, BaseColumnPrimitiveTypeProfiler):  # type: ignore
+class TextColumn(
+    NumericStatsMixin["TextColumn"], BaseColumnPrimitiveTypeProfiler["TextColumn"]
+):
     """
     Text column profile subclass of BaseColumnProfiler.
 
@@ -106,6 +108,7 @@ def diff(self, other_profile: TextColumn, options: dict = None) -> dict:
         :rtype: dict
         """
         differences = NumericStatsMixin.diff(self, other_profile, options)
+
         del differences["psi"]
         vocab_diff = utils.find_diff_of_lists_and_sets(self.vocab, other_profile.vocab)
         differences["vocab"] = vocab_diff