Profiler runs without TF & TF only executes when necessary (#41)
* updated readme

* updated some documentation & .gitignore

* moved examples to doc

* rephrase

* updated text

* profiler runs without tensorflow and doesn't load tensorflow until necessary

* remove unnecessary file

* updated gitignore

* cosmetic code changes

* removed commented out items

* improved warning message

* add tests

* move tensorflow to top

* typo fix

* updated tests and errors

* use pandas for dataframe

* remove temp file

* rename

* rename
lettergram authored Mar 3, 2021
1 parent a357c0c commit 7c05449
Showing 11 changed files with 91 additions and 35 deletions.
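
The common thread in these diffs is moving heavy imports from module scope into the functions that actually need them, so `import dataprofiler` succeeds even when TensorFlow is not installed. As a minimal, self-contained sketch of that lazy-import pattern (the `encode_labels` helper is illustrative only, not code from this commit):

    def encode_labels(labels, num_classes):
        # Deferred import: TensorFlow is loaded on the first call that
        # needs it, rather than when the package itself is imported.
        import tensorflow as tf
        return tf.keras.utils.to_categorical(labels, num_classes)

The trade-off is a one-time import cost on the first call; subsequent calls hit Python's module cache (sys.modules) and pay nothing.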
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 DataProfiler\.egg-info/
 DataProfiler.egg-info/*
 data_profiler/labelers/embeddings/glove-reduced-64D.txt
+dataprofiler/labelers/embeddings/glove-reduced-64D.txt
 *.xml
 *.orig
 
1 change: 0 additions & 1 deletion dataprofiler/__init__.py
@@ -4,7 +4,6 @@
 from .labelers.data_labelers import train_structured_labeler, DataLabeler, \
                                     StructuredDataLabeler, \
                                     UnstructuredDataLabeler
-from .labelers import CharacterLevelCnnModel, RegexModel
 from .validators.base_validators import Validator
 from .version import __version__
 
2 changes: 0 additions & 2 deletions dataprofiler/labelers/__init__.py
@@ -28,8 +28,6 @@
 """
 # import models
 from .base_data_labeler import BaseDataLabeler
-from .character_level_cnn_model import CharacterLevelCnnModel
-from .regex_model import RegexModel
 
 # import data processors
 from .data_processing import CharPreprocessor, CharPostprocessor, \
2 changes: 2 additions & 0 deletions dataprofiler/labelers/base_data_labeler.py
@@ -272,6 +272,7 @@ def predict(self, data, batch_size=32, predict_options=None,
         :param verbose: Flag to determine whether to print status or not
         :return: predictions
         """
+
         if predict_options is None:
             predict_options = {}
         data = self._check_and_return_valid_data_format(
@@ -422,6 +423,7 @@ def _load_parameters(dirpath, load_options=None):
 
         with open(os.path.join(dirpath, 'data_labeler_parameters.json')) as fp:
            params = json.load(fp)
+
        if 'model_class' in load_options:
            model_class = load_options.get('model_class')
            if not isinstance(model_class, BaseModel):
5 changes: 5 additions & 0 deletions dataprofiler/labelers/base_model.py
@@ -96,6 +96,11 @@ def num_labels(self):
 
     @classmethod
     def get_class(cls, class_name):
+
+        # Import possible internal models
+        from .regex_model import RegexModel
+        from .character_level_cnn_model import CharacterLevelCnnModel
+
         return cls._BaseModel__subclasses.get(class_name.lower(), None)
 
     def get_parameters(self, param_list=None):
39 changes: 22 additions & 17 deletions dataprofiler/labelers/character_level_cnn_model.py
@@ -7,7 +7,6 @@
 from collections import defaultdict
 
 import tensorflow as tf
-import tensorflow_addons as tfa
 import numpy as np
 from sklearn import decomposition
 
@@ -17,6 +16,16 @@
 
 _file_dir = os.path.dirname(os.path.abspath(__file__))
 
+class NoV1ResourceMessageFilter(logging.Filter):
+    """Removes TF2 warning for using TF1 model which has resources."""
+    def filter(self, record):
+        msg = 'is a problem, consider rebuilding the SavedModel after ' + \
+              'running tf.compat.v1.enable_resource_variables()'
+        return msg not in record.getMessage()
+
+tf_logger = logging.getLogger('tensorflow')
+tf_logger.addFilter(NoV1ResourceMessageFilter())
+
 
 def build_embd_dictionary(filename):
     """
@@ -65,19 +74,6 @@ def create_glove_char(n_dims, source_file=None):
         for word, embd in zip(embd_words, reduced_embds):
             file.write(word + " " + ' '.join(str(num) for num in embd) + "\n")
 
-
-class NoV1ResourceMessageFilter(logging.Filter):
-    """Removes TF2 warning for using TF1 model which has resources."""
-    def filter(self, record):
-        msg = 'is a problem, consider rebuilding the SavedModel after ' + \
-              'running tf.compat.v1.enable_resource_variables()'
-        return msg not in record.getMessage()
-
-
-tf_logger = logging.getLogger('tensorflow')
-tf_logger.addFilter(NoV1ResourceMessageFilter())
-
-
 class CharacterLevelCnnModel(BaseTrainableModel,
                              metaclass=AutoSubRegistrationMeta):
 
@@ -257,6 +253,7 @@ def load_from_disk(cls, dirpath):
         :type dirpath: str
         :return: None
         """
+
         # load parameters
         model_param_dirpath = os.path.join(dirpath, "model_parameters.json")
         with open(model_param_dirpath, 'r') as fp:
@@ -268,6 +265,8 @@ def load_from_disk(cls, dirpath):
             label_mapping = json.load(fp)
 
         # load tf model
+        # Use TFA to add f1 score to output
+        import tensorflow_addons as tfa
         custom_objects = {
             "F1Score": tfa.metrics.F1Score(
                 num_classes=max(label_mapping.values()) + 1,
@@ -314,6 +313,7 @@ def _char_encoding_layer(input_str_tensor, max_char_encoding_id, max_len):
         :return : tensor containing encoded list of input sentences
         :rtype: tf.Tensor
         """
+
         # convert characters to indices
         input_str_flatten = tf.reshape(input_str_tensor, [-1])
         sentences_encode = tf.strings.unicode_decode(input_str_flatten,
@@ -399,7 +399,7 @@ def _construct_model(self):
         num_labels = self.num_labels
         default_ind = self.label_mapping[self._parameters['default_label']]
 
-        #Reset model
+        # Reset model
         tf.keras.backend.clear_session()
 
         # generate glove embedding
@@ -487,8 +487,10 @@ def encoding_function(input_str):
         softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]
         losses = {softmax_output_layer_name: "categorical_crossentropy"}
 
-        f1_score_training = tfa.metrics.F1Score(num_classes=num_labels,
-                                                average='micro')
+        # Use TFA to add f1 score to output
+        import tensorflow_addons as tfa
+        f1_score_training = tfa.metrics.F1Score(
+            num_classes=num_labels, average='micro')
         metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
 
         self._model.compile(loss=losses,
@@ -515,6 +517,7 @@ def _reconstruct_model(self):
         :return: None
         """
 
+        # Reset model
         tf.keras.backend.clear_session()
 
         num_labels = self.num_labels
@@ -547,6 +550,8 @@ def _reconstruct_model(self):
         softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]
         losses = {softmax_output_layer_name: "categorical_crossentropy"}
 
+        # Use TFA to add f1 score to output
+        import tensorflow_addons as tfa
         f1_score_training = tfa.metrics.F1Score(
             num_classes=num_labels, average='micro')
         metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
2 changes: 1 addition & 1 deletion dataprofiler/labelers/data_labelers.py
@@ -64,7 +64,7 @@ class StructuredDataLabeler(BaseDataLabeler):
 
 
 class DataLabeler(object):
-
+
     labeler_classes = dict(
         structured=StructuredDataLabeler,
         unstructured=UnstructuredDataLabeler,
6 changes: 4 additions & 2 deletions dataprofiler/labelers/data_processing.py
@@ -10,7 +10,6 @@
 import math
 
 import numpy as np
-import tensorflow as tf
 
 default_labeler_dir = pkg_resources.resource_filename(
     'resources', 'labelers'
@@ -613,6 +612,9 @@ def process(self, data, labels=None, label_mapping=None, batch_size=32):
             raise ValueError('If `labels` are specified, `label_mapping` must '
                              'also be specified.')
 
+        # Import tensorflow
+        import tensorflow as tf
+
         # get parameters
         max_length = self._parameters['max_length']
         default_label = self._parameters['default_label']
@@ -626,9 +628,9 @@ def process(self, data, labels=None, label_mapping=None, batch_size=32):
             # Convert to necessary training data format.
             X_train = np.array(
                 [[sentence] for sentence in batch_data['samples']])
-
             if labels is not None:
                 num_classes = max(label_mapping.values()) + 1
+
                 Y_train = tf.keras.utils.to_categorical(
                     batch_data['labels'], num_classes)
             yield X_train, Y_train
42 changes: 31 additions & 11 deletions dataprofiler/profilers/column_profile_compilers.py
@@ -43,20 +43,40 @@ def _create_profile(self, df_series, options=None):
         # convert all the values to string
         df_series = df_series.apply(str)
 
-        selected_columns = None
+        selected_col_profiles = None
         if options and isinstance(options, StructuredOptions):
-            selected_columns = options.enabled_columns
+            selected_col_profiles = options.enabled_columns
 
-        for column_type in self._profilers:
+        for col_profile_type in self._profilers:
             # Create profile if options allow for it or if there are no options
-            if selected_columns is None or \
-                    column_type.col_type in selected_columns:
-                column_options = None
-                if options and options.properties[column_type.col_type]:
-                    column_options = options.properties[column_type.col_type]
-                self._profiles[column_type.col_type] = \
-                    column_type(df_series.name, options=column_options)
-                self._profiles[column_type.col_type].update(df_series)
+            if selected_col_profiles is None or \
+                    col_profile_type.col_type in selected_col_profiles:
+                col_profile_options = None
+                if options and options.properties[col_profile_type.col_type]:
+                    col_profile_options = options.properties[col_profile_type.col_type]
+
+                try:
+                    self._profiles[col_profile_type.col_type] = \
+                        col_profile_type(df_series.name, options=col_profile_options)
+                    self._profiles[col_profile_type.col_type].update(df_series)
+                except Exception as e:
+                    import warnings
+                    warning_msg = "\n\n!!! WARNING Partial Profiler Failure !!!\n\n"
+                    warning_msg += "Profiling Type: {}".format(col_profile_type.col_type)
+                    warning_msg += "\nException: {}".format(type(e).__name__)
+                    warning_msg += "\nMessage: {}".format(e)
+
+                    # This is considered a major error
+                    if type(e).__name__ == "ValueError":
+                        raise ValueError(e)
+
+                    warning_msg += "\n\nFor labeler errors, try installing "
+                    warning_msg += "tensorflow and tensorflow-addons; command:\n"
+                    warning_msg += "    $ pip install tensorflow "
+                    warning_msg += "tensorflow-addons --user\n"
+
+                    warnings.warn(warning_msg, RuntimeWarning, stacklevel=2)
+
 
     def __add__(self, other):
         """
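The try/except above isolates a single failing column profiler so the rest of the profile still completes; only ValueError is treated as fatal. A self-contained sketch of the same failure-isolation pattern (the `run_profilers` helper is hypothetical, not library code):

    import warnings

    def run_profilers(profilers, data):
        # One profiler crashing is downgraded to a RuntimeWarning;
        # ValueError is still considered a major error and re-raised.
        results = {}
        for name, profile_fn in profilers.items():
            try:
                results[name] = profile_fn(data)
            except ValueError:
                raise
            except Exception as e:
                warnings.warn(
                    "Partial Profiler Failure in {}: {}".format(name, e),
                    RuntimeWarning, stacklevel=2)
        return results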
1 change: 1 addition & 0 deletions dataprofiler/profilers/profile_builder.py
@@ -329,6 +329,7 @@ def __init__(self, data, samples_per_update=None, min_true_samples=None,
         elif not isinstance(profiler_options, ProfilerOptions):
             raise ValueError("The profile options must be passed as a "
                              "ProfileOptions object.")
+
         profiler_options.validate()
         self.options = profiler_options
 
25 changes: 24 additions & 1 deletion dataprofiler/tests/test_data_profiler.py
@@ -52,7 +52,7 @@ def reload_data_profiler():
     """Recursively reload modules."""
     sys_modules = sys.modules.copy()
     for module_name, module in sys_modules.items():
-        # Only reload top level of the data_profiler
+        # Only reload top level of the dataprofiler
         if ('dataprofiler' in module_name and
                 len(module_name.split('.')) < 3):
             if isinstance(module, types.ModuleType):
@@ -79,6 +79,29 @@ def import_mock(name, *args):
                 '\tsudo apt-get -y install libsnappy-dev`\n',
             )
 
+    def test_no_tensorflow(self):
+        import sys
+        import importlib
+        import types
+        import pandas
+        orig_import = __import__
+        # necessary for any wrapper around the library to test if snappy caught
+        # as an issue
+
+        def import_mock(name, *args):
+            if name == 'tensorflow':
+                raise ImportError('test')
+            return orig_import(name, *args)
+
+        with mock.patch('builtins.__import__', side_effect=import_mock):
+
+            with self.assertWarns(RuntimeWarning) as w:
+                import dataprofiler
+                df = pandas.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
+                profile = dataprofiler.Profiler(df)
+
+            warning_msg = "Partial Profiler Failure"
+            self.assertIn(warning_msg, str(w.warning))
+
 
 if __name__ == '__main__':
     unittest.main()
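The new test pins down the user-visible behavior: in an environment without TensorFlow, profiling a DataFrame emits a RuntimeWarning for the labeler instead of crashing. A usage sketch of that behavior (assumes tensorflow is absent; with it installed, no warning is raised):

    import warnings
    import pandas as pd
    import dataprofiler as dp

    df = pd.DataFrame([[1, 2.0], [1, 2.2], [-1, 3]])
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        profile = dp.Profiler(df)  # statistical profiles still run
    for w in caught:
        if issubclass(w.category, RuntimeWarning):
            print(str(w.message))  # contains "Partial Profiler Failure"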