Profiler runs without TF & TF only executes when necessary (#41)
* updated readme

* updated some documentation & .gitignore

* moved examples to doc

* rephrase

* updated text

* profiler runs without tensorflow and doesn't load tensorflow until necessary

* remove unnecessary file

* updated gitignore

* cosmetic code changes

* removed commented out items

* improved warning message

* add tests

* move tensorflow to top

* typo fix

* updated tests and errors

* use pandas for dataframe

* remove temp file

* rename

* rename
lettergram authored Mar 3, 2021
1 parent a357c0c commit 7c05449
Showing 11 changed files with 91 additions and 35 deletions.
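
The common thread in these diffs is moving heavy imports from module scope into the functions that actually need them, so `import dataprofiler` succeeds even when TensorFlow is not installed. As a minimal, self-contained sketch of that lazy-import pattern (the `encode_labels` helper is illustrative only, not code from this commit):

    def encode_labels(labels, num_classes):
        # Deferred import: TensorFlow is loaded on the first call that
        # needs it, rather than when the package itself is imported.
        import tensorflow as tf
        return tf.keras.utils.to_categorical(labels, num_classes)

The trade-off is a one-time import cost on the first call; subsequent calls hit Python's module cache (sys.modules) and pay nothing.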
1 change: 1 addition & 0 deletions .gitignore
@@ -3,6 +3,7 @@
 DataProfiler\.egg-info/
 DataProfiler.egg-info/*
 data_profiler/labelers/embeddings/glove-reduced-64D.txt
+dataprofiler/labelers/embeddings/glove-reduced-64D.txt
 *.xml
 *.orig
 
1 change: 0 additions & 1 deletion dataprofiler/__init__.py
@@ -4,7 +4,6 @@
 from .labelers.data_labelers import train_structured_labeler, DataLabeler, \
                                     StructuredDataLabeler, \
                                     UnstructuredDataLabeler
-from .labelers import CharacterLevelCnnModel, RegexModel
 from .validators.base_validators import Validator
 from .version import __version__
 
2 changes: 0 additions & 2 deletions dataprofiler/labelers/__init__.py
@@ -28,8 +28,6 @@
 """
 # import models
 from .base_data_labeler import BaseDataLabeler
-from .character_level_cnn_model import CharacterLevelCnnModel
-from .regex_model import RegexModel
 
 # import data processors
 from .data_processing import CharPreprocessor, CharPostprocessor, \
2 changes: 2 additions & 0 deletions dataprofiler/labelers/base_data_labeler.py
@@ -272,6 +272,7 @@ def predict(self, data, batch_size=32, predict_options=None,
         :param verbose: Flag to determine whether to print status or not
         :return: predictions
         """
+
         if predict_options is None:
             predict_options = {}
         data = self._check_and_return_valid_data_format(
@@ -422,6 +423,7 @@ def _load_parameters(dirpath, load_options=None):
 
         with open(os.path.join(dirpath, 'data_labeler_parameters.json')) as fp:
            params = json.load(fp)
+
        if 'model_class' in load_options:
            model_class = load_options.get('model_class')
            if not isinstance(model_class, BaseModel):
5 changes: 5 additions & 0 deletions dataprofiler/labelers/base_model.py
@@ -96,6 +96,11 @@ def num_labels(self):
 
     @classmethod
     def get_class(cls, class_name):
+
+        # Import possible internal models
+        from .regex_model import RegexModel
+        from .character_level_cnn_model import CharacterLevelCnnModel
+
         return cls._BaseModel__subclasses.get(class_name.lower(), None)
 
     def get_parameters(self, param_list=None):
39 changes: 22 additions & 17 deletions dataprofiler/labelers/character_level_cnn_model.py
@@ -7,7 +7,6 @@
 from collections import defaultdict
 
 import tensorflow as tf
-import tensorflow_addons as tfa
 import numpy as np
 from sklearn import decomposition
 
@@ -17,6 +16,16 @@
 
 _file_dir = os.path.dirname(os.path.abspath(__file__))
 
+class NoV1ResourceMessageFilter(logging.Filter):
+    """Removes TF2 warning for using TF1 model which has resources."""
+    def filter(self, record):
+        msg = 'is a problem, consider rebuilding the SavedModel after ' + \
+              'running tf.compat.v1.enable_resource_variables()'
+        return msg not in record.getMessage()
+
+tf_logger = logging.getLogger('tensorflow')
+tf_logger.addFilter(NoV1ResourceMessageFilter())
+
 
 def build_embd_dictionary(filename):
     """
@@ -65,19 +74,6 @@ def create_glove_char(n_dims, source_file=None):
         for word, embd in zip(embd_words, reduced_embds):
             file.write(word + " " + ' '.join(str(num) for num in embd) + "\n")
 
-
-class NoV1ResourceMessageFilter(logging.Filter):
-    """Removes TF2 warning for using TF1 model which has resources."""
-    def filter(self, record):
-        msg = 'is a problem, consider rebuilding the SavedModel after ' + \
-              'running tf.compat.v1.enable_resource_variables()'
-        return msg not in record.getMessage()
-
-
-tf_logger = logging.getLogger('tensorflow')
-tf_logger.addFilter(NoV1ResourceMessageFilter())
-
-
 class CharacterLevelCnnModel(BaseTrainableModel,
                              metaclass=AutoSubRegistrationMeta):
 
@@ -257,6 +253,7 @@ def load_from_disk(cls, dirpath):
         :type dirpath: str
         :return: None
         """
+
         # load parameters
         model_param_dirpath = os.path.join(dirpath, "model_parameters.json")
         with open(model_param_dirpath, 'r') as fp:
@@ -268,6 +265,8 @@ def load_from_disk(cls, dirpath):
             label_mapping = json.load(fp)
 
         # load tf model
+        # Use TFA to add f1 score to output
+        import tensorflow_addons as tfa
         custom_objects = {
             "F1Score": tfa.metrics.F1Score(
                 num_classes=max(label_mapping.values()) + 1,
@@ -314,6 +313,7 @@ def _char_encoding_layer(input_str_tensor, max_char_encoding_id, max_len):
         :return : tensor containing encoded list of input sentences
         :rtype: tf.Tensor
         """
+
         # convert characters to indices
         input_str_flatten = tf.reshape(input_str_tensor, [-1])
         sentences_encode = tf.strings.unicode_decode(input_str_flatten,
@@ -399,7 +399,7 @@ def _construct_model(self):
         num_labels = self.num_labels
         default_ind = self.label_mapping[self._parameters['default_label']]
 
-        #Reset model
+        # Reset model
         tf.keras.backend.clear_session()
 
         # generate glove embedding
@@ -487,8 +487,10 @@ def encoding_function(input_str):
         softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]
         losses = {softmax_output_layer_name: "categorical_crossentropy"}
 
-        f1_score_training = tfa.metrics.F1Score(num_classes=num_labels,
-                                                average='micro')
+        # Use TFA to add f1 score to output
+        import tensorflow_addons as tfa
+        f1_score_training = tfa.metrics.F1Score(
+            num_classes=num_labels, average='micro')
         metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
 
         self._model.compile(loss=losses,
@@ -515,6 +517,7 @@ def _reconstruct_model(self):
         :return: None
         """
 
+        # Reset model
         tf.keras.backend.clear_session()
 
         num_labels = self.num_labels
@@ -547,6 +550,8 @@ def _reconstruct_model(self):
         softmax_output_layer_name = self._model.outputs[0].name.split('/')[0]
         losses = {softmax_output_layer_name: "categorical_crossentropy"}
 
+        # Use TFA to add f1 score to output
+        import tensorflow_addons as tfa
         f1_score_training = tfa.metrics.F1Score(
             num_classes=num_labels, average='micro')
         metrics = {softmax_output_layer_name: ['acc', f1_score_training]}
2 changes: 1 addition & 1 deletion dataprofiler/labelers/data_labelers.py
@@ -64,7 +64,7 @@ class StructuredDataLabeler(BaseDataLabeler):
 
 
 class DataLabeler(object):
-
+
     labeler_classes = dict(
         structured=StructuredDataLabeler,
         unstructured=UnstructuredDataLabeler,
6 changes: 4 additions & 2 deletions dataprofiler/labelers/data_processing.py
@@ -10,7 +10,6 @@
 import math
 
 import numpy as np
-import tensorflow as tf
 
 default_labeler_dir = pkg_resources.resource_filename(
     'resources', 'labelers'
@@ -613,6 +612,9 @@ def process(self, data, labels=None, label_mapping=None, batch_size=32):
             raise ValueError('If `labels` are specified, `label_mapping` must '
                              'also be specified.')
 
+        # Import tensorflow
+        import tensorflow as tf
+
         # get parameters
         max_length = self._parameters['max_length']
         default_label = self._parameters['default_label']
@@ -626,9 +628,9 @@ def process(self, data, labels=None, label_mapping=None, batch_size=32):
             # Convert to necessary training data format.
             X_train = np.array(
                 [[sentence] for sentence in batch_data['samples']])
-
             if labels is not None:
                 num_classes = max(label_mapping.values()) + 1
+
                 Y_train = tf.keras.utils.to_categorical(
                     batch_data['labels'], num_classes)
             yield X_train, Y_train
42 changes: 31 additions & 11 deletions dataprofiler/profilers/column_profile_compilers.py
@@ -43,20 +43,40 @@ def _create_profile(self, df_series, options=None):
         # convert all the values to string
         df_series = df_series.apply(str)
 
-        selected_columns = None
+        selected_col_profiles = None
         if options and isinstance(options, StructuredOptions):
-            selected_columns = options.enabled_columns
+            selected_col_profiles = options.enabled_columns
 
-        for column_type in self._profilers:
+        for col_profile_type in self._profilers:
             # Create profile if options allow for it or if there are no options
-            if selected_columns is None or \
-                    column_type.col_type in selected_columns:
-                column_options = None
-                if options and options.properties[column_type.col_type]:
-                    column_options = options.properties[column_type.col_type]
-                self._profiles[column_type.col_type] = \
-                    column_type(df_series.name, options=column_options)
-                self._profiles[column_type.col_type].update(df_series)
+            if selected_col_profiles is None or \
+                    col_profile_type.col_type in selected_col_profiles:
+                col_profile_options = None
+                if options and options.properties[col_profile_type.col_type]:
+                    col_profile_options = options.properties[col_profile_type.col_type]
+
+                try:
+                    self._profiles[col_profile_type.col_type] = \
+                        col_profile_type(df_series.name, options=col_profile_options)
+                    self._profiles[col_profile_type.col_type].update(df_series)
+                except Exception as e:
+                    import warnings
+                    warning_msg = "\n\n!!! WARNING Partial Profiler Failure !!!\n\n"
+                    warning_msg += "Profiling Type: {}".format(col_profile_type.col_type)
+                    warning_msg += "\nException: {}".format(type(e).__name__)
+                    warning_msg += "\nMessage: {}".format(e)
+
+                    # This is considered a major error
+                    if type(e).__name__ == "ValueError":
+                        raise ValueError(e)
+
+                    warning_msg += "\n\nFor labeler errors, try installing "
+                    warning_msg += "tensorflow and tensorflow-addons; command:\n"
+                    warning_msg += "    $ pip install tensorflow "
+                    warning_msg += "tensorflow-addons --user\n"
+
+                    warnings.warn(warning_msg, RuntimeWarning, stacklevel=2)
+
 
     def __add__(self, other):
         """
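The try/except above isolates a single failing column profiler so the rest of the profile still completes; only ValueError is treated as fatal. A self-contained sketch of the same failure-isolation pattern (the `run_profilers` helper is hypothetical, not library code):

    import warnings

    def run_profilers(profilers, data):
        # One profiler crashing is downgraded to a RuntimeWarning;
        # ValueError is still considered a major error and re-raised.
        results = {}
        for name, profile_fn in profilers.items():
            try:
                results[name] = profile_fn(data)
            except ValueError:
                raise
            except Exception as e:
                warnings.warn(
                    "Partial Profiler Failure in {}: {}".format(name, e),
                    RuntimeWarning, stacklevel=2)
        return results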
1 change: 1 addition & 0 deletions dataprofiler/profilers/profile_builder.py
@@ -329,6 +329,7 @@ def __init__(self, data, samples_per_update=None, min_true_samples=None,
         elif not isinstance(profiler_options, ProfilerOptions):
             raise ValueError("The profile options must be passed as a "
                              "ProfileOptions object.")
+
         profiler_options.validate()
         self.options = profiler_options
 
25 changes: 24 additions & 1 deletion dataprofiler/tests/test_data_profiler.py
@@ -52,7 +52,7 @@ def reload_data_profiler():
     """Recursively reload modules."""
     sys_modules = sys.modules.copy()
     for module_name, module in sys_modules.items():
-        # Only reload top level of the data_profiler
+        # Only reload top level of the dataprofiler
         if ('dataprofiler' in module_name and
                 len(module_name.split('.')) < 3):
             if isinstance(module, types.ModuleType):
@@ -79,6 +79,29 @@ def import_mock(name, *args):
                 '\tsudo apt-get -y install libsnappy-dev`\n',
             )
 
+    def test_no_tensorflow(self):
+        import sys
+        import importlib
+        import types
+        import pandas
+        orig_import = __import__
+        # necessary for any wrapper around the library to test if snappy caught
+        # as an issue
+
+        def import_mock(name, *args):
+            if name == 'tensorflow':
+                raise ImportError('test')
+            return orig_import(name, *args)
+
+        with mock.patch('builtins.__import__', side_effect=import_mock):
+
+            with self.assertWarns(RuntimeWarning) as w:
+                import dataprofiler
+                df = pandas.DataFrame([[1, 2.0],[1, 2.2],[-1, 3]])
+                profile = dataprofiler.Profiler(df)
+
+            warning_msg = "Partial Profiler Failure"
+            self.assertIn(warning_msg, str(w.warning))
+
 
 if __name__ == '__main__':
     unittest.main()
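The new test pins down the user-visible behavior: in an environment without TensorFlow, profiling a DataFrame emits a RuntimeWarning for the labeler instead of crashing. A usage sketch of that behavior (assumes tensorflow is absent; with it installed, no warning is raised):

    import warnings
    import pandas as pd
    import dataprofiler as dp

    df = pd.DataFrame([[1, 2.0], [1, 2.2], [-1, 3]])
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        profile = dp.Profiler(df)  # statistical profiles still run
    for w in caught:
        if issubclass(w.category, RuntimeWarning):
            print(str(w.message))  # contains "Partial Profiler Failure"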