Skip to content

Commit

Permalink
Add diff to TextColumn (capitalone#301)
Browse files Browse the repository at this point in the history
* Add text diff and tests

* Update parent call to diff

* Simplify tests

* Simplified vocab test check
  • Loading branch information
Andrew Yin authored Jun 30, 2021
1 parent 47f4030 commit e812bca
Show file tree
Hide file tree
Showing 2 changed files with 51 additions and 0 deletions.
15 changes: 15 additions & 0 deletions dataprofiler/profilers/text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,21 @@ def profile(self):
)
return profile

def diff(self, other_profile, options=None):
    """
    Computes the differences between this text column profile and another.

    The numeric statistic differences come from NumericStatsMixin.diff;
    a "vocab" entry comparing the two vocabularies is then added to them.

    :param other_profile: profile to find the difference with
    :type other_profile: TextColumn Profile
    :param options: forwarded unchanged to NumericStatsMixin.diff
    :return: the text columns differences, including the "vocab" entry
    :rtype: dict
    """
    # Call the mixin explicitly to obtain the numeric portion of the diff.
    text_diff = NumericStatsMixin.diff(self, other_profile, options)
    text_diff["vocab"] = utils.find_diff_of_lists_and_sets(
        self.vocab, other_profile.vocab
    )
    return text_diff

@property
def data_type_ratio(self):
"""
Expand Down
36 changes: 36 additions & 0 deletions dataprofiler/tests/profilers/test_text_column_profile.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@
import pandas as pd
import numpy as np

from dataprofiler.profilers import utils
from dataprofiler.tests.profilers import utils as test_utils
from dataprofiler.profilers import TextColumn
from dataprofiler.profilers.profiler_options import TextOptions
Expand Down Expand Up @@ -428,3 +429,38 @@ def test_histogram_option_integration(self):

histogram, _ = num_profiler._histogram_for_profile('custom')
self.assertEqual(100, len(histogram['bin_counts']))

def test_diff(self):
    """
    Checks that TextColumn.diff returns the expected numeric statistic
    differences and vocab difference for two profiled text columns.
    """
    df = pd.Series(
        ["abcd", "aa", "abcd", "aa", "b", "4", "3", "2", "dfd", "2"]
    ).apply(str)

    df2 = pd.Series(
        ["hello", "my", "name", "is", "Grant", "I", "have", "67", "dogs"]
    ).apply(str)

    profiler = TextColumn(df.name)
    profiler.update(df)
    profile1 = profiler.profile

    profiler2 = TextColumn(df2.name)
    profiler2.update(df2)
    profile2 = profiler2.profile

    # NOTE(review): the hard-coded 'max' and 'sum' values are consistent
    # with statistics taken over string lengths (longest 4 vs 5, total
    # 20 vs 29) -- confirm against TextColumn.update.
    # Every derived entry is read from the profile report dicts so the
    # expectation does not depend on indexing the profiler object itself.
    expected_diff = {
        'min': "unchanged",
        'max': -1.0,
        'sum': -9.0,
        'mean': profile1['mean'] - profile2['mean'],
        'variance': profile1['variance'] - profile2['variance'],
        'stddev': profile1['stddev'] - profile2['stddev'],
        'vocab': utils.find_diff_of_lists_and_sets(
            profile1['vocab'], profile2['vocab']),
    }
    diff = profiler.diff(profiler2)
    self.assertDictEqual(expected_diff, diff)

0 comments on commit e812bca

Please sign in to comment.