From 998d7e5833cb25745c0aa63c1534f15513559cc4 Mon Sep 17 00:00:00 2001 From: lettergram Date: Tue, 16 Mar 2021 13:49:48 -0500 Subject: [PATCH 1/6] improved hashing functionality --- dataprofiler/profilers/profile_builder.py | 18 +++++++----------- dataprofiler/tests/.#test_data_profiler.py | 1 + 2 files changed, 8 insertions(+), 11 deletions(-) create mode 120000 dataprofiler/tests/.#test_data_profiler.py diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index a29c1e62c..a2aa7a5e7 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -446,24 +446,20 @@ def _update_row_statistics(self, data): :param data: a dataset :type data: pandas.DataFrame """ - for index, row in data.iterrows(): - # Hash the row and stores it in the dict, count keys for unique rows - hashed_row = hashlib.sha256( - row.to_string().strip().encode()).hexdigest() - self.hashed_row_dict[hashed_row] = True + hashed_rows = pd.util.hash_pandas_object(data, index=False).values + idx = 0 + while idx < len(hashed_rows): + + self.hashed_row_dict[hashed_rows[idx]] = True # check if null in row, if any add count - if row.isnull().any(): + if data.iloc[idx].isnull().any(): self.null_in_row_count += 1 # Used for ratios, total ingested rows self.rows_ingested += 1 - - # Determines null count, transposes column major to row major - # Any major returns true if null and sums total count of trues - # This is done quickly and with minimal transform(s) - # self.null_in_row_count = df.isnull().T.any().sum() + idx += 1 def update_profile(self, data, sample_size=None, min_true_samples=None): """ diff --git a/dataprofiler/tests/.#test_data_profiler.py b/dataprofiler/tests/.#test_data_profiler.py new file mode 120000 index 000000000..794c59fbb --- /dev/null +++ b/dataprofiler/tests/.#test_data_profiler.py @@ -0,0 +1 @@ +austin@Batcave.535153:1613875910 \ No newline at end of file From 
fcc2c26a950c0511d6b0eb4552cea83fa25e7db7 Mon Sep 17 00:00:00 2001 From: lettergram Date: Tue, 16 Mar 2021 13:50:05 -0500 Subject: [PATCH 2/6] improved unique detection --- dataprofiler/tests/.#test_data_profiler.py | 1 - 1 file changed, 1 deletion(-) delete mode 120000 dataprofiler/tests/.#test_data_profiler.py diff --git a/dataprofiler/tests/.#test_data_profiler.py b/dataprofiler/tests/.#test_data_profiler.py deleted file mode 120000 index 794c59fbb..000000000 --- a/dataprofiler/tests/.#test_data_profiler.py +++ /dev/null @@ -1 +0,0 @@ -austin@Batcave.535153:1613875910 \ No newline at end of file From 06a25c44daaeeebe505211b1e48cb612e5f5c620 Mon Sep 17 00:00:00 2001 From: lettergram Date: Tue, 16 Mar 2021 14:23:01 -0500 Subject: [PATCH 3/6] updated to use range generator --- dataprofiler/profilers/profile_builder.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index a2aa7a5e7..452db9138 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -446,10 +446,8 @@ def _update_row_statistics(self, data): :param data: a dataset :type data: pandas.DataFrame """ - hashed_rows = pd.util.hash_pandas_object(data, index=False).values - idx = 0 - while idx < len(hashed_rows): + for idx in range(len(hashed_rows)): self.hashed_row_dict[hashed_rows[idx]] = True @@ -459,7 +457,6 @@ def _update_row_statistics(self, data): # Used for ratios, total ingested rows self.rows_ingested += 1 - idx += 1 def update_profile(self, data, sample_size=None, min_true_samples=None): """ From 3d2f69b529169dfd508e32c4b3f5c5e38caf5f9a Mon Sep 17 00:00:00 2001 From: lettergram Date: Tue, 16 Mar 2021 15:27:17 -0500 Subject: [PATCH 4/6] much faster --- dataprofiler/profilers/profile_builder.py | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/dataprofiler/profilers/profile_builder.py
b/dataprofiler/profilers/profile_builder.py index 452db9138..b6e99ecb4 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -439,25 +439,20 @@ def _get_duplicate_row_count(self): def _update_row_statistics(self, data): """ Iterate over the provided dataset row by row and calculate - the row statistics. Specificaly, number of unique rows, + the row statistics. Specifically, number of unique rows, rows containing null values, and total rows reviewed. This function is safe to use in batches. :param data: a dataset :type data: pandas.DataFrame """ - hashed_rows = pd.util.hash_pandas_object(data, index=False).values - for idx in range(len(hashed_rows)): - - self.hashed_row_dict[hashed_rows[idx]] = True - - # check if null in row, if any add count - if data.iloc[idx].isnull().any(): - self.null_in_row_count += 1 - - # Used for ratios, total ingested rows - self.rows_ingested += 1 - + + self.rows_ingested = len(data) + self.hashed_row_dict = dict.fromkeys( + pd.util.hash_pandas_object(data, index=False).drop_duplicates(), True + ) + self.null_in_row_count = data.isnull().any(axis=1).sum() + def update_profile(self, data, sample_size=None, min_true_samples=None): """ Update the profile for data provided. 
User can specify the sample From 14fc61299e3e6bf45b65e26a6abeaa985c9e1a7b Mon Sep 17 00:00:00 2001 From: lettergram Date: Tue, 16 Mar 2021 15:30:31 -0500 Subject: [PATCH 5/6] remove duplicates --- dataprofiler/profilers/profile_builder.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py index b6e99ecb4..0c5d9c98f 100644 --- a/dataprofiler/profilers/profile_builder.py +++ b/dataprofiler/profilers/profile_builder.py @@ -449,7 +449,7 @@ def _update_row_statistics(self, data): self.rows_ingested = len(data) self.hashed_row_dict = dict.fromkeys( - pd.util.hash_pandas_object(data, index=False).drop_duplicates(), True + pd.util.hash_pandas_object(data, index=False), True ) self.null_in_row_count = data.isnull().any(axis=1).sum() From 18e9d885d6572543bfc2b4e115c2e2b6ebd929b6 Mon Sep 17 00:00:00 2001 From: Austin Walters Date: Tue, 16 Mar 2021 15:56:50 -0500 Subject: [PATCH 6/6] Update version.py --- dataprofiler/version.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dataprofiler/version.py b/dataprofiler/version.py index 6564117be..c69dda854 100644 --- a/dataprofiler/version.py +++ b/dataprofiler/version.py @@ -4,7 +4,7 @@ MAJOR = 0 MINOR = 3 -MICRO = 4 +MICRO = 5 VERSION = '%d.%d.%d' % (MAJOR, MINOR, MICRO)