From 998d7e5833cb25745c0aa63c1534f15513559cc4 Mon Sep 17 00:00:00 2001
From: lettergram <lettergramm@gmail.com>
Date: Tue, 16 Mar 2021 13:49:48 -0500
Subject: [PATCH 1/6] improved hashing functionality

---
 dataprofiler/profilers/profile_builder.py  | 18 +++++++-----------
 dataprofiler/tests/.#test_data_profiler.py |  1 +
 2 files changed, 8 insertions(+), 11 deletions(-)
 create mode 120000 dataprofiler/tests/.#test_data_profiler.py

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index a29c1e62c..a2aa7a5e7 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -446,24 +446,20 @@ def _update_row_statistics(self, data):
         :param data: a dataset
         :type data: pandas.DataFrame
         """
-        for index, row in data.iterrows():
 
-            # Hash the row and stores it in the dict, count keys for unique rows
-            hashed_row = hashlib.sha256(
-                row.to_string().strip().encode()).hexdigest()
-            self.hashed_row_dict[hashed_row] = True
+        hashed_rows = pd.util.hash_pandas_object(data, index=False).values
+        idx = 0
+        while idx < len(hashed_rows):
+            
+            self.hashed_row_dict[hashed_rows[idx]] = True
 
             # check if null in row, if any add count
-            if row.isnull().any():
+            if data.iloc[idx].isnull().any():
                 self.null_in_row_count += 1
 
             # Used for ratios, total ingested rows
             self.rows_ingested += 1
-
-        # Determines null count, transposes column major to row major
-        # Any major returns true if null and sums total count of trues
-        # This is done quickly and with minimal transform(s)
-        # self.null_in_row_count = df.isnull().T.any().sum()
+            idx += 1
 
     def update_profile(self, data, sample_size=None, min_true_samples=None):
         """
diff --git a/dataprofiler/tests/.#test_data_profiler.py b/dataprofiler/tests/.#test_data_profiler.py
new file mode 120000
index 000000000..794c59fbb
--- /dev/null
+++ b/dataprofiler/tests/.#test_data_profiler.py
@@ -0,0 +1 @@
+austin@Batcave.535153:1613875910
\ No newline at end of file

From fcc2c26a950c0511d6b0eb4552cea83fa25e7db7 Mon Sep 17 00:00:00 2001
From: lettergram <lettergramm@gmail.com>
Date: Tue, 16 Mar 2021 13:50:05 -0500
Subject: [PATCH 2/6] improved unique detection

---
 dataprofiler/tests/.#test_data_profiler.py | 1 -
 1 file changed, 1 deletion(-)
 delete mode 120000 dataprofiler/tests/.#test_data_profiler.py

diff --git a/dataprofiler/tests/.#test_data_profiler.py b/dataprofiler/tests/.#test_data_profiler.py
deleted file mode 120000
index 794c59fbb..000000000
--- a/dataprofiler/tests/.#test_data_profiler.py
+++ /dev/null
@@ -1 +0,0 @@
-austin@Batcave.535153:1613875910
\ No newline at end of file

From 06a25c44daaeeebe505211b1e48cb612e5f5c620 Mon Sep 17 00:00:00 2001
From: lettergram <lettergramm@gmail.com>
Date: Tue, 16 Mar 2021 14:23:01 -0500
Subject: [PATCH 3/6] updated to use range generator

---
 dataprofiler/profilers/profile_builder.py | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index a2aa7a5e7..452db9138 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -446,10 +446,8 @@ def _update_row_statistics(self, data):
         :param data: a dataset
         :type data: pandas.DataFrame
         """
-
         hashed_rows = pd.util.hash_pandas_object(data, index=False).values
-        idx = 0
-        while idx < len(hashed_rows):
+        for idx in range(len(hashed_rows)):
             
             self.hashed_row_dict[hashed_rows[idx]] = True
 
@@ -459,7 +457,6 @@ def _update_row_statistics(self, data):
 
             # Used for ratios, total ingested rows
             self.rows_ingested += 1
-            idx += 1
 
     def update_profile(self, data, sample_size=None, min_true_samples=None):
         """

From 3d2f69b529169dfd508e32c4b3f5c5e38caf5f9a Mon Sep 17 00:00:00 2001
From: lettergram <lettergramm@gmail.com>
Date: Tue, 16 Mar 2021 15:27:17 -0500
Subject: [PATCH 4/6] much faster row statistics via vectorized pandas operations

---
 dataprofiler/profilers/profile_builder.py | 21 ++++++++-------------
 1 file changed, 8 insertions(+), 13 deletions(-)

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index 452db9138..b6e99ecb4 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -439,25 +439,20 @@ def _get_duplicate_row_count(self):
     def _update_row_statistics(self, data):
         """
         Iterate over the provided dataset row by row and calculate
-        the row statistics. Specificaly, number of unique rows,
+        the row statistics. Specifically, number of unique rows,
         rows containing null values, and total rows reviewed. This
         function is safe to use in batches.
 
         :param data: a dataset
         :type data: pandas.DataFrame
         """
-        hashed_rows = pd.util.hash_pandas_object(data, index=False).values
-        for idx in range(len(hashed_rows)):
-            
-            self.hashed_row_dict[hashed_rows[idx]] = True
-
-            # check if null in row, if any add count
-            if data.iloc[idx].isnull().any():
-                self.null_in_row_count += 1
-
-            # Used for ratios, total ingested rows
-            self.rows_ingested += 1
-
+        
+        self.rows_ingested = len(data)
+        self.hashed_row_dict = dict.fromkeys(
+            pd.util.hash_pandas_object(data, index=False).drop_duplicates(), True
+        )
+        self.null_in_row_count = data.isnull().any(axis=1).sum()
+        
     def update_profile(self, data, sample_size=None, min_true_samples=None):
         """
         Update the profile for data provided. User can specify the sample

From 14fc61299e3e6bf45b65e26a6abeaa985c9e1a7b Mon Sep 17 00:00:00 2001
From: lettergram <lettergramm@gmail.com>
Date: Tue, 16 Mar 2021 15:30:31 -0500
Subject: [PATCH 5/6] remove redundant drop_duplicates call

---
 dataprofiler/profilers/profile_builder.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/profilers/profile_builder.py b/dataprofiler/profilers/profile_builder.py
index b6e99ecb4..0c5d9c98f 100644
--- a/dataprofiler/profilers/profile_builder.py
+++ b/dataprofiler/profilers/profile_builder.py
@@ -449,7 +449,7 @@ def _update_row_statistics(self, data):
         
         self.rows_ingested = len(data)
         self.hashed_row_dict = dict.fromkeys(
-            pd.util.hash_pandas_object(data, index=False).drop_duplicates(), True
+            pd.util.hash_pandas_object(data, index=False), True
         )
         self.null_in_row_count = data.isnull().any(axis=1).sum()
         

From 18e9d885d6572543bfc2b4e115c2e2b6ebd929b6 Mon Sep 17 00:00:00 2001
From: Austin Walters <lettergramm@gmail.com>
Date: Tue, 16 Mar 2021 15:56:50 -0500
Subject: [PATCH 6/6] Update version.py

---
 dataprofiler/version.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/dataprofiler/version.py b/dataprofiler/version.py
index 6564117be..c69dda854 100644
--- a/dataprofiler/version.py
+++ b/dataprofiler/version.py
@@ -4,7 +4,7 @@
 
 MAJOR               = 0
 MINOR               = 3
-MICRO               = 4
+MICRO               = 5
 
 VERSION             = '%d.%d.%d' % (MAJOR, MINOR, MICRO)