fybrik · roytman · Jan 13, 2022 · Mar 14, 2021 · Mar 31, 2021 · May 4, 2021
diff --git a/afm/pep/actions.py b/afm/pep/actions.py
@@ -4,6 +4,7 @@
 #
 import pandas as pd
 import pyarrow as pa
+import hashlib
 
 from .base import Action
 
@@ -71,3 +72,45 @@ def schema(self, original):
         columns = [column for column in self.columns if column in original.names]
         self._schema = pa.schema([pa.field(c, original.field(c).type) for c in columns])
         return self._schema
+
+
+class HashRedact(Action):
+    """Action that replaces values of the selected columns with their hash digest."""
+
+    # Supported values of the "algo" option (matched case-insensitively).
+    _HASHERS = {"md5": hashlib.md5, "sha256": hashlib.sha256, "sha512": hashlib.sha512}
+
+    def __init__(self, description, columns, options):
+        super().__init__(description, columns, options)
+        # md5 is used when options is None or carries no "algo" entry
+        self.hash_algo = "md5" if options is None else options.get("algo", "md5")
+
+    def __call__(self, records: pa.RecordBatch) -> pa.RecordBatch:
+        """Transformation logic for HashRedact action.
+
+        Args:
+            records (pa.RecordBatch): record batch to transform
+
+        Returns:
+            pa.RecordBatch: batch with the selected columns as hex digest strings
+
+        Raises:
+            ValueError: if the configured algorithm is not supported
+        """
+        algo = self.hash_algo.lower()
+        if algo not in self._HASHERS:
+            raise ValueError(f"Algorithm {algo} is not supported!")
+        hasher = self._HASHERS[algo]
+        names = records.schema.names
+        new_columns = list(records.columns)
+        for i in (records.schema.get_field_index(c) for c in self.columns if c in names):
+            values = (v.as_py() for v in records.column(i))
+            digests = [None if v is None else hasher(v.encode()).hexdigest() for v in values]
+            # nulls stay null; explicit type because empty/all-null input infers as null
+            new_columns[i] = pa.array(digests, type=self.field_type())
+        return pa.RecordBatch.from_arrays(new_columns, schema=self.schema(records.schema))
+
+    def field_type(self):
+        """Overrides field_type to calculate transformed schema correctly."""
+        return pa.string()  # redacted value is a string
+
diff --git a/afm/pep/test_actions.py b/afm/pep/test_actions.py
@@ -7,6 +7,7 @@
 import pyarrow as pa
 
 from .actions import Redact
+from .actions import HashRedact
 
 class TestActions(unittest.TestCase):
 
@@ -23,6 +24,73 @@ def test_redact(self):
             self.assertEqual(result.schema.field("weight").type, pa.float64())
 
             self.assertEqual(result.to_pandas()["gender"][0], "XXX")
+            self.assertEqual(result.to_pandas()["age"][0], "XXX")
+
+    def test_hash_redact_md5(self):
+        # col2 must become md5 hex digests of string type; col1/col3 stay int64/float64.
+        frame = pd.DataFrame(
+            {'col1': [1, 2, 3], 'col2': ["abcdefghijklmnopqrstuvwxyz", "bcdefghijklmnopqrstuvwxyza", "cdefghijklmnopqrstuvwxyzab"], 'col3': [1.0, 2.0, 3.0]})
+        expected_types = {"col1": pa.int64(), "col2": pa.string(), "col3": pa.float64()}
+        expected_digests = ["c3fcd3d76192e4007dfb496cca67e13b", "07694ef19cf359bfd74556dc0cc7956d",
+                            "8dda2bba265b7478676bf9526e79c91c"]
+
+        action = HashRedact("Hash redact", columns=["col2"], options={"algo": "md5"})
+        for batch in pa.Table.from_pandas(frame).to_batches():
+            result = action(batch)
+            for name, expected_type in expected_types.items():
+                self.assertEqual(result.schema.field(name).type, expected_type)
+            for row, digest in enumerate(expected_digests):
+                self.assertEqual(result.to_pandas()["col2"][row], digest)
+
+    def test_hash_redact_default_algo(self):
+        # Without an "algo" option HashRedact falls back to md5, so md5 digests are expected.
+        df = pd.DataFrame(
+            {'col1': [1, 2, 3], 'col2': ["abcdefghijklmnopqrstuvwxyz", "bcdefghijklmnopqrstuvwxyza", "cdefghijklmnopqrstuvwxyzab"], 'col3': [1.0, 2.0, 3.0]})
+        table = pa.Table.from_pandas(df)
+
+        action = HashRedact("Hash redact", columns=["col2"], options={})
+        for record_batch in table.to_batches():
+            result = action(record_batch)
+            self.assertEqual(result.schema.field("col1").type, pa.int64())
+            self.assertEqual(result.schema.field("col2").type, pa.string())
+            self.assertEqual(result.schema.field("col3").type, pa.float64())
+            hashed = result.to_pandas()["col2"]
+            self.assertEqual(list(hashed), ["c3fcd3d76192e4007dfb496cca67e13b",
+                "07694ef19cf359bfd74556dc0cc7956d", "8dda2bba265b7478676bf9526e79c91c"])
+
+    def test_hash_redact_sha256(self):
+        # col2 must become sha256 hex digests of string type; col1/col3 stay int64/float64.
+        frame = pd.DataFrame(
+            {'col1': [1, 2, 3], 'col2': ["abcdefghijklmnopqrstuvwxyz", "bcdefghijklmnopqrstuvwxyza", "cdefghijklmnopqrstuvwxyzab"], 'col3': [1.0, 2.0, 3.0]})
+        expected_types = {"col1": pa.int64(), "col2": pa.string(), "col3": pa.float64()}
+        expected_digests = ["71c480df93d6ae2f1efad1447c66c9525e316218cf51fc8d9ed832f2daf18b73",
+                            "e40957dd33bd9da6053d78bea4da6c7cde1fac92614bfd03d8b0c422e021651c",
+                            "fa732dae244c6d0b946e096d05167539a4b6ec2cc72f13a86a7fd657ef523d07"]
+        action = HashRedact("Hash redact", columns=["col2"], options={"algo": "sha256"})
+        for batch in pa.Table.from_pandas(frame).to_batches():
+            result = action(batch)
+            for name, expected_type in expected_types.items():
+                self.assertEqual(result.schema.field(name).type, expected_type)
+            for row, digest in enumerate(expected_digests):
+                self.assertEqual(result.to_pandas()["col2"][row], digest)
+
+
+    def test_hash_redact_sha512(self):
+        # col2 must become sha512 hex digests of string type; col1/col3 stay int64/float64.
+        frame = pd.DataFrame(
+            {'col1': [1, 2, 3], 'col2': ["abcdefghijklmnopqrstuvwxyz", "bcdefghijklmnopqrstuvwxyza", "cdefghijklmnopqrstuvwxyzab"], 'col3': [1.0, 2.0, 3.0]})
+        expected_types = {"col1": pa.int64(), "col2": pa.string(), "col3": pa.float64()}
+        expected_digests = ["4dbff86cc2ca1bae1e16468a05cb9881c97f1753bce3619034898faa1aabe429955a1bf8ec483d7421fe3c1646613a59ed5441fb0f321389f77f48a879c7b1f1",
+                            "6cf15b5b147ed859119df308a3e22a3958ecf1056b9cab135a1ce722ec57f1b65a03983a183141db9cb68817d57fab964be3068fe05eac8ff3d5f24ca34c6524",
+                            "5d63cd2920fdbf1f67d2a55a7d5b792331f9e21cc9965419170176e98a221d3a68080225f0e781734304c1ef6f162dade36acf463b137e6767416c1c53fa845d"]
+        action = HashRedact("Hash redact", columns=["col2"], options={"algo": "sha512"})
+        for batch in pa.Table.from_pandas(frame).to_batches():
+            result = action(batch)
+            for name, expected_type in expected_types.items():
+                self.assertEqual(result.schema.field(name).type, expected_type)
+            for row, digest in enumerate(expected_digests):
+                self.assertEqual(result.to_pandas()["col2"][row], digest)
+
 
 if __name__ == '__main__':
-    unittest.main()
+    unittest.main()