From a389c4aa37df202af60f9e267410dc8bd6aeffdb Mon Sep 17 00:00:00 2001
From: Pee Tankulrat <{ID}+{username}@users.noreply.github.com>
Date: Tue, 22 Nov 2022 15:58:18 +0700
Subject: [PATCH] Merge drop and redact with replace

---
 README.md                                     | 15 ++++---
 pii-anonymizer.json                           |  8 ++--
 pii_anonymizer/common/config_validator.py     |  2 +-
 .../spark/analyze/detectors/pii_detector.py   | 12 +++---
 pii_anonymizer/spark/anonymize/anonymizer.py  | 14 +------
 .../anonymize/tests/test_redact_anonymizer.py | 40 -------------------
 ...onymizer.py => test_replace_anonymizer.py} | 19 +++++----
 .../analyze/detectors/pii_detector.py         | 10 ++---
 .../standalone/anonymize/anonymizer.py        | 10 +----
 .../anonymize/tests/test_redact_anonymizer.py | 20 ----------
 ...onymizer.py => test_replace_anonymizer.py} | 14 +++----
 .../standalone/tests/config/test_config.json  |  2 +-
 12 files changed, 47 insertions(+), 119 deletions(-)
 delete mode 100644 pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py
 rename pii_anonymizer/spark/anonymize/tests/{test_drop_anonymizer.py => test_replace_anonymizer.py} (59%)
 delete mode 100644 pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py
 rename pii_anonymizer/standalone/anonymize/tests/{test_drop_anonymizer.py => test_replace_anonymizer.py} (53%)
diff --git a/README.md b/README.md
index 679d486..cf5c5c3 100644
--- a/README.md
+++ b/README.md
@@ -22,8 +22,9 @@ The framework aims to work on a two-fold principle for detecting PII:
    * [x] FIN/NRIC : A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card.
 
  * Following anonymizers have been added
-    * [x] Redaction: Deletes all or part of a detected sensitive value.
-    * [x] Encryption :  Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
+    * [x] Replacement ('replace'): Replaces a detected sensitive value with a specified surrogate value. Leave the value empty to simply delete the detected sensitive value.
+    * [x] Hash ('hash'): Hashes a detected sensitive value with SHA-256.
+
 
 ### TO-DO
 Following features  are part of the backlog with more features coming soon
@@ -31,10 +32,10 @@ Following features  are part of the backlog with more features coming soon
     * [ ] NAME
     * [ ] ADDRESS
  * Anonymizers:
+    * [ ] Encryption :  Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
     * [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*).
     * [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range,
     or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.")
-    * [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value.
 
 
 You can have a detailed at upcoming features and backlog in this [Github Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true)
@@ -53,13 +54,15 @@ An example for the config JSON is located at `<PROJECT_ROOT>/pii-anonymizer.json
   },
   "analyze": {
 
+  },
+  "anonymize": {
+    "mode": <replace|hash>,
+    "value": <REPLACEMENT STRING FOR 'replace' MODE; DEFAULTS TO "">,
+    "output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
   },
   "report" : {
     "location" : <PATH TO YOUR REPORT OUTPUT FOLDER>,
     "level" : <LOG LEVEL>
-  },
-  "anonymize": {
-    "output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
   }
 }
 ```
diff --git a/pii-anonymizer.json b/pii-anonymizer.json
index c4f73d3..47eb0c8 100644
--- a/pii-anonymizer.json
+++ b/pii-anonymizer.json
@@ -4,12 +4,12 @@
     "delimiter": ","
   },
   "analyze": {},
+  "anonymize": {
+    "mode": "replace",
+    "output_file_path": "./output"
+  },
   "report": {
     "location": "./output",
     "level": "medium"
-  },
-  "anonymize": {
-    "mode": "hash",
-    "output_file_path": "./output"
   }
 }
diff --git a/pii_anonymizer/common/config_validator.py b/pii_anonymizer/common/config_validator.py
index 1f78952..0310fdb 100644
--- a/pii_anonymizer/common/config_validator.py
+++ b/pii_anonymizer/common/config_validator.py
@@ -1,6 +1,6 @@
 from pii_anonymizer.common.constants import ANONYMIZE
 
-anonymize_mode = ["redact", "drop", "hash"]
+anonymize_mode = ["replace", "hash"]
 anonymize_mode_err_msg = f"{ANONYMIZE}'s mode must be {' or '.join(anonymize_mode)}"
 
 
diff --git a/pii_anonymizer/spark/analyze/detectors/pii_detector.py b/pii_anonymizer/spark/analyze/detectors/pii_detector.py
index bb8d83e..803f862 100644
--- a/pii_anonymizer/spark/analyze/detectors/pii_detector.py
+++ b/pii_anonymizer/spark/analyze/detectors/pii_detector.py
@@ -86,14 +86,12 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
         column = input_data_frame.columns
 
         mode = self.config[ANONYMIZE].get("mode")
+        value = self.config[ANONYMIZE].get("value", "")
+
         match mode:
-            case "drop":
-                result = input_data_frame.rdd.map(
-                    lambda row: Anonymizer.drop(row, pii_list)
-                ).toDF(column)
-            case "redact":
+            case "replace":
                 result = input_data_frame.rdd.map(
-                    lambda row: Anonymizer.redact(row, pii_list)
+                    lambda row: Anonymizer.replace(row, value, pii_list)
                 ).toDF(column)
             case "hash":
                 result = input_data_frame.rdd.map(
@@ -101,7 +99,7 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
                 ).toDF(column)
             case _:
                 result = input_data_frame.rdd.map(
-                    lambda row: Anonymizer.drop(row, pii_list)
+                    lambda row: Anonymizer.replace(row, value, pii_list)
                 ).toDF(column)
 
         return result
diff --git a/pii_anonymizer/spark/anonymize/anonymizer.py b/pii_anonymizer/spark/anonymize/anonymizer.py
index 14bb492..2a9946a 100644
--- a/pii_anonymizer/spark/anonymize/anonymizer.py
+++ b/pii_anonymizer/spark/anonymize/anonymizer.py
@@ -3,22 +3,12 @@
 
 class Anonymizer:
     @staticmethod
-    def drop(row, pii_list):
+    def replace(row, replace_string, pii_list):
         new_row = []
         for cell in row:
             for word in pii_list:
                 if word in cell:
-                    cell = cell.replace(word, "")
-            new_row.append(cell)
-        return new_row
-
-    @staticmethod
-    def redact(row, pii_list):
-        new_row = []
-        for cell in row:
-            for word in pii_list:
-                if word in cell:
-                    cell = cell.replace(word, "[Redacted]")
+                    cell = cell.replace(word, replace_string)
             new_row.append(cell)
         return new_row
 
diff --git a/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py b/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py
deleted file mode 100644
index d61cacd..0000000
--- a/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py
+++ /dev/null
@@ -1,40 +0,0 @@
-from unittest import TestCase
-from pyspark.sql import SparkSession
-from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer
-
-
-class TestRedactAnonymizer(TestCase):
-    def setUp(self) -> None:
-        self.SPARK = (
-            SparkSession.builder.master("local")
-            .appName("Test PIIDetector")
-            .getOrCreate()
-        )
-
-    def test_redact_for_single_analyzer_result(self):
-        test_data_frame = self.SPARK.createDataFrame(
-            [("text containing pii", "something else")]
-        )
-
-        analyzer_results = ["pii"]
-        result = test_data_frame.rdd.map(
-            lambda row: Anonymizer.redact(row, analyzer_results)
-        ).toDF()
-
-        actual = result.collect()[0][0]
-
-        self.assertEqual(actual, "text containing [Redacted]")
-
-    def test_redact_for_multiple_analyzer_results(self):
-        test_data_frame = self.SPARK.createDataFrame(
-            [("text containing pii1 and pii2", "something else")]
-        )
-        analyzer_results = ["pii1", "pii2"]
-
-        result = test_data_frame.rdd.map(
-            lambda row: Anonymizer.redact(row, analyzer_results)
-        ).toDF()
-
-        actual = result.collect()[0][0]
-
-        self.assertEqual(actual, "text containing [Redacted] and [Redacted]")
diff --git a/pii_anonymizer/spark/anonymize/tests/test_drop_anonymizer.py b/pii_anonymizer/spark/anonymize/tests/test_replace_anonymizer.py
similarity index 59%
rename from pii_anonymizer/spark/anonymize/tests/test_drop_anonymizer.py
rename to pii_anonymizer/spark/anonymize/tests/test_replace_anonymizer.py
index 0410e95..90e1b13 100644
--- a/pii_anonymizer/spark/anonymize/tests/test_drop_anonymizer.py
+++ b/pii_anonymizer/spark/anonymize/tests/test_replace_anonymizer.py
@@ -3,7 +3,7 @@
 from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer
 
 
-class TestDropAnonymizer(TestCase):
+class TestReplaceAnonymizer(TestCase):
     def setUp(self) -> None:
         self.SPARK = (
             SparkSession.builder.master("local")
@@ -11,30 +11,33 @@ def setUp(self) -> None:
             .getOrCreate()
         )
 
-    def test_drop_for_single_analyzer_result(self):
+    def test_replace_for_single_analyzer_result(self):
+        replace_string = "[REPLACED]"
         test_data_frame = self.SPARK.createDataFrame(
             [("text containing pii", "something else")]
         )
-
         analyzer_results = ["pii"]
         result = test_data_frame.rdd.map(
-            lambda row: Anonymizer.drop(row, analyzer_results)
+            lambda row: Anonymizer.replace(row, replace_string, analyzer_results)
         ).toDF()
 
         actual = result.collect()[0][0]
 
-        self.assertEqual(actual, "text containing ")
+        self.assertEqual(actual, f"text containing {replace_string}")
 
-    def test_drop_for_multiple_analyzer_results(self):
+    def test_replace_for_multiple_analyzer_results(self):
+        replace_string = "[REPLACED]"
         test_data_frame = self.SPARK.createDataFrame(
             [("text containing pii1 and pii2", "something else")]
         )
         analyzer_results = ["pii1", "pii2"]
 
         result = test_data_frame.rdd.map(
-            lambda row: Anonymizer.drop(row, analyzer_results)
+            lambda row: Anonymizer.replace(row, replace_string, analyzer_results)
         ).toDF()
 
         actual = result.collect()[0][0]
 
-        self.assertEqual(actual, "text containing  and ")
+        self.assertEqual(
+            actual, f"text containing {replace_string} and {replace_string}"
+        )
diff --git a/pii_anonymizer/standalone/analyze/detectors/pii_detector.py b/pii_anonymizer/standalone/analyze/detectors/pii_detector.py
index 50ef69a..a7bee56 100644
--- a/pii_anonymizer/standalone/analyze/detectors/pii_detector.py
+++ b/pii_anonymizer/standalone/analyze/detectors/pii_detector.py
@@ -49,15 +49,15 @@ def analyze_and_anonymize(self, text: str):
             analyzer_results = analyzer_results + detector.execute(text)
 
         mode = self.config[ANONYMIZE].get("mode")
+        value = self.config[ANONYMIZE].get("value", "")
+
         match mode:
-            case "drop":
-                redacted_text = Anonymizer.drop(text, analyzer_results)
-            case "redact":
-                redacted_text = Anonymizer.redact(text, analyzer_results)
+            case "replace":
+                redacted_text = Anonymizer.replace(text, value, analyzer_results)
             case "hash":
                 redacted_text = Anonymizer.hash(text, analyzer_results)
             case _:
-                redacted_text = Anonymizer.drop(text, analyzer_results)
+                redacted_text = Anonymizer.replace(text, value, analyzer_results)
 
         return AnonymizerResult(redacted_text, analyzer_results)
 
diff --git a/pii_anonymizer/standalone/anonymize/anonymizer.py b/pii_anonymizer/standalone/anonymize/anonymizer.py
index f835bb5..e4126d6 100644
--- a/pii_anonymizer/standalone/anonymize/anonymizer.py
+++ b/pii_anonymizer/standalone/anonymize/anonymizer.py
@@ -5,15 +5,9 @@
 
 class Anonymizer:
     @staticmethod
-    def drop(text: str, analyzer_results: [AnalyzerResult]):
+    def replace(text: str, replace_string: str, analyzer_results: [AnalyzerResult]):
         for result in analyzer_results:
-            text = text.replace(result.text, "")
-        return text
-
-    @staticmethod
-    def redact(text: str, analyzer_results: [AnalyzerResult]):
-        for result in analyzer_results:
-            text = text.replace(result.text, "[Redacted]")
+            text = text.replace(result.text, replace_string)
         return text
 
     @staticmethod
diff --git a/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py b/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py
deleted file mode 100644
index a936e43..0000000
--- a/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py
+++ /dev/null
@@ -1,20 +0,0 @@
-from unittest import TestCase
-from pii_anonymizer.standalone.anonymize.anonymizer import Anonymizer
-from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult
-
-
-class TestRedactAnonymizer(TestCase):
-    def test_redact_for_single_analyzer_result(self):
-        text = "text containing pii"
-        analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
-        result = Anonymizer.redact(text, analyzer_results)
-        self.assertEqual(result, "text containing [Redacted]")
-
-    def test_redact_for_multiple_analyzer_results(self):
-        text = "text containing pii1 and pii2"
-        analyzer_results = [
-            AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
-            AnalyzerResult("pii2", "PII_DETECTOR", 25, 28),
-        ]
-        result = Anonymizer.redact(text, analyzer_results)
-        self.assertEqual(result, "text containing [Redacted] and [Redacted]")
diff --git a/pii_anonymizer/standalone/anonymize/tests/test_drop_anonymizer.py b/pii_anonymizer/standalone/anonymize/tests/test_replace_anonymizer.py
similarity index 53%
rename from pii_anonymizer/standalone/anonymize/tests/test_drop_anonymizer.py
rename to pii_anonymizer/standalone/anonymize/tests/test_replace_anonymizer.py
index aee6ade..ed3992b 100644
--- a/pii_anonymizer/standalone/anonymize/tests/test_drop_anonymizer.py
+++ b/pii_anonymizer/standalone/anonymize/tests/test_replace_anonymizer.py
@@ -3,18 +3,18 @@
 from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult
 
 
-class TestDropAnonymizer(TestCase):
-    def test_drop_for_single_analyzer_result(self):
+class TestReplaceAnonymizer(TestCase):
+    def test_Replace_for_single_analyzer_result(self):
         text = "text containing pii"
         analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
-        result = Anonymizer.drop(text, analyzer_results)
-        self.assertEqual(result, "text containing ")
+        result = Anonymizer.replace(text, "[REPLACED]", analyzer_results)
+        self.assertEqual(result, "text containing [REPLACED]")
 
-    def test_drop_for_multiple_analyzer_results(self):
+    def test_Replace_for_multiple_analyzer_results(self):
         text = "text containing pii1 and pii2"
         analyzer_results = [
             AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
             AnalyzerResult("pii2", "PII_DETECTOR", 25, 28),
         ]
-        result = Anonymizer.drop(text, analyzer_results)
-        self.assertEqual(result, "text containing  and ")
+        result = Anonymizer.replace(text, "[REPLACED]", analyzer_results)
+        self.assertEqual(result, "text containing [REPLACED] and [REPLACED]")
diff --git a/pii_anonymizer/standalone/tests/config/test_config.json b/pii_anonymizer/standalone/tests/config/test_config.json
index 1c17f4e..3924927 100644
--- a/pii_anonymizer/standalone/tests/config/test_config.json
+++ b/pii_anonymizer/standalone/tests/config/test_config.json
@@ -5,7 +5,7 @@
   },
   "analyze": {},
   "anonymize": {
-    "mode": "redact",
+    "mode": "replace",
     "output_file_path": "/Users/wisuchoi/Documents/anonymizer/output"
   },
   "report": {