Merge drop and redact with replace

thoughtworks · Nov 22, 2022 · a389c4a · a389c4a
1 parent 3438947
commit a389c4a
Show file tree

Hide file tree

Showing 12 changed files with 47 additions and 119 deletions.
diff --git a/README.md b/README.md
@@ -22,19 +22,20 @@ The framework aims to work on a two-fold principle for detecting PII:
    * [x] FIN/NRIC : A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card.
 
  * The following anonymizers have been added
-    * [x] Redaction: Deletes all or part of a detected sensitive value.
-    * [x] Encryption :  Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
+    * [x] Replacement ('replace'): Replaces a detected sensitive value with a specified surrogate value. Leave the value empty to simply delete the detected sensitive value.
+    * [x] Hash ('hash'): Hashes a detected sensitive value with SHA-256.
+
 
 ### TO-DO
 The following features are part of the backlog, with more features coming soon
  * Detectors:
     * [ ] NAME
     * [ ] ADDRESS
  * Anonymizers:
+    * [ ] Encryption :  Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
     * [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*).
     * [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range,
     or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.")
-    * [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value.
 
 
 You can have a detailed look at upcoming features and the backlog on this [GitHub Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true)
@@ -53,13 +54,15 @@ An example for the config JSON is located at `<PROJECT_ROOT>/pii-anonymizer.json
   },
   "analyze": {
 
+  },
+  "anonymize": {
+    "mode": <replace|hash>,
+    "value": "string to replace detected PII with",
+    "output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
   },
   "report" : {
     "location" : <PATH TO YOUR REPORT OUTPUT FOLDER>,
     "level" : <LOG LEVEL>
-  },
-  "anonymize": {
-    "output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
   }
 }
 ```

diff --git a/pii-anonymizer.json b/pii-anonymizer.json
@@ -4,12 +4,12 @@
     "delimiter": ","
   },
   "analyze": {},
+  "anonymize": {
+    "mode": "replace",
+    "output_file_path": "./output"
+  },
   "report": {
     "location": "./output",
     "level": "medium"
-  },
-  "anonymize": {
-    "mode": "hash",
-    "output_file_path": "./output"
   }
 }
diff --git a/pii_anonymizer/common/config_validator.py b/pii_anonymizer/common/config_validator.py
@@ -1,6 +1,6 @@
 from pii_anonymizer.common.constants import ANONYMIZE
 
-anonymize_mode = ["redact", "drop", "hash"]
+anonymize_mode = ["replace", "hash"]
 anonymize_mode_err_msg = f"{ANONYMIZE}'s mode must be {' or '.join(anonymize_mode)}"
 
 

diff --git a/pii_anonymizer/spark/analyze/detectors/pii_detector.py b/pii_anonymizer/spark/analyze/detectors/pii_detector.py
@@ -86,22 +86,20 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
         column = input_data_frame.columns
 
         mode = self.config[ANONYMIZE].get("mode")
+        value = self.config[ANONYMIZE].get("value", "")
+
         match mode:
-            case "drop":
-                result = input_data_frame.rdd.map(
-                    lambda row: Anonymizer.drop(row, pii_list)
-                ).toDF(column)
-            case "redact":
+            case "replace":
                 result = input_data_frame.rdd.map(
-                    lambda row: Anonymizer.redact(row, pii_list)
+                    lambda row: Anonymizer.replace(row, value, pii_list)
                 ).toDF(column)
             case "hash":
                 result = input_data_frame.rdd.map(
                     lambda row: Anonymizer.hash(row, pii_list)
                 ).toDF(column)
             case _:
                 result = input_data_frame.rdd.map(
-                    lambda row: Anonymizer.drop(row, pii_list)
+                    lambda row: Anonymizer.replace(row, value, pii_list)
                 ).toDF(column)
 
         return result

diff --git a/pii_anonymizer/spark/anonymize/anonymizer.py b/pii_anonymizer/spark/anonymize/anonymizer.py
@@ -3,22 +3,12 @@
 
 class Anonymizer:
     @staticmethod
-    def drop(row, pii_list):
+    def replace(row, replace_string, pii_list):
         new_row = []
         for cell in row:
             for word in pii_list:
                 if word in cell:
-                    cell = cell.replace(word, "")
-            new_row.append(cell)
-        return new_row
-
-    @staticmethod
-    def redact(row, pii_list):
-        new_row = []
-        for cell in row:
-            for word in pii_list:
-                if word in cell:
-                    cell = cell.replace(word, "[Redacted]")
+                    cell = cell.replace(word, replace_string)
             new_row.append(cell)
         return new_row
 

diff --git a/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py b/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py
diff --git a/...k/anonymize/tests/test_drop_anonymizer.py → ...nonymize/tests/test_replace_anonymizer.py b/...k/anonymize/tests/test_drop_anonymizer.py → ...nonymize/tests/test_replace_anonymizer.py
@@ -3,38 +3,41 @@
 from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer
 
 
-class TestDropAnonymizer(TestCase):
+class TestReplaceAnonymizer(TestCase):
     def setUp(self) -> None:
         self.SPARK = (
             SparkSession.builder.master("local")
             .appName("Test PIIDetector")
             .getOrCreate()
         )
 
-    def test_drop_for_single_analyzer_result(self):
+    def test_replace_for_single_analyzer_result(self):
+        replace_string = "[REPLACED]"
         test_data_frame = self.SPARK.createDataFrame(
             [("text containing pii", "something else")]
         )
-
         analyzer_results = ["pii"]
         result = test_data_frame.rdd.map(
-            lambda row: Anonymizer.drop(row, analyzer_results)
+            lambda row: Anonymizer.replace(row, replace_string, analyzer_results)
         ).toDF()
 
         actual = result.collect()[0][0]
 
-        self.assertEqual(actual, "text containing ")
+        self.assertEqual(actual, f"text containing {replace_string}")
 
-    def test_drop_for_multiple_analyzer_results(self):
+    def test_replace_for_multiple_analyzer_results(self):
+        replace_string = "[REPLACED]"
         test_data_frame = self.SPARK.createDataFrame(
             [("text containing pii1 and pii2", "something else")]
         )
         analyzer_results = ["pii1", "pii2"]
 
         result = test_data_frame.rdd.map(
-            lambda row: Anonymizer.drop(row, analyzer_results)
+            lambda row: Anonymizer.replace(row, replace_string, analyzer_results)
         ).toDF()
 
         actual = result.collect()[0][0]
 
-        self.assertEqual(actual, "text containing  and ")
+        self.assertEqual(
+            actual, f"text containing {replace_string} and {replace_string}"
+        )
diff --git a/pii_anonymizer/standalone/analyze/detectors/pii_detector.py b/pii_anonymizer/standalone/analyze/detectors/pii_detector.py
@@ -49,15 +49,15 @@ def analyze_and_anonymize(self, text: str):
             analyzer_results = analyzer_results + detector.execute(text)
 
         mode = self.config[ANONYMIZE].get("mode")
+        value = self.config[ANONYMIZE].get("value", "")
+
         match mode:
-            case "drop":
-                redacted_text = Anonymizer.drop(text, analyzer_results)
-            case "redact":
-                redacted_text = Anonymizer.redact(text, analyzer_results)
+            case "replace":
+                redacted_text = Anonymizer.replace(text, value, analyzer_results)
             case "hash":
                 redacted_text = Anonymizer.hash(text, analyzer_results)
             case _:
-                redacted_text = Anonymizer.drop(text, analyzer_results)
+                redacted_text = Anonymizer.replace(text, value, analyzer_results)
 
         return AnonymizerResult(redacted_text, analyzer_results)
 

diff --git a/pii_anonymizer/standalone/anonymize/anonymizer.py b/pii_anonymizer/standalone/anonymize/anonymizer.py
@@ -5,15 +5,9 @@
 
 class Anonymizer:
     @staticmethod
-    def drop(text: str, analyzer_results: [AnalyzerResult]):
+    def replace(text: str, replace_string: str, analyzer_results: [AnalyzerResult]):
         for result in analyzer_results:
-            text = text.replace(result.text, "")
-        return text
-
-    @staticmethod
-    def redact(text: str, analyzer_results: [AnalyzerResult]):
-        for result in analyzer_results:
-            text = text.replace(result.text, "[Redacted]")
+            text = text.replace(result.text, replace_string)
         return text
 
     @staticmethod

diff --git a/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py b/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py
diff --git a/...e/anonymize/tests/test_drop_anonymizer.py → ...nonymize/tests/test_replace_anonymizer.py b/...e/anonymize/tests/test_drop_anonymizer.py → ...nonymize/tests/test_replace_anonymizer.py
@@ -3,18 +3,18 @@
 from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult
 
 
-class TestDropAnonymizer(TestCase):
-    def test_drop_for_single_analyzer_result(self):
+class TestReplaceAnonymizer(TestCase):
+    def test_Replace_for_single_analyzer_result(self):
         text = "text containing pii"
         analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
-        result = Anonymizer.drop(text, analyzer_results)
-        self.assertEqual(result, "text containing ")
+        result = Anonymizer.replace(text, "[REPLACED]", analyzer_results)
+        self.assertEqual(result, "text containing [REPLACED]")
 
-    def test_drop_for_multiple_analyzer_results(self):
+    def test_Replace_for_multiple_analyzer_results(self):
         text = "text containing pii1 and pii2"
         analyzer_results = [
             AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
             AnalyzerResult("pii2", "PII_DETECTOR", 25, 28),
         ]
-        result = Anonymizer.drop(text, analyzer_results)
-        self.assertEqual(result, "text containing  and ")
+        result = Anonymizer.replace(text, "[REPLACED]", analyzer_results)
+        self.assertEqual(result, "text containing [REPLACED] and [REPLACED]")
diff --git a/pii_anonymizer/standalone/tests/config/test_config.json b/pii_anonymizer/standalone/tests/config/test_config.json
@@ -5,7 +5,7 @@
   },
   "analyze": {},
   "anonymize": {
-    "mode": "redact",
+    "mode": "replace",
     "output_file_path": "/Users/wisuchoi/Documents/anonymizer/output"
   },
   "report": {