From a389c4aa37df202af60f9e267410dc8bd6aeffdb Mon Sep 17 00:00:00 2001 From: Pee Tankulrat <{ID}+{username}@users.noreply.github.com> Date: Tue, 22 Nov 2022 15:58:18 +0700 Subject: [PATCH] Merge drop and redact with replace --- README.md | 15 ++++--- pii-anonymizer.json | 8 ++-- pii_anonymizer/common/config_validator.py | 2 +- .../spark/analyze/detectors/pii_detector.py | 12 +++--- pii_anonymizer/spark/anonymize/anonymizer.py | 14 +------ .../anonymize/tests/test_redact_anonymizer.py | 40 ------------------- ...onymizer.py => test_replace_anonymizer.py} | 19 +++++---- .../analyze/detectors/pii_detector.py | 10 ++--- .../standalone/anonymize/anonymizer.py | 10 +---- .../anonymize/tests/test_redact_anonymizer.py | 20 ---------- ...onymizer.py => test_replace_anonymizer.py} | 14 +++---- .../standalone/tests/config/test_config.json | 2 +- 12 files changed, 47 insertions(+), 119 deletions(-) delete mode 100644 pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py rename pii_anonymizer/spark/anonymize/tests/{test_drop_anonymizer.py => test_replace_anonymizer.py} (59%) delete mode 100644 pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py rename pii_anonymizer/standalone/anonymize/tests/{test_drop_anonymizer.py => test_replace_anonymizer.py} (53%) diff --git a/README.md b/README.md index 679d486..cf5c5c3 100644 --- a/README.md +++ b/README.md @@ -22,8 +22,9 @@ The framework aims to work on a two-fold principle for detecting PII: * [x] FIN/NRIC : A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card. * Following anonymizers have been added - * [x] Redaction: Deletes all or part of a detected sensitive value. - * [x] Encryption : Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified." 
+ * [x] Replacement ('replace'): Replaces a detected sensitive value with a specified surrogate value. Leave the value empty to simply delete the detected sensitive value. + * [x] Hash ('hash'): Hashes the detected sensitive value with SHA-256. + ### TO-DO Following features are part of the backlog with more features coming soon @@ -31,10 +32,10 @@ Following features are part of the backlog with more features coming soon * [ ] NAME * [ ] ADDRESS * Anonymizers: + * [ ] Encryption : Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified." * [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*). * [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range, or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.") - * [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value. 
You can have a detailed at upcoming features and backlog in this [Github Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true) @@ -53,13 +54,15 @@ An example for the config JSON is located at `/pii-anonymizer.json }, "analyze": { + }, + "anonymize": { + "mode": , + "value": "string to replace", + "output_file_path" : }, "report" : { "location" : , "level" : - }, - "anonymize": { - "output_file_path" : } } ``` diff --git a/pii-anonymizer.json b/pii-anonymizer.json index c4f73d3..47eb0c8 100644 --- a/pii-anonymizer.json +++ b/pii-anonymizer.json @@ -4,12 +4,12 @@ "delimiter": "," }, "analyze": {}, + "anonymize": { + "mode": "replace", + "output_file_path": "./output" + }, "report": { "location": "./output", "level": "medium" - }, - "anonymize": { - "mode": "hash", - "output_file_path": "./output" } } diff --git a/pii_anonymizer/common/config_validator.py b/pii_anonymizer/common/config_validator.py index 1f78952..0310fdb 100644 --- a/pii_anonymizer/common/config_validator.py +++ b/pii_anonymizer/common/config_validator.py @@ -1,6 +1,6 @@ from pii_anonymizer.common.constants import ANONYMIZE -anonymize_mode = ["redact", "drop", "hash"] +anonymize_mode = ["replace", "hash"] anonymize_mode_err_msg = f"{ANONYMIZE}'s mode must be {' or '.join(anonymize_mode)}" diff --git a/pii_anonymizer/spark/analyze/detectors/pii_detector.py b/pii_anonymizer/spark/analyze/detectors/pii_detector.py index bb8d83e..803f862 100644 --- a/pii_anonymizer/spark/analyze/detectors/pii_detector.py +++ b/pii_anonymizer/spark/analyze/detectors/pii_detector.py @@ -86,14 +86,12 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame): column = input_data_frame.columns mode = self.config[ANONYMIZE].get("mode") + value = self.config[ANONYMIZE].get("value", "") + match mode: - case "drop": - result = input_data_frame.rdd.map( - lambda row: Anonymizer.drop(row, pii_list) - ).toDF(column) - case "redact": + case "replace": result = 
input_data_frame.rdd.map( - lambda row: Anonymizer.redact(row, pii_list) + lambda row: Anonymizer.replace(row, value, pii_list) ).toDF(column) case "hash": result = input_data_frame.rdd.map( @@ -101,7 +99,7 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame): ).toDF(column) case _: result = input_data_frame.rdd.map( - lambda row: Anonymizer.drop(row, pii_list) + lambda row: Anonymizer.replace(row, value, pii_list) ).toDF(column) return result diff --git a/pii_anonymizer/spark/anonymize/anonymizer.py b/pii_anonymizer/spark/anonymize/anonymizer.py index 14bb492..2a9946a 100644 --- a/pii_anonymizer/spark/anonymize/anonymizer.py +++ b/pii_anonymizer/spark/anonymize/anonymizer.py @@ -3,22 +3,12 @@ class Anonymizer: @staticmethod - def drop(row, pii_list): + def replace(row, replace_string, pii_list): new_row = [] for cell in row: for word in pii_list: if word in cell: - cell = cell.replace(word, "") - new_row.append(cell) - return new_row - - @staticmethod - def redact(row, pii_list): - new_row = [] - for cell in row: - for word in pii_list: - if word in cell: - cell = cell.replace(word, "[Redacted]") + cell = cell.replace(word, replace_string) new_row.append(cell) return new_row diff --git a/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py b/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py deleted file mode 100644 index d61cacd..0000000 --- a/pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py +++ /dev/null @@ -1,40 +0,0 @@ -from unittest import TestCase -from pyspark.sql import SparkSession -from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer - - -class TestRedactAnonymizer(TestCase): - def setUp(self) -> None: - self.SPARK = ( - SparkSession.builder.master("local") - .appName("Test PIIDetector") - .getOrCreate() - ) - - def test_redact_for_single_analyzer_result(self): - test_data_frame = self.SPARK.createDataFrame( - [("text containing pii", "something else")] - ) - - analyzer_results = 
["pii"] - result = test_data_frame.rdd.map( - lambda row: Anonymizer.redact(row, analyzer_results) - ).toDF() - - actual = result.collect()[0][0] - - self.assertEqual(actual, "text containing [Redacted]") - - def test_redact_for_multiple_analyzer_results(self): - test_data_frame = self.SPARK.createDataFrame( - [("text containing pii1 and pii2", "something else")] - ) - analyzer_results = ["pii1", "pii2"] - - result = test_data_frame.rdd.map( - lambda row: Anonymizer.redact(row, analyzer_results) - ).toDF() - - actual = result.collect()[0][0] - - self.assertEqual(actual, "text containing [Redacted] and [Redacted]") diff --git a/pii_anonymizer/spark/anonymize/tests/test_drop_anonymizer.py b/pii_anonymizer/spark/anonymize/tests/test_replace_anonymizer.py similarity index 59% rename from pii_anonymizer/spark/anonymize/tests/test_drop_anonymizer.py rename to pii_anonymizer/spark/anonymize/tests/test_replace_anonymizer.py index 0410e95..90e1b13 100644 --- a/pii_anonymizer/spark/anonymize/tests/test_drop_anonymizer.py +++ b/pii_anonymizer/spark/anonymize/tests/test_replace_anonymizer.py @@ -3,7 +3,7 @@ from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer -class TestDropAnonymizer(TestCase): +class TestReplaceAnonymizer(TestCase): def setUp(self) -> None: self.SPARK = ( SparkSession.builder.master("local") @@ -11,30 +11,33 @@ def setUp(self) -> None: .getOrCreate() ) - def test_drop_for_single_analyzer_result(self): + def test_replace_for_single_analyzer_result(self): + replace_string = "[REPLACED]" test_data_frame = self.SPARK.createDataFrame( [("text containing pii", "something else")] ) - analyzer_results = ["pii"] result = test_data_frame.rdd.map( - lambda row: Anonymizer.drop(row, analyzer_results) + lambda row: Anonymizer.replace(row, replace_string, analyzer_results) ).toDF() actual = result.collect()[0][0] - self.assertEqual(actual, "text containing ") + self.assertEqual(actual, f"text containing {replace_string}") - def 
test_drop_for_multiple_analyzer_results(self): + def test_replace_for_multiple_analyzer_results(self): + replace_string = "[REPLACED]" test_data_frame = self.SPARK.createDataFrame( [("text containing pii1 and pii2", "something else")] ) analyzer_results = ["pii1", "pii2"] result = test_data_frame.rdd.map( - lambda row: Anonymizer.drop(row, analyzer_results) + lambda row: Anonymizer.replace(row, replace_string, analyzer_results) ).toDF() actual = result.collect()[0][0] - self.assertEqual(actual, "text containing and ") + self.assertEqual( + actual, f"text containing {replace_string} and {replace_string}" + ) diff --git a/pii_anonymizer/standalone/analyze/detectors/pii_detector.py b/pii_anonymizer/standalone/analyze/detectors/pii_detector.py index 50ef69a..a7bee56 100644 --- a/pii_anonymizer/standalone/analyze/detectors/pii_detector.py +++ b/pii_anonymizer/standalone/analyze/detectors/pii_detector.py @@ -49,15 +49,15 @@ def analyze_and_anonymize(self, text: str): analyzer_results = analyzer_results + detector.execute(text) mode = self.config[ANONYMIZE].get("mode") + value = self.config[ANONYMIZE].get("value", "") + match mode: - case "drop": - redacted_text = Anonymizer.drop(text, analyzer_results) - case "redact": - redacted_text = Anonymizer.redact(text, analyzer_results) + case "replace": + redacted_text = Anonymizer.replace(text, value, analyzer_results) case "hash": redacted_text = Anonymizer.hash(text, analyzer_results) case _: - redacted_text = Anonymizer.drop(text, analyzer_results) + redacted_text = Anonymizer.replace(text, value, analyzer_results) return AnonymizerResult(redacted_text, analyzer_results) diff --git a/pii_anonymizer/standalone/anonymize/anonymizer.py b/pii_anonymizer/standalone/anonymize/anonymizer.py index f835bb5..e4126d6 100644 --- a/pii_anonymizer/standalone/anonymize/anonymizer.py +++ b/pii_anonymizer/standalone/anonymize/anonymizer.py @@ -5,15 +5,9 @@ class Anonymizer: @staticmethod - def drop(text: str, analyzer_results: 
[AnalyzerResult]): + def replace(text: str, replace_string: str, analyzer_results: [AnalyzerResult]): for result in analyzer_results: - text = text.replace(result.text, "") - return text - - @staticmethod - def redact(text: str, analyzer_results: [AnalyzerResult]): - for result in analyzer_results: - text = text.replace(result.text, "[Redacted]") + text = text.replace(result.text, replace_string) return text @staticmethod diff --git a/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py b/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py deleted file mode 100644 index a936e43..0000000 --- a/pii_anonymizer/standalone/anonymize/tests/test_redact_anonymizer.py +++ /dev/null @@ -1,20 +0,0 @@ -from unittest import TestCase -from pii_anonymizer.standalone.anonymize.anonymizer import Anonymizer -from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult - - -class TestRedactAnonymizer(TestCase): - def test_redact_for_single_analyzer_result(self): - text = "text containing pii" - analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)] - result = Anonymizer.redact(text, analyzer_results) - self.assertEqual(result, "text containing [Redacted]") - - def test_redact_for_multiple_analyzer_results(self): - text = "text containing pii1 and pii2" - analyzer_results = [ - AnalyzerResult("pii1", "PII_DETECTOR", 16, 19), - AnalyzerResult("pii2", "PII_DETECTOR", 25, 28), - ] - result = Anonymizer.redact(text, analyzer_results) - self.assertEqual(result, "text containing [Redacted] and [Redacted]") diff --git a/pii_anonymizer/standalone/anonymize/tests/test_drop_anonymizer.py b/pii_anonymizer/standalone/anonymize/tests/test_replace_anonymizer.py similarity index 53% rename from pii_anonymizer/standalone/anonymize/tests/test_drop_anonymizer.py rename to pii_anonymizer/standalone/anonymize/tests/test_replace_anonymizer.py index aee6ade..ed3992b 100644 --- 
a/pii_anonymizer/standalone/anonymize/tests/test_drop_anonymizer.py +++ b/pii_anonymizer/standalone/anonymize/tests/test_replace_anonymizer.py @@ -3,18 +3,18 @@ from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult -class TestDropAnonymizer(TestCase): - def test_drop_for_single_analyzer_result(self): +class TestReplaceAnonymizer(TestCase): + def test_Replace_for_single_analyzer_result(self): text = "text containing pii" analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)] - result = Anonymizer.drop(text, analyzer_results) - self.assertEqual(result, "text containing ") + result = Anonymizer.replace(text, "[REPLACED]", analyzer_results) + self.assertEqual(result, "text containing [REPLACED]") - def test_drop_for_multiple_analyzer_results(self): + def test_Replace_for_multiple_analyzer_results(self): text = "text containing pii1 and pii2" analyzer_results = [ AnalyzerResult("pii1", "PII_DETECTOR", 16, 19), AnalyzerResult("pii2", "PII_DETECTOR", 25, 28), ] - result = Anonymizer.drop(text, analyzer_results) - self.assertEqual(result, "text containing and ") + result = Anonymizer.replace(text, "[REPLACED]", analyzer_results) + self.assertEqual(result, "text containing [REPLACED] and [REPLACED]") diff --git a/pii_anonymizer/standalone/tests/config/test_config.json b/pii_anonymizer/standalone/tests/config/test_config.json index 1c17f4e..3924927 100644 --- a/pii_anonymizer/standalone/tests/config/test_config.json +++ b/pii_anonymizer/standalone/tests/config/test_config.json @@ -5,7 +5,7 @@ }, "analyze": {}, "anonymize": { - "mode": "redact", + "mode": "replace", "output_file_path": "/Users/wisuchoi/Documents/anonymizer/output" }, "report": {