Skip to content

Commit

Permalink
Merge drop and redact with replace
Browse files Browse the repository at this point in the history
  • Loading branch information
Pee Tankulrat authored and Pee Tankulrat committed Nov 22, 2022
1 parent 3438947 commit a389c4a
Show file tree
Hide file tree
Showing 12 changed files with 47 additions and 119 deletions.
15 changes: 9 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -22,19 +22,20 @@ The framework aims to work on a two-fold principle for detecting PII:
* [x] FIN/NRIC : A unique set of nine alpha-numeric characters on the Singapore National Registration Identity Card.

* The following anonymizers have been added:
* [x] Redaction: Deletes all or part of a detected sensitive value.
* [x] Encryption : Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
* [x] Replacement ('replace'): Replaces a detected sensitive value with a specified surrogate value. Leave the value empty to simply delete the detected sensitive value.
* [x] Hash ('hash'): Hashes a detected sensitive value with SHA-256.


### TO-DO
The following features are part of the backlog, with more features coming soon:
* Detectors:
* [ ] NAME
* [ ] ADDRESS
* Anonymizers:
* [ ] Encryption : Encrypts the original sensitive data value using a cryptographic key. Cloud DLP supports several types of tokenization, including transformations that can be reversed, or "re-identified."
* [ ] Masking: Replaces a number of characters of a sensitive value with a specified surrogate character, such as a hash (#) or asterisk (*).
* [ ] Bucketing: "Generalizes" a sensitive value by replacing it with a range of values. (For example, replacing a specific age with an age range,
or temperatures with ranges corresponding to "Hot," "Medium," and "Cold.")
* [ ] Replacement: Replaces a detected sensitive value with a specified surrogate value.


You can take a detailed look at upcoming features and the backlog on this [GitHub Board](https://github.com/thoughtworks-datakind/anonymizer/projects/1?fullscreen=true)
Expand All @@ -53,13 +54,15 @@ An example for the config JSON is located at `<PROJECT_ROOT>/pii-anonymizer.json
},
"analyze": {
},
"anonymize": {
"mode": <replace|hash>,
"value": "string to replace",
"output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
},
"report" : {
"location" : <PATH TO YOUR REPORT OUTPUT FOLDER>,
"level" : <LOG LEVEL>
},
"anonymize": {
"output_file_path" : <PATH TO YOUR CSV OUTPUT FOLDER>
}
}
```
Expand Down
8 changes: 4 additions & 4 deletions pii-anonymizer.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@
"delimiter": ","
},
"analyze": {},
"anonymize": {
"mode": "replace",
"output_file_path": "./output"
},
"report": {
"location": "./output",
"level": "medium"
},
"anonymize": {
"mode": "hash",
"output_file_path": "./output"
}
}
2 changes: 1 addition & 1 deletion pii_anonymizer/common/config_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pii_anonymizer.common.constants import ANONYMIZE

anonymize_mode = ["redact", "drop", "hash"]
anonymize_mode = ["replace", "hash"]
anonymize_mode_err_msg = f"{ANONYMIZE}'s mode must be {' or '.join(anonymize_mode)}"


Expand Down
12 changes: 5 additions & 7 deletions pii_anonymizer/spark/analyze/detectors/pii_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,22 +86,20 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
column = input_data_frame.columns

mode = self.config[ANONYMIZE].get("mode")
value = self.config[ANONYMIZE].get("value", "")

match mode:
case "drop":
result = input_data_frame.rdd.map(
lambda row: Anonymizer.drop(row, pii_list)
).toDF(column)
case "redact":
case "replace":
result = input_data_frame.rdd.map(
lambda row: Anonymizer.redact(row, pii_list)
lambda row: Anonymizer.replace(row, value, pii_list)
).toDF(column)
case "hash":
result = input_data_frame.rdd.map(
lambda row: Anonymizer.hash(row, pii_list)
).toDF(column)
case _:
result = input_data_frame.rdd.map(
lambda row: Anonymizer.drop(row, pii_list)
lambda row: Anonymizer.replace(row, value, pii_list)
).toDF(column)

return result
Expand Down
14 changes: 2 additions & 12 deletions pii_anonymizer/spark/anonymize/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,22 +3,12 @@

class Anonymizer:
@staticmethod
def drop(row, pii_list):
def replace(row, replace_string, pii_list):
new_row = []
for cell in row:
for word in pii_list:
if word in cell:
cell = cell.replace(word, "")
new_row.append(cell)
return new_row

@staticmethod
def redact(row, pii_list):
new_row = []
for cell in row:
for word in pii_list:
if word in cell:
cell = cell.replace(word, "[Redacted]")
cell = cell.replace(word, replace_string)
new_row.append(cell)
return new_row

Expand Down
40 changes: 0 additions & 40 deletions pii_anonymizer/spark/anonymize/tests/test_redact_anonymizer.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,38 +3,41 @@
from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer


class TestDropAnonymizer(TestCase):
class TestReplaceAnonymizer(TestCase):
def setUp(self) -> None:
self.SPARK = (
SparkSession.builder.master("local")
.appName("Test PIIDetector")
.getOrCreate()
)

def test_drop_for_single_analyzer_result(self):
def test_replace_for_single_analyzer_result(self):
replace_string = "[REPLACED]"
test_data_frame = self.SPARK.createDataFrame(
[("text containing pii", "something else")]
)

analyzer_results = ["pii"]
result = test_data_frame.rdd.map(
lambda row: Anonymizer.drop(row, analyzer_results)
lambda row: Anonymizer.replace(row, replace_string, analyzer_results)
).toDF()

actual = result.collect()[0][0]

self.assertEqual(actual, "text containing ")
self.assertEqual(actual, f"text containing {replace_string}")

def test_drop_for_multiple_analyzer_results(self):
def test_replace_for_multiple_analyzer_results(self):
replace_string = "[REPLACED]"
test_data_frame = self.SPARK.createDataFrame(
[("text containing pii1 and pii2", "something else")]
)
analyzer_results = ["pii1", "pii2"]

result = test_data_frame.rdd.map(
lambda row: Anonymizer.drop(row, analyzer_results)
lambda row: Anonymizer.replace(row, replace_string, analyzer_results)
).toDF()

actual = result.collect()[0][0]

self.assertEqual(actual, "text containing and ")
self.assertEqual(
actual, f"text containing {replace_string} and {replace_string}"
)
10 changes: 5 additions & 5 deletions pii_anonymizer/standalone/analyze/detectors/pii_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,15 +49,15 @@ def analyze_and_anonymize(self, text: str):
analyzer_results = analyzer_results + detector.execute(text)

mode = self.config[ANONYMIZE].get("mode")
value = self.config[ANONYMIZE].get("value", "")

match mode:
case "drop":
redacted_text = Anonymizer.drop(text, analyzer_results)
case "redact":
redacted_text = Anonymizer.redact(text, analyzer_results)
case "replace":
redacted_text = Anonymizer.replace(text, value, analyzer_results)
case "hash":
redacted_text = Anonymizer.hash(text, analyzer_results)
case _:
redacted_text = Anonymizer.drop(text, analyzer_results)
redacted_text = Anonymizer.replace(text, value, analyzer_results)

return AnonymizerResult(redacted_text, analyzer_results)

Expand Down
10 changes: 2 additions & 8 deletions pii_anonymizer/standalone/anonymize/anonymizer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,9 @@

class Anonymizer:
@staticmethod
def drop(text: str, analyzer_results: [AnalyzerResult]):
def replace(text: str, replace_string: str, analyzer_results: [AnalyzerResult]):
for result in analyzer_results:
text = text.replace(result.text, "")
return text

@staticmethod
def redact(text: str, analyzer_results: [AnalyzerResult]):
for result in analyzer_results:
text = text.replace(result.text, "[Redacted]")
text = text.replace(result.text, replace_string)
return text

@staticmethod
Expand Down

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -3,18 +3,18 @@
from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult


class TestDropAnonymizer(TestCase):
def test_drop_for_single_analyzer_result(self):
class TestReplaceAnonymizer(TestCase):
def test_Replace_for_single_analyzer_result(self):
text = "text containing pii"
analyzer_results = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
result = Anonymizer.drop(text, analyzer_results)
self.assertEqual(result, "text containing ")
result = Anonymizer.replace(text, "[REPLACED]", analyzer_results)
self.assertEqual(result, "text containing [REPLACED]")

def test_drop_for_multiple_analyzer_results(self):
def test_Replace_for_multiple_analyzer_results(self):
text = "text containing pii1 and pii2"
analyzer_results = [
AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
AnalyzerResult("pii2", "PII_DETECTOR", 25, 28),
]
result = Anonymizer.drop(text, analyzer_results)
self.assertEqual(result, "text containing and ")
result = Anonymizer.replace(text, "[REPLACED]", analyzer_results)
self.assertEqual(result, "text containing [REPLACED] and [REPLACED]")
2 changes: 1 addition & 1 deletion pii_anonymizer/standalone/tests/config/test_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
},
"analyze": {},
"anonymize": {
"mode": "redact",
"mode": "replace",
"output_file_path": "/Users/wisuchoi/Documents/anonymizer/output"
},
"report": {
Expand Down

0 comments on commit a389c4a

Please sign in to comment.