-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Pee Tankulrat
authored and
Pee Tankulrat
committed
Nov 22, 2022
1 parent
7363b94
commit 3438947
Showing
8 changed files
with
97 additions
and
2 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,7 +9,7 @@ | |
"level": "medium" | ||
}, | ||
"anonymize": { | ||
"mode": "redact", | ||
"mode": "hash", | ||
"output_file_path": "./output" | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
42 changes: 42 additions & 0 deletions
42
pii_anonymizer/spark/anonymize/tests/test_hash_anonymizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,42 @@ | ||
from unittest import TestCase | ||
from pyspark.sql import SparkSession | ||
from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer | ||
from hashlib import sha256 | ||
|
||
|
||
class TestHashAnonymizer(TestCase):
    """Check ``Anonymizer.hash`` when it is mapped over the rows of a Spark DataFrame.

    Each test builds a one-row DataFrame, maps ``Anonymizer.hash`` over its
    RDD, and compares the first cell of the first resulting row with the
    input text in which every PII value is replaced by its SHA-256 hex digest.
    """

    def setUp(self) -> None:
        # Obtain a local-master session through getOrCreate() before each test.
        builder = SparkSession.builder.master("local").appName("Test PIIDetector")
        self.SPARK = builder.getOrCreate()

    @staticmethod
    def _sha256_hex(value):
        """Return the hex SHA-256 digest of ``value`` encoded as UTF-8."""
        return sha256(value.encode("utf-8")).hexdigest()

    @staticmethod
    def _first_cell_after_hash(data_frame, analyzer_results):
        """Map ``Anonymizer.hash`` over every row; return row 0, column 0."""
        # The lambda closes over ``analyzer_results`` only, never the test case.
        hashed_rdd = data_frame.rdd.map(
            lambda row: Anonymizer.hash(row, analyzer_results)
        )
        collected = hashed_rdd.toDF().collect()
        return collected[0][0]

    def test_hash_for_single_analyzer_result(self):
        source_frame = self.SPARK.createDataFrame(
            [("text containing pii", "something else")]
        )
        hashed = self._sha256_hex("pii")

        actual = self._first_cell_after_hash(source_frame, ["pii"])

        self.assertEqual(actual, f"text containing {hashed}")

    def test_hash_for_multiple_analyzer_results(self):
        source_frame = self.SPARK.createDataFrame(
            [("text containing pii1 and pii2", "something else")]
        )
        hashed1 = self._sha256_hex("pii1")
        hashed2 = self._sha256_hex("pii2")

        actual = self._first_cell_after_hash(source_frame, ["pii1", "pii2"])

        self.assertEqual(actual, f"text containing {hashed1} and {hashed2}")
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
24 changes: 24 additions & 0 deletions
24
pii_anonymizer/standalone/anonymize/tests/test_hash_anonymizer.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,24 @@ | ||
from unittest import TestCase | ||
from pii_anonymizer.standalone.anonymize.anonymizer import Anonymizer | ||
from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult | ||
from hashlib import sha256 | ||
|
||
|
||
class TestRedactAnonymizer(TestCase):
    """Check that ``Anonymizer.hash`` swaps detected PII for its SHA-256 hex digest.

    NOTE(review): the class is named ``TestRedactAnonymizer`` although it
    exercises ``Anonymizer.hash``; the name looks like a copy-paste leftover
    and is kept unchanged here -- confirm before renaming.
    """

    @staticmethod
    def _sha256_hex(value):
        """Return the hex SHA-256 digest of ``value`` encoded as UTF-8."""
        return sha256(value.encode("utf-8")).hexdigest()

    def test_hash_for_single_analyzer_result(self):
        # One match: "pii" reported at offsets 16..18 of the input text.
        detections = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
        hashed = self._sha256_hex("pii")

        anonymized = Anonymizer.hash("text containing pii", detections)

        self.assertEqual(anonymized, f"text containing {hashed}")

    def test_hash_for_multiple_analyzer_results(self):
        # Two matches in the same text, each expected to get its own digest.
        first_hit = AnalyzerResult("pii1", "PII_DETECTOR", 16, 19)
        second_hit = AnalyzerResult("pii2", "PII_DETECTOR", 25, 28)
        hashed1 = self._sha256_hex("pii1")
        hashed2 = self._sha256_hex("pii2")

        anonymized = Anonymizer.hash(
            "text containing pii1 and pii2", [first_hit, second_hit]
        )

        self.assertEqual(anonymized, f"text containing {hashed1} and {hashed2}")
3438947
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Related to Issue #12
thoughtworks-datakind/anonymizer#12