Skip to content

Commit

Permalink
Anonymize with sha256
Browse files Browse the repository at this point in the history
  • Loading branch information
Pee Tankulrat authored and Pee Tankulrat committed Nov 22, 2022
1 parent 7363b94 commit 3438947
Show file tree
Hide file tree
Showing 8 changed files with 97 additions and 2 deletions.
2 changes: 1 addition & 1 deletion pii-anonymizer.json
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
"level": "medium"
},
"anonymize": {
"mode": "redact",
"mode": "hash",
"output_file_path": "./output"
}
}
2 changes: 1 addition & 1 deletion pii_anonymizer/common/config_validator.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from pii_anonymizer.common.constants import ANONYMIZE

anonymize_mode = ["redact", "drop"]
anonymize_mode = ["redact", "drop", "hash"]
anonymize_mode_err_msg = f"{ANONYMIZE}'s mode must be {' or '.join(anonymize_mode)}"


Expand Down
4 changes: 4 additions & 0 deletions pii_anonymizer/spark/analyze/detectors/pii_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -95,6 +95,10 @@ def get_redacted_text(self, input_data_frame: DataFrame, report: DataFrame):
result = input_data_frame.rdd.map(
lambda row: Anonymizer.redact(row, pii_list)
).toDF(column)
case "hash":
result = input_data_frame.rdd.map(
lambda row: Anonymizer.hash(row, pii_list)
).toDF(column)
case _:
result = input_data_frame.rdd.map(
lambda row: Anonymizer.drop(row, pii_list)
Expand Down
13 changes: 13 additions & 0 deletions pii_anonymizer/spark/anonymize/anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,6 @@
from hashlib import sha256


class Anonymizer:
@staticmethod
def drop(row, pii_list):
Expand All @@ -18,3 +21,13 @@ def redact(row, pii_list):
cell = cell.replace(word, "[Redacted]")
new_row.append(cell)
return new_row

@staticmethod
def hash(row, pii_list):
new_row = []
for cell in row:
for word in pii_list:
if word in cell:
cell = cell.replace(word, sha256(word.encode("utf-8")).hexdigest())
new_row.append(cell)
return new_row
42 changes: 42 additions & 0 deletions pii_anonymizer/spark/anonymize/tests/test_hash_anonymizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
from unittest import TestCase
from pyspark.sql import SparkSession
from pii_anonymizer.spark.anonymize.anonymizer import Anonymizer
from hashlib import sha256


class TestHashAnonymizer(TestCase):
    """Checks Anonymizer.hash when it is mapped over the rows of a Spark DataFrame."""

    def setUp(self) -> None:
        builder = SparkSession.builder.master("local").appName("Test PIIDetector")
        self.SPARK = builder.getOrCreate()

    def _hash_first_cell(self, rows, pii_words):
        """Run Anonymizer.hash over `rows` via an RDD map and return the first cell of the first row."""
        source_frame = self.SPARK.createDataFrame(rows)
        hashed_frame = source_frame.rdd.map(
            lambda row: Anonymizer.hash(row, pii_words)
        ).toDF()
        return hashed_frame.collect()[0][0]

    def test_hash_for_single_analyzer_result(self):
        # One PII word: only that word is swapped for its digest.
        expected_digest = sha256("pii".encode("utf-8")).hexdigest()

        actual = self._hash_first_cell(
            [("text containing pii", "something else")], ["pii"]
        )

        self.assertEqual(actual, f"text containing {expected_digest}")

    def test_hash_for_multiple_analyzer_results(self):
        # Two PII words in the same cell: each gets its own digest.
        first_digest = sha256("pii1".encode("utf-8")).hexdigest()
        second_digest = sha256("pii2".encode("utf-8")).hexdigest()

        actual = self._hash_first_cell(
            [("text containing pii1 and pii2", "something else")],
            ["pii1", "pii2"],
        )

        self.assertEqual(
            actual, f"text containing {first_digest} and {second_digest}"
        )
2 changes: 2 additions & 0 deletions pii_anonymizer/standalone/analyze/detectors/pii_detector.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def analyze_and_anonymize(self, text: str):
redacted_text = Anonymizer.drop(text, analyzer_results)
case "redact":
redacted_text = Anonymizer.redact(text, analyzer_results)
case "hash":
redacted_text = Anonymizer.hash(text, analyzer_results)
case _:
redacted_text = Anonymizer.drop(text, analyzer_results)

Expand Down
10 changes: 10 additions & 0 deletions pii_anonymizer/standalone/anonymize/anonymizer.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult

from hashlib import sha256


class Anonymizer:
@staticmethod
Expand All @@ -13,3 +15,11 @@ def redact(text: str, analyzer_results: [AnalyzerResult]):
for result in analyzer_results:
text = text.replace(result.text, "[Redacted]")
return text

@staticmethod
def hash(text: str, analyzer_results: [AnalyzerResult]):
for result in analyzer_results:
text = text.replace(
result.text, sha256(result.text.encode("utf-8")).hexdigest()
)
return text
24 changes: 24 additions & 0 deletions pii_anonymizer/standalone/anonymize/tests/test_hash_anonymizer.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,24 @@
from unittest import TestCase
from pii_anonymizer.standalone.anonymize.anonymizer import Anonymizer
from pii_anonymizer.standalone.analyze.utils.analyzer_result import AnalyzerResult
from hashlib import sha256


class TestRedactAnonymizer(TestCase):
    """Checks Anonymizer.hash on plain strings.

    NOTE(review): the class name says "Redact" but both tests exercise
    ``hash``; this looks like a copy/paste leftover - confirm before renaming.
    """

    def test_hash_for_single_analyzer_result(self):
        # One finding: only that word is swapped for its digest.
        findings = [AnalyzerResult("pii", "PII_DETECTOR", 16, 18)]
        expected_digest = sha256("pii".encode("utf-8")).hexdigest()

        actual = Anonymizer.hash("text containing pii", findings)

        self.assertEqual(actual, f"text containing {expected_digest}")

    def test_hash_for_multiple_analyzer_results(self):
        # Two findings in the same text: each gets its own digest.
        findings = [
            AnalyzerResult("pii1", "PII_DETECTOR", 16, 19),
            AnalyzerResult("pii2", "PII_DETECTOR", 25, 28),
        ]
        first_digest = sha256("pii1".encode("utf-8")).hexdigest()
        second_digest = sha256("pii2".encode("utf-8")).hexdigest()

        actual = Anonymizer.hash("text containing pii1 and pii2", findings)

        self.assertEqual(
            actual, f"text containing {first_digest} and {second_digest}"
        )

1 comment on commit 3438947

@pee-tw
Copy link
Collaborator

@pee-tw pee-tw commented on 3438947 Dec 14, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please sign in to comment.