Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add NER PII Masking module #319

Draft
wants to merge 5 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions autorag/nodes/passagefilter/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .percentile_cutoff import similarity_percentile_cutoff
from .ner_pii_masking import ner_pii_masking
from .pass_passage_filter import pass_passage_filter
from .threshold_cutoff import similarity_threshold_cutoff
17 changes: 14 additions & 3 deletions autorag/nodes/passagefilter/base.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import functools
import os
from pathlib import Path
from typing import Union, Tuple, List

import pandas as pd

from autorag.utils import result_to_dataframe, validate_qa_dataset
from autorag.utils import result_to_dataframe, validate_qa_dataset, fetch_contents


# same with passage filter from now
Expand Down Expand Up @@ -33,8 +34,18 @@ def wrapper(
assert "retrieved_ids" in previous_result.columns, "previous_result must have retrieved_ids column."
ids = previous_result["retrieved_ids"].tolist()

filtered_contents, filtered_ids, filtered_scores = func(queries=queries, contents_list=contents,
scores_list=scores, ids_list=ids, *args, **kwargs)
if func.__name__ == 'recency_filter':
corpus_df = pd.read_parquet(os.path.join(project_dir, "data", "corpus.parquet"))
metadatas = fetch_contents(corpus_df, ids, column_name='metadata')
times = [[time['last_modified_datetime'] for time in time_list] for time_list in metadatas]
filtered_contents, filtered_ids, filtered_scores \
= func(contents_list=contents, scores_list=scores, ids_list=ids, time_list=times, *args, **kwargs)
elif func.__name__ == 'ner_pii_masking':
filtered_contents, filtered_ids, filtered_scores = func(contents_list=contents,
scores_list=scores, ids_list=ids, *args, **kwargs)
else:
filtered_contents, filtered_ids, filtered_scores = func(queries=queries, contents_list=contents,
scores_list=scores, ids_list=ids, *args, **kwargs)

return filtered_contents, filtered_ids, filtered_scores

Expand Down
41 changes: 41 additions & 0 deletions autorag/nodes/passagefilter/ner_pii_masking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
import asyncio
from typing import List, Tuple

from transformers import pipeline

from autorag.nodes.passagefilter.base import passage_filter_node


@passage_filter_node
def ner_pii_masking(contents_list: List[List[str]],
                    scores_list: List[List[float]], ids_list: List[List[str]],
                    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
    """
    Mask PII in the retrieved passages with a Hugging Face NER pipeline.

    Every inner list of ``contents_list`` is handed to :func:`mask_pii`, which
    returns the same passages with detected entities replaced by tags.
    Ids and scores are passed through untouched; this node does not drop
    or reorder passages.

    :param contents_list: The list of lists of contents to mask
    :param scores_list: The list of lists of scores retrieved
    :param ids_list: The list of lists of ids retrieved
    :return: Tuple of lists containing the masked contents, ids, and scores
    """
    # Default "ner" pipeline; grouped_entities=True asks for grouped entity
    # entries (mask_pii reads their "entity_group", "word" and "start" keys).
    ner_pipeline = pipeline("ner", grouped_entities=True)

    # One coroutine per query's passage list.
    pending = [mask_pii(ner_pipeline, passages) for passages in contents_list]
    event_loop = asyncio.get_event_loop()
    gathered = event_loop.run_until_complete(asyncio.gather(*pending))

    return list(gathered), ids_list, scores_list


async def mask_pii(model, contents: List[str]) -> List[str]:
    """
    Replace every entity that ``model`` finds in each passage with a tag.

    A tag has the form ``[<entity_group>_<start>]``, where ``start`` is the
    entity's character offset in the original passage.

    Entities are spliced out by their ``start``/``end`` offsets, so each
    occurrence gets its own tag and unrelated text that merely contains the
    same characters is left alone. Entries without usable offsets fall back
    to a plain ``str.replace`` on the reported ``word``.

    :param model: Callable taking one passage and returning a list of dicts
        with the keys ``entity_group``, ``word``, ``start`` and ``end``
    :param contents: The passages to mask
    :return: The masked passages, in the same order as ``contents``
    """
    # NOTE(review): a reviewer pointed out that this coroutine never awaits
    # anything, so the async wrapper adds no concurrency. It stays ``async``
    # here only because the caller gathers the returned coroutines.
    masked_contents = []
    for content in contents:
        entities = model(content)
        if not entities:
            # Nothing detected: keep the passage exactly as it was.
            masked_contents.append(content)
            continue

        with_span = []
        without_span = []
        for entry in entities:
            if entry.get("start") is not None and entry.get("end") is not None:
                with_span.append(entry)
            else:
                without_span.append(entry)

        masked = content
        # Work right-to-left so earlier offsets stay valid after each splice.
        boundary = len(content)
        for entry in sorted(with_span, key=lambda e: e["start"], reverse=True):
            if entry["end"] > boundary:
                # Overlaps a span that was already replaced; skip it rather
                # than corrupt the inserted tag.
                continue
            tag = f"[{entry['entity_group']}_{entry['start']}]"
            masked = masked[:entry["start"]] + tag + masked[entry["end"]:]
            boundary = entry["start"]

        # No offsets available: best-effort textual replacement.
        for entry in without_span:
            tag = f"[{entry['entity_group']}_{entry.get('start')}]"
            masked = masked.replace(entry["word"], tag)

        masked_contents.append(masked.strip())
    return masked_contents
1 change: 1 addition & 0 deletions autorag/support.py
Original file line number Diff line number Diff line change
Expand Up @@ -43,6 +43,7 @@ def get_support_modules(module_name: str) -> Callable:
'pass_passage_filter': ('autorag.nodes.passagefilter', 'pass_passage_filter'),
'similarity_threshold_cutoff': ('autorag.nodes.passagefilter', 'similarity_threshold_cutoff'),
'similarity_percentile_cutoff': ('autorag.nodes.passagefilter', 'similarity_percentile_cutoff'),
'ner_pii_masking': ('autorag.nodes.passagefilter', 'ner_pii_masking'),
# passage_compressor
'tree_summarize': ('autorag.nodes.passagecompressor', 'tree_summarize'),
'pass_compressor': ('autorag.nodes.passagecompressor', 'pass_compressor'),
Expand Down
23 changes: 23 additions & 0 deletions docs/source/nodes/passage_filter/ner_pii_masking.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
# NER PII Masking

This module is inspired by
LlamaIndex ['PII Masking'](https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PII/#option-1-use-ner-model-for-pii-masking).

Use a Hugging Face NER model for PII Masking

## What is PII Masking?

PII (Personally Identifiable Information) masking is a data protection method that obscures sensitive personal
information to prevent unauthorized access while retaining data utility for analysis or development. Techniques include
encryption, substitution, and scrambling, ensuring compliance and minimizing breach risks.

## **Module Parameters**

- **Not Applicable (N/A):** There are no direct module parameters specified for the `ner_pii_masking` module.

## **Example config.yaml**

```yaml
modules:
- module_type: ner_pii_masking
```
1 change: 1 addition & 0 deletions docs/source/nodes/passage_filter/passage_filter.md
Original file line number Diff line number Diff line change
Expand Up @@ -51,4 +51,5 @@ maxdepth: 1
---
similarity_threshold_cutoff.md
similarity_percentile_cutoff.md
ner_pii_masking.md
```
3 changes: 3 additions & 0 deletions sample_config/full.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -76,6 +76,9 @@ node_lines:
- module_type: pass_passage_filter
- module_type: similarity_threshold_cutoff
threshold: 0.85
- module_type: similarity_percentile_cutoff
threshold: 0.6
- module_type: ner_pii_masking
- node_type: passage_compressor
strategy:
metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision]
Expand Down
15 changes: 15 additions & 0 deletions tests/autorag/nodes/passagefilter/test_ner_pii_masking.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
from autorag.nodes.passagefilter import ner_pii_masking
from tests.autorag.nodes.passagefilter.test_passage_filter_base import base_passage_filter_test, contents_example, \
ids_example, scores_example, project_dir, previous_result, base_passage_filter_node_test


def test_ner_pii_masking():
    """Call the undecorated function directly and check one masked passage."""
    # ``__wrapped__`` gives the function underneath the node decorator.
    masked, ids, scores = ner_pii_masking.__wrapped__(
        contents_example, scores_example, ids_example
    )
    expected = "[PER_0] is one of the members of [ORG_34]."
    assert masked[1][3] == expected
    base_passage_filter_test(masked, ids, scores)


def test_ner_pii_masking_node():
    """Run the decorated node and validate its result with the shared node check."""
    node_output = ner_pii_masking(
        project_dir=project_dir,
        previous_result=previous_result,
    )
    base_passage_filter_node_test(node_output)