diff --git a/autorag/nodes/passagefilter/__init__.py b/autorag/nodes/passagefilter/__init__.py index aee64a0a8..5b7a0eb70 100644 --- a/autorag/nodes/passagefilter/__init__.py +++ b/autorag/nodes/passagefilter/__init__.py @@ -1,3 +1,4 @@ from .percentile_cutoff import similarity_percentile_cutoff +from .ner_pii_masking import ner_pii_masking from .pass_passage_filter import pass_passage_filter from .threshold_cutoff import similarity_threshold_cutoff diff --git a/autorag/nodes/passagefilter/base.py b/autorag/nodes/passagefilter/base.py index 1b2f874bc..183625900 100644 --- a/autorag/nodes/passagefilter/base.py +++ b/autorag/nodes/passagefilter/base.py @@ -1,10 +1,11 @@ import functools +import os from pathlib import Path from typing import Union, Tuple, List import pandas as pd -from autorag.utils import result_to_dataframe, validate_qa_dataset +from autorag.utils import result_to_dataframe, validate_qa_dataset, fetch_contents # same with passage filter from now @@ -33,8 +34,18 @@ def wrapper( assert "retrieved_ids" in previous_result.columns, "previous_result must have retrieved_ids column." 
@passage_filter_node
def ner_pii_masking(contents_list: List[List[str]],
                    scores_list: List[List[float]], ids_list: List[List[str]],
                    ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]:
    """
    Mask PII in the contents using NER.
    Use a Hugging Face NER model for PII Masking.

    Every entity reported by the model is replaced in the passage text with a
    ``[<entity_group>_<start>]`` tag. Ids and scores are returned exactly as
    they were passed in; this node rewrites passages, it does not drop any.

    :param contents_list: The list of lists of contents to filter
    :param scores_list: The list of lists of scores retrieved
    :param ids_list: The list of lists of ids retrieved
    :return: Tuple of lists containing the masked contents, ids, and scores
    """
    model = pipeline("ner", grouped_entities=True)

    # The pipeline call is synchronous and the masking code never awaits, so
    # driving it through an event loop added no concurrency and could not be
    # done from inside an already-running loop. Call the helper directly.
    masked_contents_list = [_mask_passages(model, contents) for contents in contents_list]

    return masked_contents_list, ids_list, scores_list


async def mask_pii(model, contents: List[str]) -> List[str]:
    """
    Mask the entities found by ``model`` in each passage of ``contents``.

    Kept as a coroutine so existing ``await mask_pii(...)`` callers keep
    working; the body itself contains no ``await``.

    :param model: Callable that takes a passage and returns a list of entity dicts.
        Each dict must provide ``entity_group``, ``word`` and ``start``;
        ``end`` is used when it is present.
    :param contents: The passages to mask
    :return: The masked passages, in the same order as ``contents``
    """
    return _mask_passages(model, contents)


def _mask_passages(model, contents: List[str]) -> List[str]:
    """Mask every passage in ``contents`` and return the results in order."""
    return [_mask_passage(model, content) for content in contents]


def _mask_passage(model, content: str) -> str:
    """Replace each entity that ``model`` reports for ``content`` with its tag."""
    masked = content
    for entry in model(content):
        target = _entity_text(content, entry)
        if not target:
            # str.replace("", tag) would insert the tag between every character.
            continue
        entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]"
        masked = masked.replace(target, entity_group_tag).strip()
    return masked


def _entity_text(content: str, entry: dict) -> str:
    """
    Return the text to search for when masking ``entry``.

    ``entry["word"]`` is produced by decoding tokens, so it can differ from the
    passage text (casing, spacing around punctuation, sub-word pieces) and then
    never matches. When the entry carries usable character offsets, the exact
    substring of the passage is used instead; otherwise fall back to ``word``.
    """
    start, end = entry.get("start"), entry.get("end")
    if start is not None and end is not None and 0 <= start < end <= len(content):
        return content[start:end]
    return entry["word"]
+1,23 @@ +# NER PII Masking + +This module is inspired by +LlamaIndex ['PII Masking'](https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PII/#option-1-use-ner-model-for-pii-masking). + +Use a Hugging Face NER model for PII Masking. + +## What is PII Masking? + +PII (Personally Identifiable Information) Masking is a data protection method that obscures sensitive personal +information to prevent unauthorized access while retaining data utility for analysis or development. Techniques include +encryption, substitution, and scrambling, ensuring compliance and minimizing breach risks. + +## **Module Parameters** + +- **Not Applicable (N/A):** There are no direct module parameters specified for the `ner_pii_masking` module. + +## **Example config.yaml** + +```yaml +modules: + - module_type: ner_pii_masking +``` diff --git a/docs/source/nodes/passage_filter/passage_filter.md b/docs/source/nodes/passage_filter/passage_filter.md index bac749656..4d6530e7d 100644 --- a/docs/source/nodes/passage_filter/passage_filter.md +++ b/docs/source/nodes/passage_filter/passage_filter.md @@ -51,4 +51,5 @@ maxdepth: 1 --- similarity_threshold_cutoff.md similarity_percentile_cutoff.md +ner_pii_masking.md ``` diff --git a/sample_config/full.yaml b/sample_config/full.yaml index df95651b4..ee9c94f82 100644 --- a/sample_config/full.yaml +++ b/sample_config/full.yaml @@ -76,6 +76,9 @@ node_lines: - module_type: pass_passage_filter - module_type: similarity_threshold_cutoff threshold: 0.85 + - module_type: similarity_percentile_cutoff + threshold: 0.6 + - module_type: ner_pii_masking - node_type: passage_compressor strategy: metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] diff --git a/tests/autorag/nodes/passagefilter/test_ner_pii_masking.py b/tests/autorag/nodes/passagefilter/test_ner_pii_masking.py new file mode 100644 index 000000000..0d6d76011 --- /dev/null +++ b/tests/autorag/nodes/passagefilter/test_ner_pii_masking.py @@ -0,0 +1,15 @@ +from
def test_ner_pii_masking():
    """Run the undecorated function on the shared fixtures and check one known passage."""
    undecorated = ner_pii_masking.__wrapped__
    masked, kept_ids, kept_scores = undecorated(
        contents_list=contents_example,
        scores_list=scores_example,
        ids_list=ids_example,
    )
    # Tags carry the entity group and the character offset where the entity started.
    assert masked[1][3] == "[PER_0] is one of the members of [ORG_34]."
    base_passage_filter_test(masked, kept_ids, kept_scores)


def test_ner_pii_masking_node():
    """Run the decorated node against the fixture project and check the result frame."""
    node_output = ner_pii_masking(project_dir=project_dir, previous_result=previous_result)
    base_passage_filter_node_test(node_output)