From d1949fa25da6a7736c7fff4031bb00e2ddaef55b Mon Sep 17 00:00:00 2001 From: kimbwook Date: Fri, 12 Apr 2024 16:07:14 +0900 Subject: [PATCH 1/4] just commit for merge another branch --- autorag/nodes/passagefilter/ner_pii_masking.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 autorag/nodes/passagefilter/ner_pii_masking.py diff --git a/autorag/nodes/passagefilter/ner_pii_masking.py b/autorag/nodes/passagefilter/ner_pii_masking.py new file mode 100644 index 000000000..139597f9c --- /dev/null +++ b/autorag/nodes/passagefilter/ner_pii_masking.py @@ -0,0 +1,2 @@ + + From 6f6c18a6dc28961a13578f0c96772b3f12ea03ed Mon Sep 17 00:00:00 2001 From: kimbwook Date: Fri, 12 Apr 2024 17:15:59 +0900 Subject: [PATCH 2/4] just commit for merge another branch --- autorag/nodes/passagefilter/__init__.py | 1 + autorag/nodes/passagefilter/base.py | 17 ++++++++-- .../nodes/passagefilter/ner_pii_masking.py | 33 +++++++++++++++++++ autorag/support.py | 1 + .../passagefilter/test_ner_pii_masking.py | 15 +++++++++ 5 files changed, 64 insertions(+), 3 deletions(-) create mode 100644 tests/autorag/nodes/passagefilter/test_ner_pii_masking.py diff --git a/autorag/nodes/passagefilter/__init__.py b/autorag/nodes/passagefilter/__init__.py index 5e25e7e38..d46326a73 100644 --- a/autorag/nodes/passagefilter/__init__.py +++ b/autorag/nodes/passagefilter/__init__.py @@ -1,2 +1,3 @@ +from .ner_pii_masking import ner_pii_masking from .pass_passage_filter import pass_passage_filter from .threshold_cutoff import similarity_threshold_cutoff diff --git a/autorag/nodes/passagefilter/base.py b/autorag/nodes/passagefilter/base.py index 1b2f874bc..183625900 100644 --- a/autorag/nodes/passagefilter/base.py +++ b/autorag/nodes/passagefilter/base.py @@ -1,10 +1,11 @@ import functools +import os from pathlib import Path from typing import Union, Tuple, List import pandas as pd -from autorag.utils import result_to_dataframe, validate_qa_dataset +from autorag.utils import result_to_dataframe, validate_qa_dataset, fetch_contents # same with passage filter from now @@ -33,8 +34,18 @@ def wrapper( assert "retrieved_ids" in previous_result.columns, "previous_result must have retrieved_ids column." ids = previous_result["retrieved_ids"].tolist() - filtered_contents, filtered_ids, filtered_scores = func(queries=queries, contents_list=contents, - scores_list=scores, ids_list=ids, *args, **kwargs) + if func.__name__ == 'recency_filter': + corpus_df = pd.read_parquet(os.path.join(project_dir, "data", "corpus.parquet")) + metadatas = fetch_contents(corpus_df, ids, column_name='metadata') + times = [[time['last_modified_datetime'] for time in time_list] for time_list in metadatas] + filtered_contents, filtered_ids, filtered_scores \ + = func(contents_list=contents, scores_list=scores, ids_list=ids, time_list=times, *args, **kwargs) + elif func.__name__ == 'ner_pii_masking': + filtered_contents, filtered_ids, filtered_scores = func(contents_list=contents, + scores_list=scores, ids_list=ids, *args, **kwargs) + else: + filtered_contents, filtered_ids, filtered_scores = func(queries=queries, contents_list=contents, + scores_list=scores, ids_list=ids, *args, **kwargs) return filtered_contents, filtered_ids, filtered_scores diff --git a/autorag/nodes/passagefilter/ner_pii_masking.py b/autorag/nodes/passagefilter/ner_pii_masking.py index 139597f9c..aee0d4bae 100644 --- a/autorag/nodes/passagefilter/ner_pii_masking.py +++ b/autorag/nodes/passagefilter/ner_pii_masking.py @@ -1,2 +1,35 @@ +from typing import List, Tuple +from transformers import pipeline +from autorag.nodes.passagefilter.base import passage_filter_node + + +@passage_filter_node +def ner_pii_masking(contents_list: List[List[str]], + scores_list: List[List[float]], ids_list: List[List[str]], + ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: + """ + Mask PII in the contents using NER. + Uses HF transformers model. + + :param contents_list: The list of lists of contents to filter + :param scores_list: The list of lists of scores retrieved + :param ids_list: The list of lists of ids retrieved + :return: Tuple of lists containing the filtered contents, ids, and scores + """ + model = pipeline("ner", grouped_entities=True) + + masked_contents_list = list( + map(lambda contents: list(map(lambda content: mask_pii(model, content), contents)), contents_list)) + + return masked_contents_list, ids_list, scores_list + + +def mask_pii(model, text: str) -> str: + new_text = text + response = model(text) + for entry in response: + entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]" + new_text = new_text.replace(entry["word"], entity_group_tag).strip() + return new_text diff --git a/autorag/support.py b/autorag/support.py index 79e153772..48f048290 100644 --- a/autorag/support.py +++ b/autorag/support.py @@ -42,6 +42,7 @@ def get_support_modules(module_name: str) -> Callable: # passage_filter 'pass_passage_filter': ('autorag.nodes.passagefilter', 'pass_passage_filter'), 'similarity_threshold_cutoff': ('autorag.nodes.passagefilter', 'similarity_threshold_cutoff'), + '' # passage_compressor 'tree_summarize': ('autorag.nodes.passagecompressor', 'tree_summarize'), 'pass_compressor': ('autorag.nodes.passagecompressor', 'pass_compressor'), diff --git a/tests/autorag/nodes/passagefilter/test_ner_pii_masking.py b/tests/autorag/nodes/passagefilter/test_ner_pii_masking.py new file mode 100644 index 000000000..0d6d76011 --- /dev/null +++ b/tests/autorag/nodes/passagefilter/test_ner_pii_masking.py @@ -0,0 +1,15 @@ +from autorag.nodes.passagefilter import ner_pii_masking +from tests.autorag.nodes.passagefilter.test_passage_filter_base import base_passage_filter_test, contents_example, \ + ids_example, scores_example, project_dir, previous_result, base_passage_filter_node_test + + +def test_ner_pii_masking(): + original_ner = ner_pii_masking.__wrapped__ + contents, ids, scores = original_ner(contents_example, scores_example, ids_example) + assert contents[1][3] == "[PER_0] is one of the members of [ORG_34]." + base_passage_filter_test(contents, ids, scores) + + +def test_ner_pii_masking_node(): + result_df = ner_pii_masking(project_dir=project_dir, previous_result=previous_result) + base_passage_filter_node_test(result_df) From 799c1530281e4d391668411d357beb8b89b031b8 Mon Sep 17 00:00:00 2001 From: kimbwook Date: Fri, 12 Apr 2024 17:28:17 +0900 Subject: [PATCH 3/4] Implement ner_pii_masking --- .../nodes/passagefilter/ner_pii_masking.py | 2 +- autorag/support.py | 1 + .../nodes/passage_filter/ner_pii_masking.md | 23 +++++++++++++++++++ .../nodes/passage_filter/passage_filter.md | 1 + sample_config/full.yaml | 3 +++ 5 files changed, 29 insertions(+), 1 deletion(-) create mode 100644 docs/source/nodes/passage_filter/ner_pii_masking.md diff --git a/autorag/nodes/passagefilter/ner_pii_masking.py b/autorag/nodes/passagefilter/ner_pii_masking.py index aee0d4bae..420277779 100644 --- a/autorag/nodes/passagefilter/ner_pii_masking.py +++ b/autorag/nodes/passagefilter/ner_pii_masking.py @@ -11,7 +11,7 @@ def ner_pii_masking(contents_list: List[List[str]], ) -> Tuple[List[List[str]], List[List[str]], List[List[float]]]: """ Mask PII in the contents using NER. - Uses HF transformers model. + Use a Hugging Face NER model for PII Masking :param contents_list: The list of lists of contents to filter :param scores_list: The list of lists of scores retrieved diff --git a/autorag/support.py b/autorag/support.py index dbc126eaa..678f31e00 100644 --- a/autorag/support.py +++ b/autorag/support.py @@ -43,6 +43,7 @@ def get_support_modules(module_name: str) -> Callable: 'pass_passage_filter': ('autorag.nodes.passagefilter', 'pass_passage_filter'), 'similarity_threshold_cutoff': ('autorag.nodes.passagefilter', 'similarity_threshold_cutoff'), 'similarity_percentile_cutoff': ('autorag.nodes.passagefilter', 'similarity_percentile_cutoff'), + 'ner_pii_masking': ('autorag.nodes.passagefilter', 'ner_pii_masking'), # passage_compressor 'tree_summarize': ('autorag.nodes.passagecompressor', 'tree_summarize'), 'pass_compressor': ('autorag.nodes.passagecompressor', 'pass_compressor'), diff --git a/docs/source/nodes/passage_filter/ner_pii_masking.md b/docs/source/nodes/passage_filter/ner_pii_masking.md new file mode 100644 index 000000000..9b1866e7a --- /dev/null +++ b/docs/source/nodes/passage_filter/ner_pii_masking.md @@ -0,0 +1,23 @@ +# NER PII Masking + +This module is inspired by +LlamaIndex ['PII Masking'](https://docs.llamaindex.ai/en/stable/examples/node_postprocessor/PII/#option-1-use-ner-model-for-pii-masking). + +Use a Hugging Face NER model for PII Masking + +## What is PII Masking? + +PII(Personally Identifiable Information) Masking is a data protection method that obscures sensitive personal +information to prevent unauthorized access while retaining data utility for analysis or development. Techniques include +encryption, substitution, and scrambling, ensuring compliance and minimizing breach risks. + +## **Module Parameters** + +- **Not Applicable (N/A):** There are no direct module parameters specified for the `ner_pii_masking` module. + +## **Example config.yaml** + +```yaml +modules: + - module_type: ner_pii_masking +``` diff --git a/docs/source/nodes/passage_filter/passage_filter.md b/docs/source/nodes/passage_filter/passage_filter.md index bac749656..4d6530e7d 100644 --- a/docs/source/nodes/passage_filter/passage_filter.md +++ b/docs/source/nodes/passage_filter/passage_filter.md @@ -51,4 +51,5 @@ maxdepth: 1 --- similarity_threshold_cutoff.md similarity_percentile_cutoff.md +ner_pii_masking.md ``` diff --git a/sample_config/full.yaml b/sample_config/full.yaml index df95651b4..ee9c94f82 100644 --- a/sample_config/full.yaml +++ b/sample_config/full.yaml @@ -76,6 +76,9 @@ node_lines: - module_type: pass_passage_filter - module_type: similarity_threshold_cutoff threshold: 0.85 + - module_type: similarity_percentile_cutoff + threshold: 0.6 + - module_type: ner_pii_masking - node_type: passage_compressor strategy: metrics: [retrieval_token_f1, retrieval_token_recall, retrieval_token_precision] From ad5e07e57f86ba78d075a4fbd9fd376f46b0ce2c Mon Sep 17 00:00:00 2001 From: kimbwook Date: Fri, 12 Apr 2024 20:30:49 +0900 Subject: [PATCH 4/4] apply async --- .../nodes/passagefilter/ner_pii_masking.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/autorag/nodes/passagefilter/ner_pii_masking.py b/autorag/nodes/passagefilter/ner_pii_masking.py index 420277779..3ac3608f9 100644 --- a/autorag/nodes/passagefilter/ner_pii_masking.py +++ b/autorag/nodes/passagefilter/ner_pii_masking.py @@ -1,3 +1,4 @@ +import asyncio from typing import List, Tuple from transformers import pipeline @@ -20,16 +21,21 @@ def ner_pii_masking(contents_list: List[List[str]], """ model = pipeline("ner", grouped_entities=True) - masked_contents_list = list( - map(lambda contents: list(map(lambda content: mask_pii(model, content), contents)), contents_list)) + tasks = [mask_pii(model, contents) for contents in contents_list] + loop = asyncio.get_event_loop() + results = loop.run_until_complete(asyncio.gather(*tasks)) + masked_contents_list = list(results) return masked_contents_list, ids_list, scores_list -def mask_pii(model, text: str) -> str: - new_text = text - response = model(text) - for entry in response: - entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]" - new_text = new_text.replace(entry["word"], entity_group_tag).strip() - return new_text +async def mask_pii(model, contents: List[str]) -> List[str]: + new_contents_list = [] + for content in contents: + new_contents = content + response = model(content) + for entry in response: + entity_group_tag = f"[{entry['entity_group']}_{entry['start']}]" + new_contents = new_contents.replace(entry["word"], entity_group_tag).strip() + new_contents_list.append(new_contents) + return new_contents_list