[Gen-Ai-Orchestrator] Add title to embedded documents for better retrieval #1639

Closed
@@ -122,7 +122,9 @@ async def execute_qa_chain(query: RagQuery, debug: bool) -> RagResponse:
                 identifier=f'{doc.metadata["id"]}',
                 title=doc.metadata['title'],
                 url=doc.metadata['url'],
-                content=doc.page_content,
+                content=doc.metadata['original_text']
+                if 'original_text' in doc.metadata
+                else doc.page_content,
             ),
             response['source_documents'],
         )
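
As an aside (not part of the diff): documents indexed before this change carry no 'original_text' metadata, so the fallback above keeps returning their page_content, while newly indexed documents return the stored original text without the prepended title. A minimal sketch of that selection logic, using a LangChain Document and hypothetical sample values:

    from langchain.schema import Document

    def pick_content(doc: Document) -> str:
        # Same conditional as above: prefer the stored original text,
        # fall back to page_content for documents indexed before this change.
        return (
            doc.metadata['original_text']
            if 'original_text' in doc.metadata
            else doc.page_content
        )

    # Chunk indexed before this PR: no 'original_text' metadata.
    old_doc = Document(
        page_content='How do I cancel my contract?',
        metadata={'id': '1', 'title': 'Cancellation', 'url': 'https://example.com/cancel'},
    )
    # Chunk indexed after this PR: title prepended, original text kept in metadata.
    new_doc = Document(
        page_content='Titre: Cancellation\n\nHow do I cancel my contract?',
        metadata={
            'id': '2',
            'title': 'Cancellation',
            'url': 'https://example.com/cancel',
            'original_text': 'How do I cancel my contract?',
        },
    )

    print(pick_content(old_doc))  # the raw page_content
    print(pick_content(new_doc))  # the original text, without the 'Titre:' prefix
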
@@ -54,6 +54,7 @@

 import pandas as pd
 from docopt import docopt
+from langchain.document_loaders.dataframe import DataFrameLoader
 from langchain.embeddings.base import Embeddings
 from langchain.text_splitter import RecursiveCharacterTextSplitter
 from langchain_community.document_loaders import CSVLoader
@@ -100,13 +101,14 @@ def index_documents(args):
     )
 
     logging.debug(f"Read input CSV file {args['<input_csv>']}")
-    csv_loader = CSVLoader(
-        file_path=args['<input_csv>'],
-        source_column='url',
-        metadata_columns=('title', 'url'),
-        csv_args={'delimiter': '|', 'quotechar': '"'},
-    )
-    docs = csv_loader.load()
+    df = pd.read_csv(args['<input_csv>'], delimiter='|', quotechar='"')
+    # add original row nb
+    df['row'] = df.index
+    # add a 'source' metadata (this is the source's URL column at the moment,
+    # but may come from another column when input CSV file's format evolves)
+    df['source'] = df['url']
+    loader = DataFrameLoader(df, page_content_column='text')
+    docs = loader.load()
     for doc in docs:
         doc.metadata['index_session_id'] = session_uuid
         doc.metadata['index_datetime'] = formatted_datetime
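
For reference (not part of the diff), a minimal sketch of what the new loading path yields, built from a hypothetical one-row DataFrame mirroring the expected title|url|text CSV layout: DataFrameLoader takes the 'text' column as page_content and keeps every remaining column ('title', 'url', 'row', 'source') as metadata, which is what the later indexing steps rely on.

    import pandas as pd
    from langchain.document_loaders.dataframe import DataFrameLoader

    # Hypothetical input mirroring the expected pipe-delimited CSV (title|url|text).
    df = pd.DataFrame(
        {
            'title': ['Cancellation'],
            'url': ['https://example.com/cancel'],
            'text': ['How do I cancel my contract?'],
        }
    )
    df['row'] = df.index      # original row number, as in the diff
    df['source'] = df['url']  # 'source' metadata, currently the URL column

    loader = DataFrameLoader(df, page_content_column='text')
    docs = loader.load()

    print(docs[0].page_content)  # 'How do I cancel my contract?'
    print(docs[0].metadata)      # {'title': ..., 'url': ..., 'row': 0, 'source': ...}
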
@@ -120,6 +122,9 @@ def index_documents(args):
     splitted_docs = text_splitter.split_documents(docs)
     # Add chunk id ('n/N') metadata to each chunk
     splitted_docs = generate_ids_for_each_chunks(splitted_docs=splitted_docs)
+    # Add title to text (for better semantic search) and text to
+    # metadata (to easily get original text without title)
+    splitted_docs = add_title_to_text(splitted_docs=splitted_docs)
 
     logging.debug(f"Get embeddings model from {args['<embeddings_cfg>']} config file")
     with open(args['<embeddings_cfg>'], 'r') as file:
@@ -164,6 +169,21 @@ def generate_ids_for_each_chunks(
     return splitted_docs
 
 
+def add_title_to_text(
+    splitted_docs: Iterable[Document],
+) -> Iterable[Document]:
+    """Add 'title' from metadata to Document's page_content for better semantic search."""
+    for doc in splitted_docs:
+        # Store the original page_content in the metadata
+        doc.metadata['original_text'] = doc.page_content

Review comment (Member): This will double the size of the database compared to the previous implementation. For small databases this is not an issue, but for bigger ones it will have a significant storage (and cost) impact.

+        # Add title to page_content
+        if 'title' in doc.metadata:
+            title = doc.metadata['title']
+            doc.page_content = f'Titre: {title}\n\n{doc.page_content}'
+    return splitted_docs


 def em_settings_from_config(setting_dict: dict) -> BaseEMSetting:
     """Get embeddings settings from config dict."""
     # Create settings class according to embeddings provider from config file
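
To make the new indexing behaviour concrete (a sketch, not part of the PR), here is the add_title_to_text logic applied to one already-split chunk: the title ends up prepended to the text that gets embedded, while the untouched text is kept under metadata['original_text'] so the RAG endpoint can return it as-is. Sample values are hypothetical.

    from typing import Iterable

    from langchain.schema import Document


    def add_title_to_text(splitted_docs: Iterable[Document]) -> Iterable[Document]:
        """Same logic as in the diff, reproduced here so the example is self-contained."""
        for doc in splitted_docs:
            # Keep the untouched text so it can be returned without the title prefix
            doc.metadata['original_text'] = doc.page_content
            # Prepend the title to the text that will be embedded
            if 'title' in doc.metadata:
                title = doc.metadata['title']
                doc.page_content = f'Titre: {title}\n\n{doc.page_content}'
        return splitted_docs


    chunk = Document(
        page_content='How do I cancel my contract?',
        metadata={'title': 'Cancellation', 'url': 'https://example.com/cancel'},
    )
    add_title_to_text(splitted_docs=[chunk])

    print(chunk.page_content)               # 'Titre: Cancellation\n\nHow do I cancel my contract?'
    print(chunk.metadata['original_text'])  # 'How do I cancel my contract?'
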