From 17c2f063388c7297939d35ae16320c596659ed9b Mon Sep 17 00:00:00 2001
From: Yuhong Sun
Date: Thu, 30 Nov 2023 13:24:22 -0800
Subject: [PATCH] Add more metadata options for File connector (#797)

---
 backend/danswer/connectors/file/connector.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/backend/danswer/connectors/file/connector.py b/backend/danswer/connectors/file/connector.py
index df28d7ed4d4..8d73fe95e0b 100644
--- a/backend/danswer/connectors/file/connector.py
+++ b/backend/danswer/connectors/file/connector.py
@@ -12,6 +12,7 @@
 from danswer.connectors.cross_connector_utils.file_utils import load_files_from_zip
 from danswer.connectors.cross_connector_utils.file_utils import read_file
 from danswer.connectors.cross_connector_utils.file_utils import read_pdf_file
+from danswer.connectors.cross_connector_utils.time_utils import time_str_to_utc
 from danswer.connectors.file.utils import check_file_ext_is_valid
 from danswer.connectors.file.utils import get_file_ext
 from danswer.connectors.interfaces import GenerateDocumentsOutput
@@ -63,15 +64,20 @@ def _process_file(
     else:
         file_content_raw, metadata = read_file(file)
 
+    dt_str = metadata.get("doc_updated_at")
+    final_time_updated = time_str_to_utc(dt_str) if dt_str else time_updated
+
     return [
         Document(
             id=file_name,
             sections=[
-                Section(link=metadata.get("link", None), text=file_content_raw.strip())
+                Section(link=metadata.get("link"), text=file_content_raw.strip())
             ],
             source=DocumentSource.FILE,
             semantic_identifier=file_name,
-            doc_updated_at=time_updated,
+            doc_updated_at=final_time_updated,
+            primary_owners=metadata.get("primary_owners"),
+            secondary_owners=metadata.get("secondary_owners"),
             metadata={},
         )
     ]