From 39a53c8e5741775b640fe633a7ab9a85afa9a1b5 Mon Sep 17 00:00:00 2001
From: ChengZi
Date: Wed, 16 Oct 2024 20:22:51 +0800
Subject: [PATCH] _discard_invalid_meta

Signed-off-by: ChengZi
---
 src/milvus_haystack/document_store.py | 41 ++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/src/milvus_haystack/document_store.py b/src/milvus_haystack/document_store.py
index 2dbd93c..a9edea4 100644
--- a/src/milvus_haystack/document_store.py
+++ b/src/milvus_haystack/document_store.py
@@ -313,7 +313,7 @@ def write_documents(self, documents: List[Document], policy: DuplicatePolicy = D
 
         from pymilvus import Collection, MilvusException
 
-        documents_cp = deepcopy(documents)
+        documents_cp = [MilvusDocumentStore._discard_invalid_meta(doc) for doc in deepcopy(documents)]
         if len(documents_cp) > 0 and not isinstance(documents_cp[0], Document):
             err_msg = "param 'documents' must contain a list of objects of type Document"
             raise ValueError(err_msg)
@@ -905,3 +905,42 @@ def _convert_sparse_to_dict(self, sparse_embedding: SparseEmbedding) -> Dict:
 
     def _convert_dict_to_sparse(self, sparse_dict: Dict) -> SparseEmbedding:
         return SparseEmbedding(indices=list(sparse_dict.keys()), values=list(sparse_dict.values()))
+
+    @staticmethod
+    def _discard_invalid_meta(document: Document) -> Document:
+        """
+        Remove metadata fields with unsupported types from the document.
+
+        Every meta value is passed to pymilvus' ``infer_dtype_bydata``. Entries whose inferred type is
+        ``DataType.UNKNOWN`` or ``DataType.NONE`` are dropped, and one warning naming the dropped keys is
+        logged per document. The document is modified in place.
+
+        :param document: The document whose ``meta`` should be sanitized.
+        :returns: The same document instance, with ``meta`` reduced to the supported fields.
+        :raises ValueError: If ``document`` is not a ``Document`` instance.
+        """
+        from pymilvus import DataType
+        from pymilvus.orm.types import infer_dtype_bydata
+
+        if not isinstance(document, Document):
+            err_msg = "document must be an instance of Document"
+            raise ValueError(err_msg)
+        if document.meta:
+            discarded_keys = []
+            new_meta = {}
+            for key, value in document.meta.items():
+                dtype = infer_dtype_bydata(value)
+                if dtype in (DataType.UNKNOWN, DataType.NONE):
+                    discarded_keys.append(key)
+                else:
+                    new_meta[key] = value
+
+            if discarded_keys:
+                msg = (
+                    f"Document {document.id} has metadata fields with unsupported types: {discarded_keys}. "
+                    f"Supported types refer to Pymilvus DataType. The values of these fields will be discarded."
+                )
+                logger.warning(msg)
+            document.meta = new_meta
+
+        return document