Help needed: Issue with split_overlap
in DocumentSplitter using Haystack 2.6.1 and Weaviate
#8511
Replies: 7 comments 3 replies
-
@davidsbatista any ideas? I haven't investigated, but this may be a bug. |
Beta Was this translation helpful? Give feedback.
-
which |
Beta Was this translation helpful? Give feedback.
-
With the following pip packages
and with the code:
import os
from haystack import Pipeline
from haystack.components.converters import PyPDFToDocument
from haystack.components.embedders import SentenceTransformersDocumentEmbedder
from haystack.components.preprocessors import DocumentSplitter, DocumentCleaner
from haystack.components.writers import DocumentWriter
from haystack_integrations.document_stores.weaviate import WeaviateDocumentStore
from haystack_integrations.document_stores.weaviate import AuthApiKey
from haystack.utils import Secret
def pip_write_document(files):
    """Index ``files`` into a Weaviate collection and return the document store.

    The store is created from the ``WEAVIATE_API_KEY`` and ``WEAVIATE_API_URL``
    environment variables and targets the ``new_collection`` class. The files
    are passed to the converter as ``sources`` and then flow through
    converter -> cleaner -> splitter -> embedder -> writer.

    The splitter works on words (``split_length=250``, ``split_overlap=50``);
    a non-zero overlap is the setting this discussion is about.
    """
    store = WeaviateDocumentStore(
        auth_client_secret=AuthApiKey(Secret.from_env_var("WEAVIATE_API_KEY")),
        url=os.environ["WEAVIATE_API_URL"],
        collection_settings={"class": "new_collection"},
    )

    # Components are listed in the order they are registered on the pipeline.
    stages = [
        ("converter", PyPDFToDocument()),
        ("cleaner", DocumentCleaner()),
        ("splitter", DocumentSplitter(split_by="word", split_length=250, split_overlap=50)),
        ("embedder", SentenceTransformersDocumentEmbedder(model="sentence-transformers/all-MiniLM-L6-v2")),
        ("writer", DocumentWriter(document_store=store)),
    ]

    indexing = Pipeline()
    for stage_name, component in stages:
        indexing.add_component(stage_name, component)

    # Each pair is (sender, receiver); the last link names the embedder's
    # "documents" output explicitly, as in the original snippet.
    links = (
        ("converter", "cleaner"),
        ("cleaner", "splitter"),
        ("splitter", "embedder"),
        ("embedder.documents", "writer"),
    )
    for sender, receiver in links:
        indexing.connect(sender, receiver)

    indexing.run({"converter": {"sources": files}})
    return store
def main():
doc_store = pip_write_document(['NYSE_RHT_2019.pdf', 'hellofresh-se_2023.pdf'])
which is based on your code. It worked both with a local Docker image and with the hosted free API. These are the settings that appear to me on the hosted Weaviate collections |
Beta Was this translation helpful? Give feedback.
-
Now, there's still a problem when I try to retrieve documents:
In [11]: doc_store.filter_documents()
The following error shows up, which is similar to the one reported - I will focus on trying to solve this.
---------------------------------------------------------------------------
AioRpcError Traceback (most recent call last)
File ~/test/.venv/lib/python3.12/site-packages/weaviate/collections/grpc/query.py:804, in _QueryGRPC.__call(self, request)
803 assert self._connection.grpc_stub is not None
--> 804 res = await _Retry(4).with_exponential_backoff(
805 0,
806 f"Searching in collection {request.collection}",
807 self._connection.grpc_stub.Search,
808 request,
809 metadata=self._connection.grpc_headers(),
810 timeout=self._connection.timeout_config.query,
811 )
812 return cast(search_get_pb2.SearchReply, res)
File ~/test/.venv/lib/python3.12/site-packages/weaviate/collections/grpc/retry.py:31, in _Retry.with_exponential_backoff(self, count, error, f, *args, **kwargs)
30 if e.code() != StatusCode.UNAVAILABLE:
---> 31 raise e
32 logger.info(
33 f"{error} received exception: {e}. Retrying with exponential backoff in {2**count} seconds"
34 )
File ~/test/.venv/lib/python3.12/site-packages/weaviate/collections/grpc/retry.py:28, in _Retry.with_exponential_backoff(self, count, error, f, *args, **kwargs)
27 try:
---> 28 return await f(*args, **kwargs)
29 except AioRpcError as e:
File ~/test/.venv/lib/python3.12/site-packages/grpc/aio/_call.py:327, in _UnaryResponseMixin.__await__(self)
326 else:
--> 327 raise _create_rpc_error(
328 self._cython_call._initial_metadata,
329 self._cython_call._status,
330 )
331 else:
AioRpcError: <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for _split_overlap: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {created_time:"2024-11-01T15:41:57.063495+01:00", grpc_status:2, grpc_message:"creating primitive value for _split_overlap: proto:\xc2\xa0invalid type: []interface {}"}"
>
During handling of the above exception, another exception occurred:
WeaviateQueryError Traceback (most recent call last)
Cell In[11], line 1
----> 1 doc_store.filter_documents()
File ~/test/.venv/lib/python3.12/site-packages/haystack_integrations/document_stores/weaviate/document_store.py:398, in WeaviateDocumentStore.filter_documents(self, filters)
396 else:
397 result = self._query()
--> 398 return [self._to_document(doc) for doc in result]
File ~/test/.venv/lib/python3.12/site-packages/weaviate/collections/iterator.py:59, in _ObjectIterator.__next__(self)
57 def __next__(self) -> Object[TProperties, TReferences]:
58 if len(self.__iter_object_cache) == 0:
---> 59 res = self.__query.fetch_objects(
60 limit=self.__iter_cache_size,
61 after=self.__iter_object_last_uuid,
62 include_vector=self.__inputs.include_vector,
63 return_metadata=self.__inputs.return_metadata,
64 return_properties=self.__inputs.return_properties,
65 return_references=self.__inputs.return_references,
66 )
67 self.__iter_object_cache = res.objects # type: ignore
68 if len(self.__iter_object_cache) == 0:
File ~/test/.venv/lib/python3.12/site-packages/weaviate/syncify.py:23, in convert.<locals>.sync_method(self, __new_name, *args, **kwargs)
20 @wraps(method) # type: ignore
21 def sync_method(self, *args, __new_name=new_name, **kwargs):
22 async_func = getattr(cls, __new_name)
---> 23 return _EventLoopSingleton.get_instance().run_until_complete(
24 async_func, self, *args, **kwargs
25 )
File ~/test/.venv/lib/python3.12/site-packages/weaviate/event_loop.py:41, in _EventLoop.run_until_complete(self, f, *args, **kwargs)
39 raise WeaviateClosedClientError()
40 fut = asyncio.run_coroutine_threadsafe(f(*args, **kwargs), self.loop)
---> 41 return fut.result()
File /opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/concurrent/futures/_base.py:456, in Future.result(self, timeout)
454 raise CancelledError()
455 elif self._state == FINISHED:
--> 456 return self.__get_result()
457 else:
458 raise TimeoutError()
File /opt/homebrew/Cellar/python@3.12/3.12.6/Frameworks/Python.framework/Versions/3.12/lib/python3.12/concurrent/futures/_base.py:401, in Future.__get_result(self)
399 if self._exception:
400 try:
--> 401 raise self._exception
402 finally:
403 # Break a reference cycle with the exception in self._exception
404 self = None
File ~/test/.venv/lib/python3.12/site-packages/weaviate/collections/queries/fetch_objects/query.py:65, in _FetchObjectsQueryAsync.fetch_objects(self, limit, offset, after, filters, sort, include_vector, return_metadata, return_properties, return_references)
18 async def fetch_objects(
19 self,
20 *,
(...)
29 return_references: Optional[ReturnReferences[TReferences]] = None
30 ) -> QueryReturnType[Properties, References, TProperties, TReferences]:
31 """Retrieve the objects in this collection without any search.
32
33 Arguments:
(...)
63 If the network connection to Weaviate fails.
64 """
---> 65 res = await self._query.get(
66 limit=limit,
67 offset=offset,
68 after=after,
69 filters=filters,
70 sort=sort,
71 return_metadata=self._parse_return_metadata(return_metadata, include_vector),
72 return_properties=self._parse_return_properties(return_properties),
73 return_references=self._parse_return_references(return_references),
74 )
75 return self._result_to_query_return(
76 res,
77 _QueryOptions.from_input(
(...)
85 return_references,
86 )
File ~/test/.venv/lib/python3.12/site-packages/weaviate/collections/grpc/query.py:814, in _QueryGRPC.__call(self, request)
812 return cast(search_get_pb2.SearchReply, res)
813 except (AioRpcError, WeaviateRetryError) as e:
--> 814 raise WeaviateQueryError(str(e), "GRPC search")
WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for _split_overlap: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {created_time:"2024-11-01T15:41:57.063495+01:00", grpc_status:2, grpc_message:"creating primitive value for _split_overlap: proto:\xc2\xa0invalid type: []interface {}"}"
>. |
Beta Was this translation helpful? Give feedback.
-
I've been investigating this issue for a while.
properties = [p.name for p in self.collection.config.get().properties]
try:
result = self.collection.iterator(include_vector=True, return_properties=properties)
At this point, we should probably open an issue on Weaviate... |
Beta Was this translation helpful? Give feedback.
-
@anakin87 can you open an issue on weaviate? |
Beta Was this translation helpful? Give feedback.
-
Haystack issue: deepset-ai/haystack-core-integrations#1172 |
Beta Was this translation helpful? Give feedback.
-
Hello to everyone,
I am using Haystack (v2.6.1) with Weaviate as the document store. I encounter a problem when using DocumentSplitter with split_overlap greater than zero. When split_overlap is set to a value greater than zero, a _split_overlap field is added to the Document, causing errors both when writing to the database and during retrieval with WeaviateEmbeddingRetriever or WeaviateBM25Retriever.
Code used for writing to the database
This function returns the following error:
Traceback (most recent call last):
File "/home/stefano/projects/techbuddy/_progetti_test/weaviate/main.py", line 462, in <module>
pip_write_document(FilesRetriever("data").retrieve_files())
File "/home/stefano/projects/techbuddy/_progetti_test/weaviate/main.py", line 308, in pip_write_document
write_pipeline.run({"converter": {"sources": files}})
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack/core/pipeline/pipeline.py", line 229, in run
res: Dict[str, Any] = self._run_component(name, components_inputs[name])
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack/core/pipeline/pipeline.py", line 67, in _run_component
res: Dict[str, Any] = instance.run(**inputs)
^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack/components/writers/document_writer.py", line 101, in run
documents_written = self.document_store.write_documents(documents=documents, policy=policy)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack_integrations/document_stores/weaviate/document_store.py", line 485, in write_documents
return self._batch_write(documents)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack_integrations/document_stores/weaviate/document_store.py", line 435, in _batch_write
raise DocumentStoreError(msg)
haystack.document_stores.errors.errors.DocumentStoreError: Failed to write object with id '8a2bd66589d4c9f25f0010ffebe7ae39b3462d7efc22bba10b02d02da8bdd8dd'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:fe648d7a9bb493b97889b74c05dc73b00faddf325c9a009b1c8b734064b11017 range:[%!s(float64=0) %!s(float64=370)]]]")'
Failed to write object with id 'fe648d7a9bb493b97889b74c05dc73b00faddf325c9a009b1c8b734064b11017'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:8a2bd66589d4c9f25f0010ffebe7ae39b3462d7efc22bba10b02d02da8bdd8dd range:[%!s(float64=1406) %!s(float64=1776)]] map[doc_id:83417f06daf15892787e8c3b8bd035582d264a5a06985c4258a4a4d81017e69f range:[%!s(float64=0) %!s(float64=358)]]]")'
Failed to write object with id '83417f06daf15892787e8c3b8bd035582d264a5a06985c4258a4a4d81017e69f'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:fe648d7a9bb493b97889b74c05dc73b00faddf325c9a009b1c8b734064b11017 range:[%!s(float64=1411) %!s(float64=1769)]] map[doc_id:42a0cfd1660e4b209260c3f88680f35f66c0a6785413377ab34961fbf7bd3ae2 range:[%!s(float64=0) %!s(float64=342)]]]")'
Failed to write object with id '42a0cfd1660e4b209260c3f88680f35f66c0a6785413377ab34961fbf7bd3ae2'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:83417f06daf15892787e8c3b8bd035582d264a5a06985c4258a4a4d81017e69f range:[%!s(float64=1465) %!s(float64=1807)]] map[doc_id:b738f9a66b9ae022e4e9fbca42459c20662ea8bf152fa03f54bc774973c590a1 range:[%!s(float64=0) %!s(float64=333)]]]")'
Failed to write object with id 'b738f9a66b9ae022e4e9fbca42459c20662ea8bf152fa03f54bc774973c590a1'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:42a0cfd1660e4b209260c3f88680f35f66c0a6785413377ab34961fbf7bd3ae2 range:[%!s(float64=1505) %!s(float64=1838)]] map[doc_id:9363a5a8342987bbd8a48656a3338fd6e237619b76de8764dcc9d7e193907ab6 range:[%!s(float64=0) %!s(float64=274)]]]")'
Failed to write object with id '9363a5a8342987bbd8a48656a3338fd6e237619b76de8764dcc9d7e193907ab6'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:b738f9a66b9ae022e4e9fbca42459c20662ea8bf152fa03f54bc774973c590a1 range:[%!s(float64=1240) %!s(float64=1514)]] map[doc_id:5dc098c8ce84ec313e1909d7037ac2c5771f193c63b22b462bf62f76aed44060 range:[%!s(float64=0) %!s(float64=339)]]]")'
Failed to write object with id '5dc098c8ce84ec313e1909d7037ac2c5771f193c63b22b462bf62f76aed44060'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:9363a5a8342987bbd8a48656a3338fd6e237619b76de8764dcc9d7e193907ab6 range:[%!s(float64=1154) %!s(float64=1493)]] map[doc_id:2626be78a522bc98fb9001396539291a455ce7c0647eaea1de3e2c340c7bb425 range:[%!s(float64=0) %!s(float64=382)]]]")'
Failed to write object with id '2626be78a522bc98fb9001396539291a455ce7c0647eaea1de3e2c340c7bb425'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:5dc098c8ce84ec313e1909d7037ac2c5771f193c63b22b462bf62f76aed44060 range:[%!s(float64=1374) %!s(float64=1756)]] map[doc_id:7ac5ef216d348e18bb215433d599717d83089c748a2881d06583d83934b805bc range:[%!s(float64=0) %!s(float64=371)]]]")'
Failed to write object with id '7ac5ef216d348e18bb215433d599717d83089c748a2881d06583d83934b805bc'. Error: 'WeaviateInsertManyAllFailedError("Every object failed during insertion. Here is the set of all errors: invalid text array property '_split_overlap' on class 'Weatest': invalid text array value: [map[doc_id:2626be78a522bc98fb9001396539291a455ce7c0647eaea1de3e2c340c7bb425 range:[%!s(float64=1436) %!s(float64=1807)]]]")'
[1]+ Done clear
Solution adopted for writing
To solve the writing problem, I defined the class of the object to be written to the database through the collection_settings parameter when instantiating the WeaviateDocumentStore class:
With this adjustment, I am able to write to the database without errors.
Problem during reading from the database
The problem also occurs during reading from the database. When attempting to retrieve with WeaviateEmbeddingRetriever or WeaviateBM25Retriever, I receive the following error:
Traceback (most recent call last):
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/weaviate/collections/grpc/query.py", line 798, in __call
res = await self._connection.grpc_stub.Search(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/grpc/aio/_call.py", line 327, in __await__
raise _create_rpc_error(
grpc.aio._call.AioRpcError: <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for _split_overlap: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {created_time:"2024-10-31T15:47:15.576706374+01:00", grpc_status:2, grpc_message:"creating primitive value for _split_overlap: proto:\xc2\xa0invalid type: []interface {}"}"
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/stefano/projects/techbuddy/_progetti_test/weaviate/main.py", line 466, in <module>
retrieve_documents_BM25()
File "/home/stefano/projects/techbuddy/_progetti_test/weaviate/main.py", line 429, in retrieve_documents_BM25
results = retriever.run(query="formattazione")
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack_integrations/components/retrievers/weaviate/bm25_retriever.py", line 107, in run
documents = self._document_store._bm25_retrieval(query=query, filters=filters, top_k=top_k)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/haystack_integrations/document_stores/weaviate/document_store.py", line 502, in _bm25_retrieval
result = self.collection.query.bm25(
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/weaviate/syncify.py", line 23, in sync_method
return _EventLoopSingleton.get_instance().run_until_complete(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/weaviate/event_loop.py", line 40, in run_until_complete
return fut.result()
^^^^^^^^^^^^
File "/usr/lib/python3.12/concurrent/futures/_base.py", line 456, in result
return self.__get_result()
^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/concurrent/futures/_base.py", line 401, in __get_result
raise self._exception
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/weaviate/collections/queries/bm25/query.py", line 85, in bm25
res = await self._query.bm25(
^^^^^^^^^^^^^^^^^^^^^^^
File "/home/stefano/projects/techbuddy/venv_techbuddy/lib/python3.12/site-packages/weaviate/collections/grpc/query.py", line 805, in __call
raise WeaviateQueryError(str(e), "GRPC search") # pyright: ignore
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
weaviate.exceptions.WeaviateQueryError: Query call with protocol GRPC search failed with message <AioRpcError of RPC that terminated with:
status = StatusCode.UNKNOWN
details = "creating primitive value for _split_overlap: proto: invalid type: []interface {}"
debug_error_string = "UNKNOWN:Error received from peer {created_time:"2024-10-31T15:47:15.576706374+01:00", grpc_status:2, grpc_message:"creating primitive value for _split_overlap: proto:\xc2\xa0invalid type: []interface {}"}"
Below is an example of the function I use:
Working alternative
Using the Weaviate library directly, everything works correctly:
At this point, I kindly ask for your help to understand if I am making a mistake or if it is a potential bug. Any guidance or suggestions would be greatly appreciated!
Beta Was this translation helpful? Give feedback.
All reactions