Skip to content

Commit

Permalink
fix: make Document serialization backward compatible
Browse files Browse the repository at this point in the history
  • Loading branch information
masci committed Dec 18, 2024
1 parent 55f4aa5 commit f72ff29
Show file tree
Hide file tree
Showing 2 changed files with 91 additions and 36 deletions.
30 changes: 21 additions & 9 deletions llama-index-core/llama_index/core/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -934,25 +934,37 @@ def __init__(self, **data: Any) -> None:
If 'extra_info' was passed, store it in 'metadata'.
"""
if "doc_id" in data:
value = data.pop("doc_id")
if "id_" in data:
msg = "Cannot pass both 'doc_id' and 'id_' to create a Document, use 'id_'"
raise ValueError(msg)
data["id_"] = data.pop("doc_id")
msg = "'doc_id' is deprecated and 'id_' will be used instead"
logging.warning(msg)
else:
data["id_"] = value

if "extra_info" in data:
value = data.pop("extra_info")
if "metadata" in data:
msg = "Cannot pass both 'extra_info' and 'metadata' to create a Document, use 'metadata'"
raise ValueError(msg)
data["metadata"] = data.pop("extra_info")
msg = "'extra_info' is deprecated and 'metadata' will be used instead"
logging.warning(msg)
else:
data["metadata"] = value

if "text" in data:
text = data.pop("text")
if "text_resource" in data:
msg = "Cannot pass both 'text' and 'text_resource' to create a Document, use 'text_resource'"
raise ValueError(msg)
data["text_resource"] = MediaResource(text=data.pop("text"))
msg = "'text' is deprecated and 'text_resource' will be used instead"
logging.warning(msg)
else:
data["text_resource"] = MediaResource(text=text)

super().__init__(**data)

def custom_model_dump(self, handler: Any) -> Dict[str, Any]:
    """Serialize the document and expose its text under the legacy ``text`` key.

    The parent serializer builds the payload; ``self.text`` is then stored in
    it under ``"text"`` so the dump stays backward compatible with the text
    field.
    """
    serialized = super().custom_model_dump(handler)
    # The parent's dict is updated in place and returned as-is.
    serialized["text"] = self.text
    return serialized

@property
def text(self) -> str:
"""Provided for backward compatibility, it returns the content of text_resource."""
Expand Down
97 changes: 70 additions & 27 deletions llama-index-core/tests/schema/test_schema.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import base64
import logging
from io import BytesIO
from pathlib import Path
from unittest import mock
Expand Down Expand Up @@ -100,33 +101,46 @@ def test_build_text_node_text_resource() -> None:
assert node.text == "test data"


def test_document_init() -> None:
    """Legacy init keywords are accepted alone and rejected alongside the new ones."""
    # 'doc_id' on its own is readable through both 'doc_id' and 'id_'.
    by_doc_id = Document(doc_id="test")
    assert by_doc_id.doc_id == "test"
    assert by_doc_id.id_ == "test"
    # Passing both identifiers is an error.
    with pytest.raises(
        ValueError,
        match="Cannot pass both 'doc_id' and 'id_' to create a Document, use 'id_'",
    ):
        Document(id_="test", doc_id="test")

    # 'extra_info' on its own ends up in 'metadata'.
    by_extra_info = Document(extra_info={"key": "value"})
    assert by_extra_info.metadata == {"key": "value"}
    # Passing both metadata spellings is an error.
    with pytest.raises(
        ValueError,
        match="Cannot pass both 'extra_info' and 'metadata' to create a Document, use 'metadata'",
    ):
        Document(extra_info={}, metadata={})

    # 'text' on its own is readable through 'text' and 'text_resource'.
    by_text = Document(text="test")
    assert by_text.text == "test"
    assert by_text.text_resource
    assert by_text.text_resource.text == "test"
    # Passing both text spellings is an error.
    with pytest.raises(
        ValueError,
        match="Cannot pass both 'text' and 'text_resource' to create a Document, use 'text_resource'",
    ):
        Document(text="test", text_resource="test")
def test_document_init(caplog) -> None:
    """Legacy init keywords work alone; mixed with the new ones they lose and a warning is logged."""
    doc_id_warning = "'doc_id' is deprecated and 'id_' will be used instead"
    extra_info_warning = (
        "'extra_info' is deprecated and 'metadata' will be used instead"
    )
    text_warning = "'text' is deprecated and 'text_resource' will be used instead"

    with caplog.at_level(logging.WARNING):
        # Only the legacy identifier is given.
        legacy_id_doc = Document(doc_id="test")
        assert legacy_id_doc.doc_id == "test"
        assert legacy_id_doc.id_ == "test"
        # Both identifiers are given: 'id_' wins and a warning is logged.
        mixed_id_doc = Document(id_="test", doc_id="legacy_test")
        assert doc_id_warning in caplog.text
        assert mixed_id_doc.id_ == "test"
        caplog.clear()

        # Only the legacy metadata keyword is given.
        legacy_meta_doc = Document(extra_info={"key": "value"})
        assert legacy_meta_doc.metadata == {"key": "value"}
        assert legacy_meta_doc.extra_info == {"key": "value"}
        # Both metadata keywords are given: 'metadata' wins and a warning is logged.
        mixed_meta_doc = Document(
            extra_info={"old_key": "old_value"}, metadata={"key": "value"}
        )
        assert extra_info_warning in caplog.text
        assert mixed_meta_doc.metadata == {"key": "value"}
        assert mixed_meta_doc.extra_info == {"key": "value"}
        caplog.clear()

        # Only the legacy text keyword is given.
        legacy_text_doc = Document(text="test")
        assert legacy_text_doc.text == "test"
        assert legacy_text_doc.text_resource
        assert legacy_text_doc.text_resource.text == "test"
        # Both text keywords are given: 'text_resource' wins and a warning is logged.
        mixed_text_doc = Document(
            text="legacy_test", text_resource=MediaResource(text="test")
        )
        assert text_warning in caplog.text
        assert mixed_text_doc.text == "test"
        assert mixed_text_doc.text_resource
        assert mixed_text_doc.text_resource.text == "test"


def test_document_properties():
Expand All @@ -145,6 +159,35 @@ def test_document_str():
assert str(doc) == "Doc ID: test_id\nText: Lo..."


def test_document_legacy_roundtrip():
    """A Document built with the legacy ``text`` keyword dumps and reloads with the same text."""
    source_doc = Document(id_="test_id", text="this is a test")

    # Expected payload for the nested text resource.
    expected_text_resource = {
        "embeddings": None,
        "text": "this is a test",
        "mimetype": None,
        "path": None,
        "url": None,
    }
    # The dump carries the legacy top-level "text" key next to "text_resource".
    expected_dump = {
        "id_": "test_id",
        "embedding": None,
        "metadata": {},
        "excluded_embed_metadata_keys": [],
        "excluded_llm_metadata_keys": [],
        "relationships": {},
        "metadata_template": "{key}: {value}",
        "metadata_separator": "\n",
        "text": "this is a test",
        "text_resource": expected_text_resource,
        "image_resource": None,
        "audio_resource": None,
        "video_resource": None,
        "text_template": "{metadata_str}\n\n{content}",
        "class_name": "Document",
    }
    assert source_doc.model_dump() == expected_dump

    # Feed a fresh dump back into the constructor.
    rebuilt_doc = Document(**source_doc.model_dump())
    assert rebuilt_doc.text == "this is a test"


def test_image_document_empty():
doc = ImageDocument(id_="test")
assert doc.id_ == "test"
Expand Down

0 comments on commit f72ff29

Please sign in to comment.