From 0061d8bc2cba832d786165079e10a8b830a85306 Mon Sep 17 00:00:00 2001
From: Ikko Eltociear Ashimine
Date: Tue, 8 Oct 2024 02:34:49 +0900
Subject: [PATCH 1/4] docs: update BGEM3Demo.ipynb (#16395)

---
 docs/docs/examples/managed/BGEM3Demo.ipynb | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/docs/examples/managed/BGEM3Demo.ipynb b/docs/docs/examples/managed/BGEM3Demo.ipynb
index d41ef6ea64aa0..d617466b4f98d 100644
--- a/docs/docs/examples/managed/BGEM3Demo.ipynb
+++ b/docs/docs/examples/managed/BGEM3Demo.ipynb
@@ -89,7 +89,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Retrieve relavant documents"
+    "## Retrieve relevant documents"
    ]
   },
   {

From 54f5a0bbea46c49f653f628f18964a9a0cb66455 Mon Sep 17 00:00:00 2001
From: Rakesh Mehta <46493063+rakeshmehta0308@users.noreply.github.com>
Date: Mon, 7 Oct 2024 18:40:55 +0100
Subject: [PATCH 2/4] Fix ZyteWebReader article parsing (#16401)

---
 .../data_connectors/WebPageDemo.ipynb        | 75 ++++++++++++++++---
 .../llama_index/readers/web/zyte_web/base.py | 14 +++-
 .../llama-index-readers-web/pyproject.toml   |  2 +-
 3 files changed, 75 insertions(+), 16 deletions(-)

diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
index 3330cdea468cf..197f4a08fc72c 100644
--- a/docs/docs/examples/data_connectors/WebPageDemo.ipynb
+++ b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -620,15 +620,23 @@
    "pip install zyte-api\n",
    "```\n",
    "\n",
-   "To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/"
+   "To get access to your ZYTE API key please visit: https://docs.zyte.com/zyte-api/get-started.html"
   ]
  },
 {
  "cell_type": "code",
  "execution_count": null,
- "id": "f49f22bf",
+ "id": "31e1aaa5-8bfc-452f-9c72-15def22f872f",
  "metadata": {},
- "outputs": [],
+ "outputs": [
+  {
+   "name": "stdout",
+   "output_type": "stream",
+   "text": [
+    "5871\n"
+   ]
+  }
+ ],
  "source": [
   "from llama_index.readers.web import ZyteWebReader\n",
   "\n",
   "# optional: nest_asyncio in case you run this in jupyter notebooks\n",
   "# import nest_asyncio\n",
   "# nest_asyncio.apply()\n",
   "\n",
+  "\n",
+  "# Initiate ZyteWebReader with your Zyte API key\n",
+  "zyte_reader = ZyteWebReader(\n",
+  "    api_key=\"your ZYTE API key here\",\n",
+  "    mode=\"article\",  # or \"html-text\" or \"html\"\n",
+  ")\n",
+  "\n",
+  "urls = [\n",
+  "    \"https://www.zyte.com/blog/web-scraping-apis/\",\n",
+  "    \"https://www.zyte.com/blog/system-integrators-extract-big-data/\",\n",
+  "]\n",
+  "\n",
+  "documents = zyte_reader.load_data(\n",
+  "    urls=urls,\n",
+  ")\n",
+  "\n",
+  "print(len(documents[0].text))"
  ]
 },
+{
+ "cell_type": "markdown",
+ "id": "c21ae76e-1b2c-480e-a58f-9f9becce15a6",
+ "metadata": {},
+ "source": [
+  "Browser rendering and JavaScript can be enabled by setting the corresponding parameters during initialization."
+ ]
+},
 {
  "cell_type": "code",
  "execution_count": null,
  "id": "f49f22bf",
  "metadata": {},
  "outputs": [],
  "source": [
   "zyte_dw_params = {\n",
   "    \"browserHtml\": True,  # Enable browser rendering\n",
   "    \"javascript\": True,  # Enable JavaScript\n",
   "}\n",
   "\n",
-  "# Initiate ZyteWebReader with your Zyte API key\n",
+  "# Initiate ZyteWebReader with your Zyte API key and use default \"article\" mode\n",
   "zyte_reader = ZyteWebReader(\n",
-  "    api_key=\"Your Zyte API Key\",\n",
+  "    api_key=\"your ZYTE API key here\",\n",
   "    download_kwargs=zyte_dw_params,\n",
   ")\n",
   "\n",
-  "# Load documents from URLs as markdown\n",
+  "# Load documents from URLs\n",
   "documents = zyte_reader.load_data(\n",
-  "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+  "    urls=urls,\n",
   ")"
  ]
 },
 {
  "data": {
   "text/plain": [
-   "7150"
+   "4355"
   ]
  },
  "execution_count": null,
  "len(documents[0].text)"
 ]
},
+{
+ "cell_type": "markdown",
+ "id": "133d26d7-c26d-40b2-b08f-6c838fd3a6b6",
+ "metadata": {},
+ "source": [
+  "Set \"continue_on_failure\" to False if you'd like to stop when any request fails."
+ ]
+},
 {
  "cell_type": "code",
  "execution_count": null,
  "outputs": [],
  "source": [
   "zyte_reader = ZyteWebReader(\n",
-  "    api_key=\"Your API Key\",\n",
+  "    api_key=\"your ZYTE API key here\",\n",
   "    mode=\"html-text\",\n",
   "    download_kwargs=zyte_dw_params,\n",
+  "    continue_on_failure=False,\n",
   ")\n",
   "\n",
-  "# Load documents from URLs as markdown\n",
+  "# Load documents from URLs\n",
   "documents = zyte_reader.load_data(\n",
-  "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+  "    urls=urls,\n",
   ")"
  ]
 },
 {
  "data": {
   "text/plain": [
-   "19554"
+   "17488"
   ]
  },
  "execution_count": null,
diff --git a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py
index cf11bc5925bdc..10e59a733a604 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py
+++ b/llama-index-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py
@@ -44,6 +44,7 @@ class ZyteWebReader(BasePydanticReader):
     mode: str
     n_conn: int
     download_kwargs: Optional[dict]
+    continue_on_failure: bool

     def __init__(
         self,
@@ -51,10 +52,15 @@ def __init__(
         mode: Literal["article", "html", "html-text"] = "article",
         n_conn: int = 15,
         download_kwargs: Optional[Dict[str, Any]] = None,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with file path."""
         super().__init__(
-            api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs
+            api_key=api_key,
+            mode=mode,
+            n_conn=n_conn,
+            download_kwargs=download_kwargs,
+            continue_on_failure=continue_on_failure,
         )
         try:
             from zyte_api import AsyncZyteAPI
@@ -81,12 +87,14 @@ def class_name(cls) -> str:
         return "ZyteWebReader"

     def _zyte_html_option(self) -> str:
-        if "browserHtml" in self.download_kwargs:
+        if self.download_kwargs and "browserHtml" in self.download_kwargs:
             return "browserHtml"
         return "httpResponseBody"

     def _get_article(self, page: Dict) -> str:
-        return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"]
+        headline = page["article"].get("headline", "")
+        article_body = page["article"].get("articleBody", "")
+        return headline + "\n\n" + article_body

     def _zyte_request_params(self, url: str) -> dict:
         request_params: Dict[str, Any] = {"url": url}
diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
index c410a02d9d0f5..132eb64a4b03d 100644
--- a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
+++ b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -45,7 +45,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.2.3"
+version = "0.2.4"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
From 4655797b6dc65a9fbddfcfe6b15b6f4eda0a4e73 Mon Sep 17 00:00:00 2001
From: Logan
Date: Mon, 7 Oct 2024 12:06:44 -0600
Subject: [PATCH 3/4] don't include tool calls if there was none (#16408)

---
 .github/workflows/coverage.yml                  | 9 ++++++---
 .../llama_index/llms/openai/base.py             | 6 ++++--
 .../llama_index/llms/openai/utils.py            | 2 +-
 .../llms/llama-index-llms-openai/pyproject.toml | 2 +-
 4 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
index 9ac0bff8c26d9..873a8fad1e1e2 100644
--- a/.github/workflows/coverage.yml
+++ b/.github/workflows/coverage.yml
@@ -52,11 +52,13 @@ jobs:
           CHANGED_FILES=$(pants list --changed-since=origin/main)

           # Find which roots contain changed files
+          CHANGED_ROOTS=""
           FILTER_PATTERNS="["
           for file in $CHANGED_FILES; do
             root=$(echo "$file" | cut -d'/' -f1,2,3)
             if [[ ! "$FILTER_PATTERNS" =~ "$root" ]]; then
               FILTER_PATTERNS="${FILTER_PATTERNS}'${root}',"
+              CHANGED_ROOTS="${CHANGED_ROOTS} ${root}/::"
             fi
           done
@@ -64,8 +66,9 @@
           FILTER_PATTERNS="${FILTER_PATTERNS%,}]"

           echo "Coverage filter patterns: $FILTER_PATTERNS"
+          echo "Changed roots: $CHANGED_ROOTS"

-          pants --level=error --no-local-cache test \
+          pants --no-local-cache test \
             --test-use-coverage \
-            --changed-since=origin/main \
-            --coverage-py-filter="$FILTER_PATTERNS"
+            --coverage-py-filter="${FILTER_PATTERNS}" \
+            ${CHANGED_ROOTS}
diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
index d1a8bcf92aa13..3bf3b2b1d50f8 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
+++ b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
@@ -488,7 +488,8 @@ def gen() -> ChatResponseGen:
                 additional_kwargs = {}
                 if is_function:
                     tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
-                    additional_kwargs["tool_calls"] = tool_calls
+                    if tool_calls:
+                        additional_kwargs["tool_calls"] = tool_calls

                 yield ChatResponse(
                     message=ChatMessage(
@@ -738,7 +739,8 @@ async def gen() -> ChatResponseAsyncGen:
                 additional_kwargs = {}
                 if is_function:
                     tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
-                    additional_kwargs["tool_calls"] = tool_calls
+                    if tool_calls:
+                        additional_kwargs["tool_calls"] = tool_calls

                 yield ChatResponse(
                     message=ChatMessage(
diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
index a076a9ffc6014..f43e8f4180d0a 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
+++ b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
@@ -285,7 +285,7 @@ def from_openai_message(openai_message: ChatCompletionMessage) -> ChatMessage:
     # function_call = None  # deprecated in OpenAI v 1.1.0

     additional_kwargs: Dict[str, Any] = {}
-    if openai_message.tool_calls is not None:
+    if openai_message.tool_calls:
         tool_calls: List[ChatCompletionMessageToolCall] = openai_message.tool_calls
         additional_kwargs.update(tool_calls=tool_calls)

diff --git a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
index d207e8437a53f..ccba47ea29e4f 100644
--- a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
+++ b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
@@ -29,7 +29,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-openai"
 readme = "README.md"
-version = "0.2.11"
+version = "0.2.12"

 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"
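A note on PATCH 3/4 above: the fix hinges on the difference between `is not None` and truthiness. The accumulated `tool_calls` can plausibly be an empty list when a response carries no tool calls, and the old check still attached that empty list to the message. A minimal sketch of the guard (the `build_additional_kwargs` helper is a hypothetical stand-in for the streaming code in `base.py`, not part of the patch):

```python
from typing import Any, Dict, List, Optional


def build_additional_kwargs(tool_calls: Optional[List[Any]]) -> Dict[str, Any]:
    additional_kwargs: Dict[str, Any] = {}
    # The old check, `if tool_calls is not None:`, also matched an empty
    # list, attaching a useless "tool_calls": [] to every message.
    if tool_calls:
        additional_kwargs["tool_calls"] = tool_calls
    return additional_kwargs


print(build_additional_kwargs(None))          # {}
print(build_additional_kwargs([]))            # {}  <- the case the patch fixes
print(build_additional_kwargs(["call_abc"]))  # {'tool_calls': ['call_abc']}
```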
"$FILTER_PATTERNS" =~ "$root" ]]; then FILTER_PATTERNS="${FILTER_PATTERNS}'${root}'," CHANGED_ROOTS="${CHANGED_ROOTS} ${root}/::" @@ -68,7 +75,7 @@ jobs: echo "Coverage filter patterns: $FILTER_PATTERNS" echo "Changed roots: $CHANGED_ROOTS" - pants --no-local-cache test \ + pants --level=error --no-local-cache test \ --test-use-coverage \ --coverage-py-filter="${FILTER_PATTERNS}" \ ${CHANGED_ROOTS} diff --git a/llama-index-core/llama_index/core/base/llms/types.py b/llama-index-core/llama_index/core/base/llms/types.py index 971db8e743841..124044d3bd11d 100644 --- a/llama-index-core/llama_index/core/base/llms/types.py +++ b/llama-index-core/llama_index/core/base/llms/types.py @@ -14,7 +14,12 @@ Any, ) -from llama_index.core.bridge.pydantic import BaseModel, Field, ConfigDict +from llama_index.core.bridge.pydantic import ( + BaseModel, + Field, + ConfigDict, + field_serializer, +) from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS from llama_index.core.schema import ImageType @@ -95,7 +100,8 @@ def from_str( return cls(role=role, content=content, **kwargs) def _recursive_serialization(self, value: Any) -> Any: - if isinstance(value, (V1BaseModel, V2BaseModel)): + if isinstance(value, V2BaseModel): + value.model_rebuild() # ensures all fields are initialized and serializable return value.model_dump() # type: ignore if isinstance(value, dict): return { @@ -106,23 +112,13 @@ def _recursive_serialization(self, value: Any) -> Any: return [self._recursive_serialization(item) for item in value] return value + @field_serializer("additional_kwargs", check_fields=False) + def serialize_additional_kwargs(self, value: Any, _info: Any) -> Any: + return self._recursive_serialization(value) + def dict(self, **kwargs: Any) -> Dict[str, Any]: return self.model_dump(**kwargs) - def model_dump(self, **kwargs: Any) -> Dict[str, Any]: - # ensure all additional_kwargs are serializable - msg = super().model_dump(**kwargs) - - for key, value in msg.get("additional_kwargs", {}).items(): - value = self._recursive_serialization(value) - if not isinstance(value, (str, int, float, bool, dict, list, type(None))): - raise ValueError( - f"Failed to serialize additional_kwargs value: {value}" - ) - msg["additional_kwargs"][key] = value - - return msg - class LogProb(BaseModel): """LogProb of a token.""" diff --git a/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai.py b/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai.py index 1fde36dc316d4..3abefb60ffbd1 100644 --- a/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai.py +++ b/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai.py @@ -441,3 +441,36 @@ def test_completion_model_with_retry(MockSyncOpenAI: MagicMock) -> None: # The actual retry count is max_retries - 1 # see https://github.com/jd/tenacity/issues/459 assert mock_instance.completions.create.call_count == 3 + + +@patch("llama_index.llms.openai.base.SyncOpenAI") +def test_ensure_chat_message_is_serializable(MockSyncOpenAI: MagicMock) -> None: + with CachedOpenAIApiKeys(set_fake_key=True): + mock_instance = MockSyncOpenAI.return_value + mock_instance.chat.completions.create.return_value = mock_chat_completion_v1() + + llm = OpenAI(model="gpt-3.5-turbo") + message = ChatMessage(role="user", content="test message") + + response = llm.chat([message]) + response.message.additional_kwargs["test"] = ChatCompletionChunk( + id="chatcmpl-6ptKyqKOGXZT6iQnqiXAH8adNLUzD", + object="chat.completion.chunk", + 
created=1677825464, + model="gpt-3.5-turbo-0301", + choices=[ + ChunkChoice( + delta=ChoiceDelta(role="assistant", content="test"), + finish_reason=None, + index=0, + ) + ], + ) + data = response.message.dict() + assert isinstance(data, dict) + assert isinstance(data["additional_kwargs"], dict) + assert isinstance(data["additional_kwargs"]["test"]["choices"], list) + assert ( + data["additional_kwargs"]["test"]["choices"][0]["delta"]["content"] + == "test" + )
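A note on PATCH 4/4 above: it replaces the blanket `model_dump` override on `ChatMessage` with a pydantic v2 `field_serializer`, so the recursive conversion runs only when `additional_kwargs` is serialized. A self-contained sketch of that pattern under simplified assumptions (`Message` and `Inner` are illustrative stand-ins, not the real llama_index classes):

```python
from typing import Any, Dict

from pydantic import BaseModel, field_serializer


class Inner(BaseModel):
    content: str


class Message(BaseModel):
    role: str
    additional_kwargs: Dict[str, Any] = {}

    def _recursive_serialization(self, value: Any) -> Any:
        # Nested pydantic models become plain dicts; containers recurse.
        if isinstance(value, BaseModel):
            return value.model_dump()
        if isinstance(value, dict):
            return {k: self._recursive_serialization(v) for k, v in value.items()}
        if isinstance(value, list):
            return [self._recursive_serialization(item) for item in value]
        return value

    # Runs only when additional_kwargs is dumped, so model_dump() itself
    # no longer needs to be overridden (the pre-patch approach).
    @field_serializer("additional_kwargs", check_fields=False)
    def serialize_additional_kwargs(self, value: Any, _info: Any) -> Any:
        return self._recursive_serialization(value)


msg = Message(role="user", additional_kwargs={"chunk": Inner(content="test")})
print(msg.model_dump())
# {'role': 'user', 'additional_kwargs': {'chunk': {'content': 'test'}}}
```

Because the serializer participates in pydantic's normal dump machinery, nested objects such as the `ChatCompletionChunk` in the test above come out as plain dicts, which is exactly what the new test asserts.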