Merge branch 'main' into add-docling

run-llama · Oct 7, 2024 · ec1fc89 · ec1fc89
2 parents 3879b3e + 62849af
commit ec1fc89
Show file tree

Hide file tree

Showing 10 changed files with 140 additions and 40 deletions.
diff --git a/.github/workflows/coverage.yml b/.github/workflows/coverage.yml
@@ -52,20 +52,30 @@ jobs:
           CHANGED_FILES=$(pants list --changed-since=origin/main)
 
           # Find which roots contain changed files
+          CHANGED_ROOTS=""
           FILTER_PATTERNS="["
           for file in $CHANGED_FILES; do
-            root=$(echo "$file" | cut -d'/' -f1,2,3)
+            # Start with the full path
+            root="$file"
+            # Keep going up the directory tree until we find a directory containing a marker file
+            # (e.g., 'pyproject.toml' for python projects)
+            while [[ ! -f "$root/pyproject.toml" && "$root" != "." && "$root" != "/" ]]; do
+              root=$(dirname "$root")
+            done
+
             if [[ ! "$FILTER_PATTERNS" =~ "$root" ]]; then
               FILTER_PATTERNS="${FILTER_PATTERNS}'${root}',"
+              CHANGED_ROOTS="${CHANGED_ROOTS} ${root}/::"
             fi
           done
 
           # remove the last comma and close the bracket
           FILTER_PATTERNS="${FILTER_PATTERNS%,}]"
 
           echo "Coverage filter patterns: $FILTER_PATTERNS"
+          echo "Changed roots: $CHANGED_ROOTS"
 
           pants --level=error --no-local-cache test \
             --test-use-coverage \
-            --changed-since=origin/main \
-            --coverage-py-filter="$FILTER_PATTERNS"
+            --coverage-py-filter="${FILTER_PATTERNS}" \
+            ${CHANGED_ROOTS}
diff --git a/docs/docs/examples/data_connectors/WebPageDemo.ipynb b/docs/docs/examples/data_connectors/WebPageDemo.ipynb
@@ -620,36 +620,78 @@
     "pip install zyte-api\n",
     "```\n",
     "\n",
-    "To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/"
+    "To get access to your ZYTE API key please visit: https://docs.zyte.com/zyte-api/get-started.html"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "f49f22bf",
+   "id": "31e1aaa5-8bfc-452f-9c72-15def22f872f",
    "metadata": {},
-   "outputs": [],
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "5871\n"
+     ]
+    }
+   ],
    "source": [
     "from llama_index.readers.web import ZyteWebReader\n",
     "\n",
     "# Required to run it in notebook\n",
     "# import nest_asyncio\n",
     "# nest_asyncio.apply()\n",
     "\n",
+    "\n",
+    "# Initiate ZyteWebReader with your Zyte API key\n",
+    "zyte_reader = ZyteWebReader(\n",
+    "    api_key=\"your ZYTE API key here\",\n",
+    "    mode=\"article\",  # or \"html-text\" or \"html\"\n",
+    ")\n",
+    "\n",
+    "urls = [\n",
+    "    \"https://www.zyte.com/blog/web-scraping-apis/\",\n",
+    "    \"https://www.zyte.com/blog/system-integrators-extract-big-data/\",\n",
+    "]\n",
+    "\n",
+    "documents = zyte_reader.load_data(\n",
+    "    urls=urls,\n",
+    ")\n",
+    "\n",
+    "print(len(documents[0].text))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "c21ae76e-1b2c-480e-a58f-9f9becce15a6",
+   "metadata": {},
+   "source": [
+    "Browser rendering and JavaScript can be enabled by passing the corresponding parameters during initialization."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "f49f22bf",
+   "metadata": {},
+   "outputs": [],
+   "source": [
     "zyte_dw_params = {\n",
     "    \"browserHtml\": True,  # Enable browser rendering\n",
     "    \"javascript\": True,  # Enable JavaScript\n",
     "}\n",
     "\n",
-    "# Initiate ZyteWebReader with your Zyte API key\n",
+    "# Initiate ZyteWebReader with your Zyte API key and use default \"article\" mode\n",
     "zyte_reader = ZyteWebReader(\n",
-    "    api_key=\"Your Zyte API Key\",\n",
+    "    api_key=\"your ZYTE API key here\",\n",
     "    download_kwargs=zyte_dw_params,\n",
     ")\n",
     "\n",
-    "# Load documents from URLs as markdown\n",
+    "# Load documents from URLs\n",
     "documents = zyte_reader.load_data(\n",
-    "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+    "    urls=urls,\n",
     ")"
    ]
   },
@@ -662,7 +704,7 @@
     {
      "data": {
       "text/plain": [
-       "7150"
+       "4355"
       ]
      },
      "execution_count": null,
@@ -674,6 +716,14 @@
     "len(documents[0].text)"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "id": "133d26d7-c26d-40b2-b08f-6c838fd3a6b6",
+   "metadata": {},
+   "source": [
+    "Set \"continue_on_failure\" to False if you'd like to stop when any request fails."
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -682,14 +732,15 @@
    "outputs": [],
    "source": [
     "zyte_reader = ZyteWebReader(\n",
-    "    api_key=\"Your API Key\",\n",
+    "    api_key=\"your ZYTE API key here\",\n",
     "    mode=\"html-text\",\n",
     "    download_kwargs=zyte_dw_params,\n",
+    "    continue_on_failure=False,\n",
     ")\n",
     "\n",
-    "# Load documents from URLs as markdown\n",
+    "# Load documents from URLs\n",
     "documents = zyte_reader.load_data(\n",
-    "    urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
+    "    urls=urls,\n",
     ")"
    ]
   },
@@ -702,7 +753,7 @@
     {
      "data": {
       "text/plain": [
-       "19554"
+       "17488"
       ]
      },
      "execution_count": null,

diff --git a/docs/docs/examples/managed/BGEM3Demo.ipynb b/docs/docs/examples/managed/BGEM3Demo.ipynb
@@ -89,7 +89,7 @@
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Retrieve relavant documents"
+    "## Retrieve relevant documents"
    ]
   },
   {

diff --git a/llama-index-core/llama_index/core/base/llms/types.py b/llama-index-core/llama_index/core/base/llms/types.py
@@ -14,7 +14,12 @@
     Any,
 )
 
-from llama_index.core.bridge.pydantic import BaseModel, Field, ConfigDict
+from llama_index.core.bridge.pydantic import (
+    BaseModel,
+    Field,
+    ConfigDict,
+    field_serializer,
+)
 from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
 from llama_index.core.schema import ImageType
 
@@ -95,7 +100,8 @@ def from_str(
         return cls(role=role, content=content, **kwargs)
 
     def _recursive_serialization(self, value: Any) -> Any:
-        if isinstance(value, (V1BaseModel, V2BaseModel)):
+        if isinstance(value, V2BaseModel):
+            value.model_rebuild()  # ensures all fields are initialized and serializable
             return value.model_dump()  # type: ignore
         if isinstance(value, dict):
             return {
@@ -106,23 +112,13 @@ def _recursive_serialization(self, value: Any) -> Any:
             return [self._recursive_serialization(item) for item in value]
         return value
 
+    @field_serializer("additional_kwargs", check_fields=False)
+    def serialize_additional_kwargs(self, value: Any, _info: Any) -> Any:
+        return self._recursive_serialization(value)
+
     def dict(self, **kwargs: Any) -> Dict[str, Any]:
         return self.model_dump(**kwargs)
 
-    def model_dump(self, **kwargs: Any) -> Dict[str, Any]:
-        # ensure all additional_kwargs are serializable
-        msg = super().model_dump(**kwargs)
-
-        for key, value in msg.get("additional_kwargs", {}).items():
-            value = self._recursive_serialization(value)
-            if not isinstance(value, (str, int, float, bool, dict, list, type(None))):
-                raise ValueError(
-                    f"Failed to serialize additional_kwargs value: {value}"
-                )
-            msg["additional_kwargs"][key] = value
-
-        return msg
-
 
 class LogProb(BaseModel):
     """LogProb of a token."""

diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/base.py
@@ -488,7 +488,8 @@ def gen() -> ChatResponseGen:
                 additional_kwargs = {}
                 if is_function:
                     tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
-                    additional_kwargs["tool_calls"] = tool_calls
+                    if tool_calls:
+                        additional_kwargs["tool_calls"] = tool_calls
 
                 yield ChatResponse(
                     message=ChatMessage(
@@ -738,7 +739,8 @@ async def gen() -> ChatResponseAsyncGen:
                 additional_kwargs = {}
                 if is_function:
                     tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
-                    additional_kwargs["tool_calls"] = tool_calls
+                    if tool_calls:
+                        additional_kwargs["tool_calls"] = tool_calls
 
                 yield ChatResponse(
                     message=ChatMessage(

diff --git a/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py b/llama-index-integrations/llms/llama-index-llms-openai/llama_index/llms/openai/utils.py
@@ -285,7 +285,7 @@ def from_openai_message(openai_message: ChatCompletionMessage) -> ChatMessage:
     # function_call = None  # deprecated in OpenAI v 1.1.0
 
     additional_kwargs: Dict[str, Any] = {}
-    if openai_message.tool_calls is not None:
+    if openai_message.tool_calls:
         tool_calls: List[ChatCompletionMessageToolCall] = openai_message.tool_calls
         additional_kwargs.update(tool_calls=tool_calls)
 

diff --git a/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-openai/pyproject.toml
@@ -29,7 +29,7 @@ exclude = ["**/BUILD"]
 license = "MIT"
 name = "llama-index-llms-openai"
 readme = "README.md"
-version = "0.2.11"
+version = "0.2.12"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"

diff --git a/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai.py b/llama-index-integrations/llms/llama-index-llms-openai/tests/test_openai.py
@@ -441,3 +441,36 @@ def test_completion_model_with_retry(MockSyncOpenAI: MagicMock) -> None:
     # The actual retry count is max_retries - 1
     # see https://github.com/jd/tenacity/issues/459
     assert mock_instance.completions.create.call_count == 3
+
+
+@patch("llama_index.llms.openai.base.SyncOpenAI")
+def test_ensure_chat_message_is_serializable(MockSyncOpenAI: MagicMock) -> None:
+    with CachedOpenAIApiKeys(set_fake_key=True):
+        mock_instance = MockSyncOpenAI.return_value
+        mock_instance.chat.completions.create.return_value = mock_chat_completion_v1()
+
+        llm = OpenAI(model="gpt-3.5-turbo")
+        message = ChatMessage(role="user", content="test message")
+
+        response = llm.chat([message])
+        response.message.additional_kwargs["test"] = ChatCompletionChunk(
+            id="chatcmpl-6ptKyqKOGXZT6iQnqiXAH8adNLUzD",
+            object="chat.completion.chunk",
+            created=1677825464,
+            model="gpt-3.5-turbo-0301",
+            choices=[
+                ChunkChoice(
+                    delta=ChoiceDelta(role="assistant", content="test"),
+                    finish_reason=None,
+                    index=0,
+                )
+            ],
+        )
+        data = response.message.dict()
+        assert isinstance(data, dict)
+        assert isinstance(data["additional_kwargs"], dict)
+        assert isinstance(data["additional_kwargs"]["test"]["choices"], list)
+        assert (
+            data["additional_kwargs"]["test"]["choices"][0]["delta"]["content"]
+            == "test"
+        )
diff --git a/...dex-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py b/...dex-integrations/readers/llama-index-readers-web/llama_index/readers/web/zyte_web/base.py
@@ -44,17 +44,23 @@ class ZyteWebReader(BasePydanticReader):
     mode: str
     n_conn: int
     download_kwargs: Optional[dict]
+    continue_on_failure: bool
 
     def __init__(
         self,
         api_key: str,
         mode: Literal["article", "html", "html-text"] = "article",
         n_conn: int = 15,
         download_kwargs: Optional[Dict[str, Any]] = None,
+        continue_on_failure: bool = True,
     ) -> None:
         """Initialize with file path."""
         super().__init__(
-            api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs
+            api_key=api_key,
+            mode=mode,
+            n_conn=n_conn,
+            download_kwargs=download_kwargs,
+            continue_on_failure=continue_on_failure,
         )
         try:
             from zyte_api import AsyncZyteAPI
@@ -81,12 +87,14 @@ def class_name(cls) -> str:
         return "ZyteWebReader"
 
     def _zyte_html_option(self) -> str:
-        if "browserHtml" in self.download_kwargs:
+        if self.download_kwargs and "browserHtml" in self.download_kwargs:
             return "browserHtml"
         return "httpResponseBody"
 
     def _get_article(self, page: Dict) -> str:
-        return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"]
+        headline = page["article"].get("headline", "")
+        article_body = page["article"].get("articleBody", "")
+        return headline + "\n\n" + article_body
 
     def _zyte_request_params(self, url: str) -> dict:
         request_params: Dict[str, Any] = {"url": url}

diff --git a/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml b/llama-index-integrations/readers/llama-index-readers-web/pyproject.toml
@@ -45,7 +45,7 @@ license = "MIT"
 maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
 name = "llama-index-readers-web"
 readme = "README.md"
-version = "0.2.3"
+version = "0.2.4"
 
 [tool.poetry.dependencies]
 python = ">=3.8.1,<4.0"