Skip to content

Commit

Permalink
Merge branch 'main' into add-docling
Browse files Browse the repository at this point in the history
  • Loading branch information
logan-markewich committed Oct 7, 2024
2 parents 3879b3e + 62849af commit ec1fc89
Show file tree
Hide file tree
Showing 10 changed files with 140 additions and 40 deletions.
16 changes: 13 additions & 3 deletions .github/workflows/coverage.yml
Original file line number Diff line number Diff line change
Expand Up @@ -52,20 +52,30 @@ jobs:
# Targets reported by pants as changed relative to origin/main.
CHANGED_FILES=$(pants list --changed-since=origin/main)
# Find which roots contain changed files
CHANGED_ROOTS=""
FILTER_PATTERNS="["
for file in $CHANGED_FILES; do
  # Start with the full path
  root="$file"
  # Keep going up the directory tree until we find a directory containing a marker file
  # (e.g., 'pyproject.toml' for python projects)
  while [[ ! -f "$root/pyproject.toml" && "$root" != "." && "$root" != "/" ]]; do
    root=$(dirname "$root")
  done
  # Look for the exact quoted entry ('root',) rather than a bare substring, so a
  # root such as a/b is still recorded when a/bc was added earlier.
  if [[ "$FILTER_PATTERNS" != *"'${root}',"* ]]; then
    FILTER_PATTERNS="${FILTER_PATTERNS}'${root}',"
    CHANGED_ROOTS="${CHANGED_ROOTS} ${root}/::"
  fi
done
# remove the last comma and close the bracket
FILTER_PATTERNS="${FILTER_PATTERNS%,}]"
echo "Coverage filter patterns: $FILTER_PATTERNS"
echo "Changed roots: $CHANGED_ROOTS"
# CHANGED_ROOTS is left unquoted on purpose: each " root/::" entry must become
# its own argument.
pants --level=error --no-local-cache test \
  --test-use-coverage \
  --changed-since=origin/main \
  --coverage-py-filter="${FILTER_PATTERNS}" \
  ${CHANGED_ROOTS}
75 changes: 63 additions & 12 deletions docs/docs/examples/data_connectors/WebPageDemo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -620,36 +620,78 @@
"pip install zyte-api\n",
"```\n",
"\n",
"To get access to your ZYTE API key please visit: https://www.zyte.com/zyte-api/"
"To get access to your ZYTE API key please visit: https://docs.zyte.com/zyte-api/get-started.html"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f49f22bf",
"id": "31e1aaa5-8bfc-452f-9c72-15def22f872f",
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5871\n"
]
}
],
"source": [
"from llama_index.readers.web import ZyteWebReader\n",
"\n",
"# Required to run it in notebook\n",
"# import nest_asyncio\n",
"# nest_asyncio.apply()\n",
"\n",
"\n",
"# Initiate ZyteWebReader with your Zyte API key\n",
"zyte_reader = ZyteWebReader(\n",
" api_key=\"your ZYTE API key here\",\n",
" mode=\"article\", # or \"html-text\" or \"html\"\n",
")\n",
"\n",
"urls = [\n",
" \"https://www.zyte.com/blog/web-scraping-apis/\",\n",
" \"https://www.zyte.com/blog/system-integrators-extract-big-data/\",\n",
"]\n",
"\n",
"documents = zyte_reader.load_data(\n",
" urls=urls,\n",
")\n",
"\n",
"print(len(documents[0].text))"
]
},
{
"cell_type": "markdown",
"id": "c21ae76e-1b2c-480e-a58f-9f9becce15a6",
"metadata": {},
"source": [
"Browser rendering and JavaScript can be enabled by setting the corresponding parameters during initialization. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f49f22bf",
"metadata": {},
"outputs": [],
"source": [
"zyte_dw_params = {\n",
" \"browserHtml\": True, # Enable browser rendering\n",
" \"javascript\": True, # Enable JavaScript\n",
"}\n",
"\n",
"# Initiate ZyteWebReader with your Zyte API key\n",
"# Initiate ZyteWebReader with your Zyte API key and use default \"article\" mode\n",
"zyte_reader = ZyteWebReader(\n",
" api_key=\"Your Zyte API Key\",\n",
" api_key=\"your ZYTE API key here\",\n",
" download_kwargs=zyte_dw_params,\n",
")\n",
"\n",
"# Load documents from URLs as markdown\n",
"# Load documents from URLs\n",
"documents = zyte_reader.load_data(\n",
" urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
" urls=urls,\n",
")"
]
},
Expand All @@ -662,7 +704,7 @@
{
"data": {
"text/plain": [
"7150"
"4355"
]
},
"execution_count": null,
Expand All @@ -674,6 +716,14 @@
"len(documents[0].text)"
]
},
{
"cell_type": "markdown",
"id": "133d26d7-c26d-40b2-b08f-6c838fd3a6b6",
"metadata": {},
"source": [
"Set \"continue_on_failure\" to False if you'd like to stop when any request fails."
]
},
{
"cell_type": "code",
"execution_count": null,
Expand All @@ -682,14 +732,15 @@
"outputs": [],
"source": [
"zyte_reader = ZyteWebReader(\n",
" api_key=\"Your API Key\",\n",
" api_key=\"your ZYTE API key here\",\n",
" mode=\"html-text\",\n",
" download_kwargs=zyte_dw_params,\n",
" continue_on_failure=False,\n",
")\n",
"\n",
"# Load documents from URLs as markdown\n",
"# Load documents from URLs\n",
"documents = zyte_reader.load_data(\n",
" urls=[\"https://www.zyte.com/blog/system-integrators-extract-big-data/\"],\n",
" urls=urls,\n",
")"
]
},
Expand All @@ -702,7 +753,7 @@
{
"data": {
"text/plain": [
"19554"
"17488"
]
},
"execution_count": null,
Expand Down
2 changes: 1 addition & 1 deletion docs/docs/examples/managed/BGEM3Demo.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -89,7 +89,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Retrieve relavant documents"
"## Retrieve relevant documents"
]
},
{
Expand Down
28 changes: 12 additions & 16 deletions llama-index-core/llama_index/core/base/llms/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,12 @@
Any,
)

from llama_index.core.bridge.pydantic import BaseModel, Field, ConfigDict
from llama_index.core.bridge.pydantic import (
BaseModel,
Field,
ConfigDict,
field_serializer,
)
from llama_index.core.constants import DEFAULT_CONTEXT_WINDOW, DEFAULT_NUM_OUTPUTS
from llama_index.core.schema import ImageType

Expand Down Expand Up @@ -95,7 +100,8 @@ def from_str(
return cls(role=role, content=content, **kwargs)

def _recursive_serialization(self, value: Any) -> Any:
if isinstance(value, (V1BaseModel, V2BaseModel)):
if isinstance(value, V2BaseModel):
value.model_rebuild() # ensures all fields are initialized and serializable
return value.model_dump() # type: ignore
if isinstance(value, dict):
return {
Expand All @@ -106,23 +112,13 @@ def _recursive_serialization(self, value: Any) -> Any:
return [self._recursive_serialization(item) for item in value]
return value

@field_serializer("additional_kwargs", check_fields=False)
def serialize_additional_kwargs(self, value: Any, _info: Any) -> Any:
    """Serialize the ``additional_kwargs`` field.

    The value is handed to ``_recursive_serialization`` and its result is
    returned as-is. ``_info`` is accepted but not used.
    """
    serialized = self._recursive_serialization(value)
    return serialized

def dict(self, **kwargs: Any) -> Dict[str, Any]:
    """Return ``model_dump(**kwargs)``; every keyword argument is forwarded unchanged."""
    dumped = self.model_dump(**kwargs)
    return dumped

def model_dump(self, **kwargs: Any) -> Dict[str, Any]:
    """Dump the model and make every ``additional_kwargs`` value serializable.

    Each value under ``additional_kwargs`` in the parent dump is passed through
    ``_recursive_serialization`` and written back in place.

    Raises:
        ValueError: if a converted value is not a str, int, float, bool, dict,
            list or None.
    """
    allowed_types = (str, int, float, bool, dict, list, type(None))
    dumped = super().model_dump(**kwargs)

    extras = dumped.get("additional_kwargs", {})
    for name, raw in extras.items():
        value = self._recursive_serialization(raw)
        if isinstance(value, allowed_types):
            dumped["additional_kwargs"][name] = value
            continue
        raise ValueError(f"Failed to serialize additional_kwargs value: {value}")

    return dumped


class LogProb(BaseModel):
"""LogProb of a token."""
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -488,7 +488,8 @@ def gen() -> ChatResponseGen:
additional_kwargs = {}
if is_function:
tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
additional_kwargs["tool_calls"] = tool_calls
if tool_calls:
additional_kwargs["tool_calls"] = tool_calls

yield ChatResponse(
message=ChatMessage(
Expand Down Expand Up @@ -738,7 +739,8 @@ async def gen() -> ChatResponseAsyncGen:
additional_kwargs = {}
if is_function:
tool_calls = update_tool_calls(tool_calls, delta.tool_calls)
additional_kwargs["tool_calls"] = tool_calls
if tool_calls:
additional_kwargs["tool_calls"] = tool_calls

yield ChatResponse(
message=ChatMessage(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,7 @@ def from_openai_message(openai_message: ChatCompletionMessage) -> ChatMessage:
# function_call = None # deprecated in OpenAI v 1.1.0

additional_kwargs: Dict[str, Any] = {}
if openai_message.tool_calls is not None:
if openai_message.tool_calls:
tool_calls: List[ChatCompletionMessageToolCall] = openai_message.tool_calls
additional_kwargs.update(tool_calls=tool_calls)

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ exclude = ["**/BUILD"]
license = "MIT"
name = "llama-index-llms-openai"
readme = "README.md"
version = "0.2.11"
version = "0.2.12"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -441,3 +441,36 @@ def test_completion_model_with_retry(MockSyncOpenAI: MagicMock) -> None:
# The actual retry count is max_retries - 1
# see https://github.com/jd/tenacity/issues/459
assert mock_instance.completions.create.call_count == 3


@patch("llama_index.llms.openai.base.SyncOpenAI")
def test_ensure_chat_message_is_serializable(MockSyncOpenAI: MagicMock) -> None:
    """A ChatCompletionChunk stored in ``additional_kwargs`` must dump to plain dicts and lists."""
    with CachedOpenAIApiKeys(set_fake_key=True):
        # Route the mocked client's chat completion call to a canned response.
        client = MockSyncOpenAI.return_value
        client.chat.completions.create.return_value = mock_chat_completion_v1()

        llm = OpenAI(model="gpt-3.5-turbo")
        message = ChatMessage(role="user", content="test message")
        response = llm.chat([message])

        delta = ChoiceDelta(role="assistant", content="test")
        chunk = ChatCompletionChunk(
            id="chatcmpl-6ptKyqKOGXZT6iQnqiXAH8adNLUzD",
            object="chat.completion.chunk",
            created=1677825464,
            model="gpt-3.5-turbo-0301",
            choices=[ChunkChoice(delta=delta, finish_reason=None, index=0)],
        )
        response.message.additional_kwargs["test"] = chunk

        data = response.message.dict()
        assert isinstance(data, dict)
        assert isinstance(data["additional_kwargs"], dict)
        dumped_choices = data["additional_kwargs"]["test"]["choices"]
        assert isinstance(dumped_choices, list)
        assert dumped_choices[0]["delta"]["content"] == "test"
Original file line number Diff line number Diff line change
Expand Up @@ -44,17 +44,23 @@ class ZyteWebReader(BasePydanticReader):
mode: str
n_conn: int
download_kwargs: Optional[dict]
continue_on_failure: bool

def __init__(
self,
api_key: str,
mode: Literal["article", "html", "html-text"] = "article",
n_conn: int = 15,
download_kwargs: Optional[Dict[str, Any]] = None,
continue_on_failure: bool = True,
) -> None:
"""Initialize with file path."""
super().__init__(
api_key=api_key, mode=mode, n_conn=n_conn, download_kwargs=download_kwargs
api_key=api_key,
mode=mode,
n_conn=n_conn,
download_kwargs=download_kwargs,
continue_on_failure=continue_on_failure,
)
try:
from zyte_api import AsyncZyteAPI
Expand All @@ -81,12 +87,14 @@ def class_name(cls) -> str:
return "ZyteWebReader"

def _zyte_html_option(self) -> str:
if "browserHtml" in self.download_kwargs:
if self.download_kwargs and "browserHtml" in self.download_kwargs:
return "browserHtml"
return "httpResponseBody"

def _get_article(self, page: Dict) -> str:
return page["article"]["headline"] + "\n\n" + page["article"]["articleBody"]
headline = page["article"].get("headline", "")
article_body = page["article"].get("articleBody", "")
return headline + "\n\n" + article_body

def _zyte_request_params(self, url: str) -> dict:
request_params: Dict[str, Any] = {"url": url}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ license = "MIT"
maintainers = ["HawkClaws", "Hironsan", "NA", "an-bluecat", "bborn", "jasonwcfan", "kravetsmic", "pandazki", "ruze00", "selamanse", "thejessezhang"]
name = "llama-index-readers-web"
readme = "README.md"
version = "0.2.3"
version = "0.2.4"

[tool.poetry.dependencies]
python = ">=3.8.1,<4.0"
Expand Down

0 comments on commit ec1fc89

Please sign in to comment.