From 3914b158c631e9531ab3393cbbbe4b1ca715a0e7 Mon Sep 17 00:00:00 2001
From: OceanLi <122793010+ohdearquant@users.noreply.github.com>
Date: Wed, 4 Dec 2024 13:39:45 -0500
Subject: [PATCH] added document types

---
 Review notes (not part of the commit message): several hunks below were
 edited by hand during review, so the commit id above and the blob ids on
 the "index" lines still describe the original revision of this patch.

 - graph_rag Document: `content` is a plain dataclass field; the previous
   property returned itself (unbounded recursion) and its setter was bound
   to the name `_` instead of `content`.
 - vectordb Document: dropped the property from the TypedDict; TypedDict
   instances are plain dicts, so it could never be reached.
 - Moved names stay importable from their old modules via aliases.
 - RAG protocols tolerate `metadata is None`, which `Metadata` allows.

 .../agentchat/contrib/graph_rag/document.py   |  19 ++-
 autogen/agentchat/contrib/vectordb/base.py    |  34 ++----
 autogen/extensions/__init__.py                |   7 ++
 autogen/extensions/rag/__init__.py            |   3 +
 autogen/extensions/rag/types.py               | 115 ++++++++++++++++++
 5 files changed, 145 insertions(+), 33 deletions(-)
 create mode 100644 autogen/extensions/rag/__init__.py
 create mode 100644 autogen/extensions/rag/types.py

diff --git a/autogen/agentchat/contrib/graph_rag/document.py b/autogen/agentchat/contrib/graph_rag/document.py
index 1ee116c71..a2da786d5 100644
--- a/autogen/agentchat/contrib/graph_rag/document.py
+++ b/autogen/agentchat/contrib/graph_rag/document.py
@@ -5,18 +5,12 @@
 # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
 # SPDX-License-Identifier: MIT
 from dataclasses import dataclass
-from enum import Enum, auto
 from typing import Optional
 
-
-class DocumentType(Enum):
-    """
-    Enum for supporting document type.
-    """
-
-    TEXT = auto()
-    HTML = auto()
-    PDF = auto()
+from autogen.extensions import RAG
+
+# Keep `DocumentType` importable from this module now that it is defined in RAG.
+DocumentType = RAG.DocumentType
 
 
 @dataclass
@@ -25,6 +19,7 @@ class Document:
     A wrapper of graph store query results.
     """
 
-    doctype: DocumentType
-    data: Optional[object] = None
+    doctype: RAG.DocumentType
+    metadata: RAG.Metadata
     path_or_url: Optional[str] = ""
+    content: Optional[object] = None
diff --git a/autogen/agentchat/contrib/vectordb/base.py b/autogen/agentchat/contrib/vectordb/base.py
index 1454d6531..d15105c9f 100644
--- a/autogen/agentchat/contrib/vectordb/base.py
+++ b/autogen/agentchat/contrib/vectordb/base.py
@@ -4,23 +4,14 @@
 #
 # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
 # SPDX-License-Identifier: MIT
-from typing import (
-    Any,
-    Callable,
-    List,
-    Mapping,
-    Optional,
-    Protocol,
-    Sequence,
-    Tuple,
-    TypedDict,
-    Union,
-    runtime_checkable,
-)
-
-Metadata = Union[Mapping[str, Any], None]
-Vector = Union[Sequence[float], Sequence[int]]
-ItemID = Union[str, int]  # chromadb doesn't support int ids, VikingDB does
+from typing import Any, Callable, List, Optional, Protocol, Tuple, TypedDict, Union, runtime_checkable
+
+from autogen.extensions import RAG
+
+# Aliases of the shared RAG definitions, so these names stay importable from this module.
+Metadata = RAG.Metadata
+Vector = RAG.Vector
+ItemID = RAG.ItemID
 
 
 class Document(TypedDict):
@@ -32,17 +23,18 @@ class Document(TypedDict):
     embedding: Vector, Optional | the vector representation of the content.
     """
 
-    id: ItemID
+    id: RAG.ItemID
     content: str
-    metadata: Optional[Metadata]
-    embedding: Optional[Vector]
+    metadata: Optional[RAG.Metadata]
+    embedding: Optional[RAG.Vector]
 
 
 """QueryResults is the response from the vector database for a query/queries.
 A query is a list containing one string while queries is a list containing multiple strings.
 The response is a list of query results, each query result is a list of tuples containing the document and the distance.
 """
-QueryResults = List[List[Tuple[Document, float]]]
+QueryResult = List[Tuple[Document, RAG.Distance]]
+QueryResults = List[QueryResult]
 
 
 @runtime_checkable
diff --git a/autogen/extensions/__init__.py b/autogen/extensions/__init__.py
index e69de29bb..76562991f 100644
--- a/autogen/extensions/__init__.py
+++ b/autogen/extensions/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from .rag import types as RAG
+
+__all__ = ["RAG"]
diff --git a/autogen/extensions/rag/__init__.py b/autogen/extensions/rag/__init__.py
new file mode 100644
index 000000000..bcd5401d5
--- /dev/null
+++ b/autogen/extensions/rag/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/autogen/extensions/rag/types.py b/autogen/extensions/rag/types.py
new file mode 100644
index 000000000..b0322e815
--- /dev/null
+++ b/autogen/extensions/rag/types.py
@@ -0,0 +1,115 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from enum import Enum, auto
+from typing import Any, List, Mapping, Optional, Protocol, Sequence, Union, runtime_checkable
+
+from pydantic import AnyUrl, FilePath
+
+Metadata = Union[Mapping[str, Any], None]
+Vector = Union[Sequence[float], Sequence[int]]
+ItemID = Union[str, int]  # chromadb doesn't support int ids, VikingDB does
+SourceLocation = Union[FilePath, AnyUrl, str]
+Distance = float
+
+
+class NodeType(str, Enum):
+
+    DOCUMENT = "document"
+    ENTITY = "entity"
+    RELATION = "relation"
+    MEMORY = "memory"
+    OTHERS = "others"
+
+
+class DocumentType(Enum):
+    """
+    Enum of the supported document types.
+    """
+
+    TEXT = auto()
+    HTML = auto()
+    PDF = auto()
+    IMAGE = auto()
+    AUDIO = auto()
+
+
+class DatastoreType(str, Enum):
+
+    VECTOR = "vector"
+    GRAPH = "graph"
+    SQL = "sql"
+
+
+@runtime_checkable
+class Node(Protocol):
+
+    id: Optional[ItemID] = None
+    metadata: Metadata
+    content: Any = None
+
+    @property
+    def nodetype(self) -> NodeType:
+        # Metadata may be None, so fall back to an empty mapping before the lookup.
+        return (self.metadata or {}).get("nodetype", NodeType.OTHERS)
+
+
+@runtime_checkable
+class DB(Protocol):
+
+    metadata: Metadata
+
+    @property
+    def db_type(self) -> str:
+        return (self.metadata or {}).get("db_type", "unknown")
+
+    def init(self, *args, **kwargs):
+        pass
+
+
+@runtime_checkable
+class QueryEngine(Protocol):
+
+    db: DB
+
+    def init_db(self, *args, **kwargs):
+        pass
+
+    def add_records(self, new_records: List):
+        pass
+
+    def query(self, query: str, **kwargs):
+        pass
+
+
+@runtime_checkable
+class Document(Protocol):
+    """Structural type for a document record.
+
+    metadata: Metadata, Optional | additional information about the document such as source, date, doctype, etc.
+    content: object, Optional | the payload of the document.
+    """
+
+    metadata: Metadata = None
+    content: Optional[object]
+
+    @property
+    def doctype(self) -> DocumentType:
+        return (self.metadata or {}).get("doctype", DocumentType.TEXT)
+
+
+__all__ = [
+    "Metadata",
+    "Vector",
+    "ItemID",
+    "SourceLocation",
+    "Distance",
+    "NodeType",
+    "DocumentType",
+    "DatastoreType",
+    "Node",
+    "DB",
+    "QueryEngine",
+    "Document",
+]