added document types

ag2ai · Dec 4, 2024 · 3914b15 · 3914b15
1 parent 0a9e847
commit 3914b15
Show file tree

Hide file tree

Showing 5 changed files with 148 additions and 33 deletions.
diff --git a/autogen/agentchat/contrib/graph_rag/document.py b/autogen/agentchat/contrib/graph_rag/document.py
@@ -5,18 +5,9 @@
 # Portions derived from https://github.com/microsoft/autogen are under the MIT License.
 # SPDX-License-Identifier: MIT
 from dataclasses import dataclass
-from enum import Enum, auto
 from typing import Optional
 
-
-class DocumentType(Enum):
-    """
-    Enum for supporting document type.
-    """
-
-    TEXT = auto()
-    HTML = auto()
-    PDF = auto()
+from autogen.extensions import RAG
 
 
 @dataclass
@@ -25,6 +16,14 @@ class Document:
     A wrapper of graph store query results.
     """
 
-    doctype: DocumentType
-    data: Optional[object] = None
+    doctype: RAG.DocumentType
+    metadata: RAG.Metadata
     path_or_url: Optional[str] = ""
+
+    @property
+    def content(self):
+        """Return the stored content, or ``None`` if nothing has been assigned yet."""
+        # Read from a private backing attribute: returning ``self.content`` here
+        # would re-enter this property and recurse until RecursionError.
+        return getattr(self, "_content", None)
+
+    @content.setter
+    def content(self, value):
+        """Store ``value`` as the document content."""
+        # The setter has to reuse the name ``content``; defined under any other
+        # name, ``content`` stays read-only and the settable property is bound elsewhere.
+        self._content = value
diff --git a/autogen/agentchat/contrib/vectordb/base.py b/autogen/agentchat/contrib/vectordb/base.py
@@ -4,23 +4,9 @@
 #
 # Portions derived from  https://github.com/microsoft/autogen are under the MIT License.
 # SPDX-License-Identifier: MIT
-from typing import (
-    Any,
-    Callable,
-    List,
-    Mapping,
-    Optional,
-    Protocol,
-    Sequence,
-    Tuple,
-    TypedDict,
-    Union,
-    runtime_checkable,
-)
-
-Metadata = Union[Mapping[str, Any], None]
-Vector = Union[Sequence[float], Sequence[int]]
-ItemID = Union[str, int]  # chromadb doesn't support int ids, VikingDB does
+from typing import Any, Callable, List, Optional, Protocol, Tuple, TypedDict, Union, runtime_checkable
+
+from autogen.extensions import RAG
 
 
 class Document(TypedDict):
@@ -32,17 +18,23 @@ class Document(TypedDict):
     embedding: Vector, Optional | the vector representation of the content.
     """
 
-    id: ItemID
+    id: RAG.ItemID
     content: str
-    metadata: Optional[Metadata]
-    embedding: Optional[Vector]
+    metadata: Optional[RAG.Metadata]
+    embedding: Optional[RAG.Vector]
+
+    @property
+    def doctype(self) -> str:
+        """Return the ``value`` of the ``doctype`` metadata entry, defaulting to ``RAG.DocumentType.TEXT``.
+
+        NOTE(review): PEP 589 does not allow methods on a TypedDict and instances are
+        plain ``dict`` objects, so this accessor is not reachable as ``doc.doctype``;
+        consider moving it to a module-level helper.
+        NOTE(review): ``RAG.DocumentType`` members are created with ``auto()``, so the
+        default's ``.value`` is an int although the annotation says ``str`` - confirm
+        the intended return type.
+        """
+        # Dict-style lookup: a TypedDict value has no ``.metadata`` attribute, and the
+        # ``metadata`` entry is Optional, so guard against it being missing or None.
+        metadata = self.get("metadata") or {}
+        return metadata.get("doctype", RAG.DocumentType.TEXT).value
 
 
 """QueryResults is the response from the vector database for a query/queries.
 A query is a list containing one string while queries is a list containing multiple strings.
 The response is a list of query results, each query result is a list of tuples containing the document and the distance.
 """
-QueryResults = List[List[Tuple[Document, float]]]
+# One query's hits: a list of (document, distance) tuples.
+QueryResult = List[Tuple[Document, RAG.Distance]]
+# One QueryResult per query string.
+QueryResults = List[QueryResult]
+# Kept as a module-level name (it was defined here before); now an alias of the
+# shared RAG definition instead of a second, independent Union[str, int].
+ItemID = RAG.ItemID
 
 
 @runtime_checkable

diff --git a/autogen/extensions/__init__.py b/autogen/extensions/__init__.py
@@ -0,0 +1,7 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from .rag import types as RAG
+
+__all__ = ["RAG"]
diff --git a/autogen/extensions/rag/__init__.py b/autogen/extensions/rag/__init__.py
@@ -0,0 +1,3 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
diff --git a/autogen/extensions/rag/types.py b/autogen/extensions/rag/types.py
@@ -0,0 +1,114 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from enum import Enum, auto
+from typing import Any, List, Mapping, Optional, Protocol, Sequence, Union, runtime_checkable
+
+from pydantic import AnyUrl, FilePath
+
+Metadata = Union[Mapping[str, Any], None]  # string-keyed mapping of arbitrary values, or None
+Vector = Union[Sequence[float], Sequence[int]]  # sequence of floats or of ints
+ItemID = Union[str, int]  # chromadb doesn't support int ids, VikingDB does
+SourceLocation = Union[FilePath, AnyUrl, str]  # pydantic FilePath, pydantic AnyUrl, or a plain string
+Distance = float  # plain alias of float
+
+
+class NodeType(str, Enum):
+    """
+    String-valued enum of node kinds.
+    """
+
+    DOCUMENT = "document"
+    ENTITY = "entity"
+    RELATION = "relation"
+    MEMORY = "memory"
+    OTHERS = "others"
+
+
+class DocumentType(Enum):
+    """
+    Enum of the supported document types.
+
+    Member values are assigned by ``auto()``, so they are integers, not strings.
+    """
+
+    TEXT = auto()
+    HTML = auto()
+    PDF = auto()
+    IMAGE = auto()
+    AUDIO = auto()
+
+
+class DatastoreType(str, Enum):
+    """
+    String-valued enum of datastore kinds.
+    """
+
+    VECTOR = "vector"
+    GRAPH = "graph"
+    SQL = "sql"
+
+
+@runtime_checkable
+class Node(Protocol):
+    """Structural type for an item exposing ``id``, ``metadata`` and ``content``."""
+
+    id: Optional[ItemID] = None
+    metadata: Metadata
+    content: Any = None
+
+    @property
+    def nodetype(self) -> NodeType:
+        """Return the ``nodetype`` metadata entry, or ``NodeType.OTHERS`` when it is absent."""
+        # ``Metadata`` admits None, so fall back to an empty mapping before the lookup
+        # instead of raising AttributeError on ``None.get``.
+        return (self.metadata or {}).get("nodetype", NodeType.OTHERS)
+
+
+@runtime_checkable
+class DB(Protocol):
+    """Structural type for a datastore described by a ``metadata`` mapping."""
+
+    metadata: Metadata
+
+    @property
+    def db_type(self) -> str:
+        """Return the ``db_type`` metadata entry, or ``"unknown"`` when it is absent."""
+        # ``Metadata`` admits None, so fall back to an empty mapping before the lookup
+        # instead of raising AttributeError on ``None.get``.
+        return (self.metadata or {}).get("db_type", "unknown")
+
+    def init(self, *args, **kwargs):
+        """Initialisation hook; the protocol body does nothing and returns None."""
+        pass
+
+
+@runtime_checkable
+class QueryEngine(Protocol):
+    """Structural type for an engine that holds a ``db`` and exposes three hooks.
+
+    Every method body here is a stub that returns None.
+    """
+
+    db: DB
+
+    def init_db(self, *args, **kwargs):
+        """Database initialisation hook; accepts arbitrary arguments."""
+        ...
+
+    def add_records(self, new_records: List):
+        """Hook that receives the list ``new_records``."""
+        ...
+
+    def query(self, query: str, **kwargs):
+        """Hook that receives the ``query`` string plus arbitrary keyword options."""
+        ...
+
+
+@runtime_checkable
+class Document(Protocol):
+    """Structural type for a document record.
+
+    The protocol declares only the two members below.
+
+    metadata: Metadata, Optional | contains additional information about the document such as source, date, etc.
+    content: object, Optional | the document payload.
+    """
+
+    # NOTE: this default dict lives on the class, so it is shared by every implementer
+    # that does not assign its own ``metadata``; treat it as read-only.
+    metadata: Metadata = {}
+    content: Optional[object]
+
+    @property
+    def doctype(self) -> str:
+        """Return the ``value`` of the ``doctype`` metadata entry, defaulting to ``DocumentType.TEXT``.
+
+        NOTE(review): ``DocumentType`` members are created with ``auto()``, so the
+        default's ``.value`` is an int although the annotation says ``str`` - confirm
+        the intended return type.
+        """
+        # ``Metadata`` admits None, so fall back to an empty mapping before the lookup
+        # instead of raising AttributeError on ``None.get``.
+        return (self.metadata or {}).get("doctype", DocumentType.TEXT).value
+
+
+# Public names of this module. ``ItemID`` and ``Distance`` are listed alongside the
+# other aliases because they are defined above as part of the same public set.
+__all__ = [
+    "Metadata",
+    "Vector",
+    "ItemID",
+    "SourceLocation",
+    "Distance",
+    "NodeType",
+    "DocumentType",
+    "DatastoreType",
+    "Node",
+    "DB",
+    "QueryEngine",
+    "Document",
+]