Skip to content

Commit

Permalink
added document types
Browse files Browse the repository at this point in the history
  • Loading branch information
ohdearquant committed Dec 4, 2024
1 parent 0a9e847 commit 3914b15
Show file tree
Hide file tree
Showing 5 changed files with 148 additions and 33 deletions.
23 changes: 11 additions & 12 deletions autogen/agentchat/contrib/graph_rag/document.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,18 +5,9 @@
# Portions derived from https://github.com/microsoft/autogen are under the MIT License.
# SPDX-License-Identifier: MIT
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional


class DocumentType(Enum):
    """Document formats that a ``Document`` can be tagged with."""

    # ``auto()`` numbers the members 1, 2, 3 in declaration order.
    TEXT = auto()
    HTML = auto()
    PDF = auto()
from autogen.extensions import RAG


@dataclass
Expand All @@ -25,6 +16,14 @@ class Document:
A wrapper of graph store query results.
"""

doctype: DocumentType
data: Optional[object] = None
doctype: RAG.DocumentType
metadata: RAG.Metadata
path_or_url: Optional[str] = ""

@property
def content(self):
    """Return the payload attached to this document, or ``None`` if unset.

    The value is stored on the private ``_content`` attribute. Reading
    ``self.content`` from inside this getter would re-enter the property
    and recurse until ``RecursionError``.
    """
    return getattr(self, "_content", None)

@content.setter
def content(self, value):
    """Attach ``value`` as this document's payload."""
    # The setter must reuse the property's own name; any other name leaves
    # ``content`` read-only and binds the writable property elsewhere.
    self._content = value
34 changes: 13 additions & 21 deletions autogen/agentchat/contrib/vectordb/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,23 +4,9 @@
#
# Portions derived from https://github.com/microsoft/autogen are under the MIT License.
# SPDX-License-Identifier: MIT
from typing import (
Any,
Callable,
List,
Mapping,
Optional,
Protocol,
Sequence,
Tuple,
TypedDict,
Union,
runtime_checkable,
)

Metadata = Union[Mapping[str, Any], None]
Vector = Union[Sequence[float], Sequence[int]]
ItemID = Union[str, int] # chromadb doesn't support int ids, VikingDB does
from typing import Any, Callable, List, Optional, Protocol, Tuple, TypedDict, Union, runtime_checkable

from autogen.extensions import RAG


class Document(TypedDict):
Expand All @@ -32,17 +18,23 @@ class Document(TypedDict):
embedding: Vector, Optional | the vector representation of the content.
"""

id: ItemID
id: RAG.ItemID
content: str
metadata: Optional[Metadata]
embedding: Optional[Vector]
metadata: Optional[RAG.Metadata]
embedding: Optional[RAG.Vector]

@property
def doctype(self) -> str:
    """Return the document type recorded under ``metadata["doctype"]``.

    Falls back to ``RAG.DocumentType.TEXT`` when ``metadata`` is ``None``
    or has no ``"doctype"`` entry. Enum members are unwrapped to their
    ``.value``; any other stored value is returned unchanged.
    """
    # NOTE(review): the enclosing class is declared as a TypedDict, whose
    # instances are plain dicts at runtime, so ``self.metadata`` attribute
    # access (and this property) look unreachable -- confirm the base class.
    # NOTE(review): ``RAG.DocumentType`` members are built with ``auto()``,
    # so ``.value`` is an int although the annotation says ``str`` -- confirm.
    metadata = self.metadata or {}  # ``metadata`` is Optional; None has no ``.get``
    try:
        doctype = metadata["doctype"]
    except KeyError:
        doctype = RAG.DocumentType.TEXT
    return getattr(doctype, "value", doctype)


"""QueryResults is the response from the vector database for a query/queries.
A query is a list containing one string while queries is a list containing multiple strings.
The response is a list of query results, each query result is a list of tuples containing the document and the distance.
"""
QueryResults = List[List[Tuple[Document, float]]]
QueryResult = List[Tuple[Document, RAG.Distance]]
QueryResults = List[QueryResult]
ItemID = Union[str, int]


@runtime_checkable
Expand Down
7 changes: 7 additions & 0 deletions autogen/extensions/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
#
# SPDX-License-Identifier: Apache-2.0

from .rag import types as RAG

__all__ = ["RAG"]
3 changes: 3 additions & 0 deletions autogen/extensions/rag/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
#
# SPDX-License-Identifier: Apache-2.0
114 changes: 114 additions & 0 deletions autogen/extensions/rag/types.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,114 @@
# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
#
# SPDX-License-Identifier: Apache-2.0

from enum import Enum, auto
from typing import Any, List, Mapping, Optional, Protocol, Sequence, Union, runtime_checkable

from pydantic import AnyUrl, FilePath

# Free-form key/value annotations attached to a record; ``None`` when absent.
Metadata = Optional[Mapping[str, Any]]
# Vector representation of content: a sequence of floats or of ints.
Vector = Union[Sequence[float], Sequence[int]]
# chromadb doesn't support int ids, VikingDB does
ItemID = Union[str, int]
# A pydantic ``FilePath``, a pydantic ``AnyUrl``, or a plain string.
SourceLocation = Union[FilePath, AnyUrl, str]
# Distance paired with each document in a query result.
Distance = float


class NodeType(str, Enum):
    """Kinds of node.

    Members are also ``str`` instances, so each compares equal to its plain
    string value (``NodeType.ENTITY == "entity"``).
    """

    DOCUMENT = "document"
    ENTITY = "entity"
    RELATION = "relation"
    MEMORY = "memory"
    OTHERS = "others"


class DocumentType(Enum):
    """Document formats that a document can be tagged with."""

    # ``auto()`` numbers the members 1..5 in declaration order, so the
    # position of a member in this list determines its integer value.
    TEXT = auto()
    HTML = auto()
    PDF = auto()
    IMAGE = auto()
    AUDIO = auto()


class DatastoreType(str, Enum):
    """Datastore categories: vector, graph, or SQL.

    Members are also ``str`` instances and compare equal to their values.
    """

    VECTOR = "vector"
    GRAPH = "graph"
    SQL = "sql"


@runtime_checkable
class Node(Protocol):
    """Structural interface for a node: optional id, metadata, and content."""

    id: Optional[ItemID] = None
    metadata: Metadata
    content: Any = None

    @property
    def nodetype(self) -> NodeType:
        """Return the ``"nodetype"`` entry of ``metadata``.

        Falls back to ``NodeType.OTHERS`` when ``metadata`` is ``None`` or
        has no ``"nodetype"`` entry. A stored value is returned as-is.
        """
        # ``Metadata`` admits ``None``; calling ``.get`` on it would raise
        # AttributeError, so substitute an empty mapping first.
        return (self.metadata or {}).get("nodetype", NodeType.OTHERS)


@runtime_checkable
class DB(Protocol):
    """Structural interface for a database handle that carries metadata."""

    metadata: Metadata

    @property
    def db_type(self) -> str:
        """Return the ``"db_type"`` entry of ``metadata``.

        Falls back to ``"unknown"`` when ``metadata`` is ``None`` or has no
        ``"db_type"`` entry.
        """
        # ``Metadata`` admits ``None``; guard before calling ``.get``.
        return (self.metadata or {}).get("db_type", "unknown")

    def init(self, *args, **kwargs):
        """Initialization hook; this default does nothing and returns ``None``."""
        pass


@runtime_checkable
class QueryEngine(Protocol):
    """Structural interface for an engine that queries a ``DB``.

    Every method body here is an empty stub that returns ``None``.
    """

    db: DB

    def init_db(self, *args, **kwargs):
        """Stub for database initialization; arguments are implementation-defined."""
        ...

    def add_records(self, new_records: List):
        """Stub for adding ``new_records``."""
        ...

    def query(self, query: str, **kwargs):
        """Stub for running the string ``query``; extra options go in ``kwargs``."""
        ...


@runtime_checkable
class Document(Protocol):
    """Structural interface for a document record.

    metadata: Metadata | additional information about the document; the
        ``"doctype"`` entry, when present, is read by ``doctype``.
    content: object, Optional | the document payload.
    """

    # NOTE: this ``{}`` is a class-level default shared by every class that
    # inherits it without overriding; treat it as read-only.
    metadata: Metadata = {}
    content: Optional[object]

    @property
    def doctype(self) -> str:
        """Return the document type recorded under ``metadata["doctype"]``.

        Falls back to ``DocumentType.TEXT`` when ``metadata`` is ``None`` or
        has no ``"doctype"`` entry. Enum members are unwrapped to their
        ``.value``; any other stored value is returned unchanged.
        """
        # NOTE(review): ``DocumentType`` members are built with ``auto()``,
        # so ``.value`` is an int although the annotation says ``str``.
        # ``Metadata`` admits ``None``; guard before calling ``.get``.
        doctype = (self.metadata or {}).get("doctype", DocumentType.TEXT)
        return getattr(doctype, "value", doctype)


# Public names of this module. ``ItemID`` and ``Distance`` are listed because
# other modules reach them through the package alias (``RAG.ItemID``,
# ``RAG.Distance``).
__all__ = [
    "Metadata",
    "Vector",
    "ItemID",
    "SourceLocation",
    "Distance",
    "NodeType",
    "DocumentType",
    "DatastoreType",
    "Node",
    "DB",
    "QueryEngine",
    "Document",
]

0 comments on commit 3914b15

Please sign in to comment.