Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Neo4j Graph Rag Integration #139

Merged
merged 15 commits into from
Dec 6, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
46 changes: 46 additions & 0 deletions .github/workflows/contrib-graph-rag-tests.yml
Original file line number Diff line number Diff line change
Expand Up @@ -64,3 +64,49 @@ jobs:
with:
file: ./coverage.xml
flags: unittests

# Integration tests for the Neo4j + LlamaIndex GraphRAG engine.
# A Neo4j container is started as a service; its Bolt (7687) and HTTP (7474)
# ports are published on the runner so the tests can reach it on localhost.
GraphRagIntegrationTest-Neo4j-Llmaindex-Ubuntu:
  runs-on: ubuntu-latest
  strategy:
    fail-fast: false
    matrix:
      python-version: ["3.10", "3.11"]
  services:
    neo4j:
      image: neo4j:latest
      ports:
        - 7687:7687
        - 7474:7474
      env:
        # Format is <user>/<password>.
        NEO4J_AUTH: neo4j/password
  steps:
    - uses: actions/checkout@v4
    - name: Set up Python ${{ matrix.python-version }}
      uses: actions/setup-python@v5
      with:
        python-version: ${{ matrix.python-version }}
    - name: Install packages and dependencies for all tests
      run: |
        python -m pip install --upgrade pip wheel
        pip install pytest
    - name: Install Neo4j and Llama-index when on linux
      run: |
        pip install -e .[neo4j_graph_rag]
    - name: Set AUTOGEN_USE_DOCKER based on OS
      shell: bash
      run: |
        echo "AUTOGEN_USE_DOCKER=False" >> $GITHUB_ENV
    - name: Coverage
      env:
        OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
        AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
        AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
        OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }}
      run: |
        # The requirement must be quoted: unquoted, the shell parses ">=5" as
        # a redirection to a file named "=5" and the version bound is dropped.
        pip install "pytest-cov>=5"
        # NOTE(review): pytest is invoked without --cov/--cov-report=xml, so
        # ./coverage.xml is presumably not produced here unless coverage is
        # configured elsewhere in the repo - confirm before relying on the upload.
        pytest test/agentchat/contrib/graph_rag/test_neo4j_graph_rag.py --skip-openai
    - name: Upload coverage to Codecov
      uses: codecov/codecov-action@v3
      with:
        file: ./coverage.xml
        flags: unittests
4 changes: 3 additions & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,9 @@ repos:
website/docs/topics/code-execution/custom-executor.ipynb |
website/docs/topics/non-openai-models/cloud-gemini.ipynb |
notebook/.* |
test/agentchat/contrib/graph_rag/trip_planner_data/.*
test/agentchat/contrib/graph_rag/trip_planner_data/.* |
test/agentchat/contrib/graph_rag/paul_graham_essay.txt

)$
# See https://jaredkhan.com/blog/mypy-pre-commit
- repo: local
Expand Down
185 changes: 185 additions & 0 deletions autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,185 @@
# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
#
# SPDX-License-Identifier: Apache-2.0
import os
from typing import Dict, List, Optional, TypeAlias, Union

from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader
from llama_index.core.base.embeddings.base import BaseEmbedding
from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
from llama_index.core.indices.property_graph.transformations.schema_llm import Triple
from llama_index.core.llms import LLM
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
from llama_index.llms.openai import OpenAI

from .document import Document
from .graph_query_engine import GraphQueryEngine, GraphStoreQueryResult


class Neo4jGraphQueryEngine(GraphQueryEngine):
    """
    Wrapper around a Neo4j-backed LlamaIndex PropertyGraphIndex query engine.

    It builds a PropertyGraphIndex from input documents, stores the resulting
    property graph in a Neo4j database, and supports adding documents and
    querying the graph afterwards.

    Entities, relationships and the validation schema are passed to a
    SchemaLLMPathExtractor, which extracts the paths that are added to the
    property graph.

    For usage, please refer to example notebook/agentchat_graph_rag_neo4j.ipynb
    """

    def __init__(
        self,
        host: str = "bolt://localhost",
        port: int = 7687,
        database: str = "neo4j",
        username: str = "neo4j",
        password: str = "neo4j",
        llm: LLM = OpenAI(model="gpt-3.5-turbo", temperature=0.0),
        embedding: BaseEmbedding = OpenAIEmbedding(model_name="text-embedding-3-small"),
        entities: Optional[TypeAlias] = None,
        relations: Optional[TypeAlias] = None,
        validation_schema: Optional[Union[Dict[str, str], List[Triple]]] = None,
        strict: Optional[bool] = True,
    ):
        """
        Store the connection and extraction settings for a Neo4j property graph.

        No connection is opened here; the graph store and index are created by
        `init_db`.
        Please also refer to https://docs.llamaindex.ai/en/stable/examples/property_graph/graph_store/

        NOTE: the default `llm` and `embedding` objects are created once, when
        this class is defined, and are therefore shared by every engine that
        does not pass its own.

        Args:
            host (str): Neo4j URL without the port, e.g. "bolt://localhost".
            port (int): Neo4j port number.
            database (str): Neo4j database name.
            username (str): Neo4j username.
            password (str): Neo4j password.
            llm (LLM): Language model used to extract triplets.
            embedding (BaseEmbedding): Embedding model used to construct the index.
            entities (Optional[TypeAlias]): Custom possible entities to include in the graph.
            relations (Optional[TypeAlias]): Custom possible relations to include in the graph.
            validation_schema (Optional[Union[Dict[str, str], List[Triple]]]): Custom schema to validate the extracted triplets.
            strict (Optional[bool]): If false, allows for values outside of the schema, useful for using the schema as a suggestion.
        """
        self.host = host
        self.port = port
        self.database = database
        self.username = username
        self.password = password
        self.llm = llm
        self.embedding = embedding
        self.entities = entities
        self.relations = relations
        self.validation_schema = validation_schema
        self.strict = strict

        # Created by init_db. Defined here so the "not initialized" guards in
        # add_records/query raise ValueError rather than AttributeError.
        self.graph_store = None
        self.index = None

    def init_db(self, input_doc: Optional[List[Document]] = None):
        """
        Build the knowledge graph from the input documents.

        All nodes and relationships already present in the target database are
        deleted before the new graph is built.

        Args:
            input_doc (Optional[List[Document]]): Documents whose `path_or_url`
                points to an existing local file.

        Raises:
            ValueError: If no documents are given or a document file does not exist.
        """
        # Fail before touching the database: otherwise an empty input would
        # wipe the existing graph and only then fail while reading documents.
        if not input_doc:
            raise ValueError("No input documents provided to build the knowledge graph.")

        self.input_files = []
        for doc in input_doc:
            if not os.path.exists(doc.path_or_url):
                raise ValueError(f"Document file not found: {doc.path_or_url}")
            self.input_files.append(doc.path_or_url)

        # Read the documents before clearing the database so that a read
        # failure does not leave the graph emptied.
        self.documents = SimpleDirectoryReader(input_files=self.input_files).load_data()

        self.graph_store = Neo4jPropertyGraphStore(
            username=self.username,
            password=self.password,
            url=f"{self.host}:{self.port}",
            database=self.database,
        )

        # delete all entities and relationships in case a graph pre-exists
        self._clear()

        # Extract paths following a strict schema of allowed entities, relationships, and which entities can be connected to which relationships.
        # To add more extractors, please refer to https://docs.llamaindex.ai/en/latest/module_guides/indexing/lpg_index_guide/#construction
        self.kg_extractors = [
            SchemaLLMPathExtractor(
                llm=self.llm,
                possible_entities=self.entities,
                possible_relations=self.relations,
                kg_validation_schema=self.validation_schema,
                strict=self.strict,
            )
        ]

        self.index = PropertyGraphIndex.from_documents(
            self.documents,
            embed_model=self.embedding,
            kg_extractors=self.kg_extractors,
            property_graph_store=self.graph_store,
            show_progress=True,
        )

    def add_records(self, new_records: List[Document]) -> bool:
        """
        Add new records to the knowledge graph. Must be local files.

        Args:
            new_records (List[Document]): List of new documents to add.

        Returns:
            bool: True if successful, False otherwise (the error is printed).

        Raises:
            ValueError: If `init_db` has not built the graph yet.
        """
        if self.graph_store is None or self.index is None:
            raise ValueError("Knowledge graph is not initialized. Please call init_db first.")

        try:
            # SimpleDirectoryReader will select the best file reader based on the file extensions, including:
            # [DocxReader, EpubReader, HWPReader, ImageReader, IPYNBReader, MarkdownReader, MboxReader,
            # PandasCSVReader, PandasExcelReader, PDFReader, PptxReader, VideoAudioReader]
            new_documents = SimpleDirectoryReader(input_files=[doc.path_or_url for doc in new_records]).load_data()

            for doc in new_documents:
                self.index.insert(doc)

            return True
        except Exception as e:
            # Deliberate best-effort: the contract is to report failure via the
            # return value instead of propagating the error.
            print(f"Error adding records: {e}")
            return False

    def query(self, question: str, n_results: int = 1, **kwargs) -> GraphStoreQueryResult:
        """
        Query the knowledge graph with a question.

        Args:
            question: a human input question.
            n_results: number of results to return. Currently not used by this
                implementation; accepted for interface compatibility.
            **kwargs: currently not used by this implementation.

        Returns:
            A GraphStoreQueryResult object containing the answer and related triplets.

        Raises:
            ValueError: If `init_db` has not built the graph yet.
        """
        if self.graph_store is None or self.index is None:
            raise ValueError("Knowledge graph is not created.")

        # query the graph to get the answer
        query_engine = self.index.as_query_engine(include_text=True)
        response = str(query_engine.query(question))

        # retrieve source triplets that are semantically related to the question
        retriever = self.index.as_retriever(include_text=False)
        nodes = retriever.retrieve(question)

        # Each node text is split on "->"; within every segment anything from
        # the first "(" onward is dropped before the parts are re-joined.
        # (presumably the text looks like "subj (props) -> rel -> obj (props)" - TODO confirm)
        triplets = [
            " -> ".join(segment.split("(")[0].strip() for segment in node.text.split("->")) for node in nodes
        ]

        return GraphStoreQueryResult(answer=response, results=triplets)

    def _clear(self) -> None:
        """
        Delete all entities and relationships in the graph.
        TODO: Delete all the data in the database including indexes and constraints.
        """
        # Target the configured database explicitly; a session opened without
        # `database=` runs against the server's default database, which may not
        # be the one this engine was configured with.
        with self.graph_store._driver.session(database=self.database) as session:
            session.run("MATCH (n) DETACH DELETE n;")
Eric-Shang marked this conversation as resolved.
Show resolved Hide resolved
83 changes: 83 additions & 0 deletions autogen/agentchat/contrib/graph_rag/neo4j_graph_rag_capability.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
#
# SPDX-License-Identifier: Apache-2.0

from typing import Any, Dict, List, Optional, Tuple, Union

from autogen import Agent, ConversableAgent, UserProxyAgent

from .graph_query_engine import GraphStoreQueryResult
from .graph_rag_capability import GraphRagCapability
from .neo4j_graph_query_engine import Neo4jGraphQueryEngine


class Neo4jGraphCapability(GraphRagCapability):
    """
    The Neo4j graph capability integrates Neo4j Property graph into a graph rag agent.
    Ref: https://neo4j.com/labs/genai-ecosystem/llamaindex/#_property_graph_constructing_modules/


    For usage, please refer to example notebook/agentchat_graph_rag_neo4j.ipynb
    """

    def __init__(self, query_engine: Neo4jGraphQueryEngine):
        """
        Initialize the GraphRAG capability with a graph query engine.

        Args:
            query_engine: The Neo4j query engine used to answer incoming questions.
        """
        self.query_engine = query_engine

    def add_to_agent(self, agent: UserProxyAgent):
        """
        Add Neo4j GraphRAG capability to a UserProxyAgent.
        The restriction to a UserProxyAgent to make sure the returned message only contains information retrieved from the graph DB instead of any LLMs.

        Args:
            agent: The agent that will answer using the graph query engine.

        Raises:
            Exception: If the agent has an LLM configuration other than None or False.
        """

        self.graph_rag_agent = agent

        # Validate the agent config
        if agent.llm_config not in (None, False):
            raise Exception(
                "Agents with GraphRAG capabilities do not use an LLM configuration. Please set your llm_config to None or False."
            )

        # Register method to generate the reply using a Neo4j query
        # All other reply methods will be removed
        agent.register_reply(
            [ConversableAgent, None], self._reply_using_neo4j_query, position=0, remove_other_reply_funcs=True
        )

    def _reply_using_neo4j_query(
        self,
        recipient: ConversableAgent,
        messages: Optional[List[Dict]] = None,
        sender: Optional[Agent] = None,
        config: Optional[Any] = None,
    ) -> Tuple[bool, Union[str, Dict, None]]:
        """
        Query neo4j and return the message. Internally, it queries the Property graph
        and returns the answer from the graph query engine.
        TODO: reply with a dictionary including both the answer and semantic source triplets.

        Args:
            recipient: The agent instance that will receive the message.
            messages: A list of messages in the conversation history with the sender.
            sender: The agent instance that sent the message.
            config: Optional configuration for message processing.

        Returns:
            A tuple containing a boolean indicating success and the assistant's reply.
            `(False, None)` is returned when there is no message or the last
            message carries no question to send to the graph.
        """
        # Nothing to answer: avoid indexing into a missing/empty history.
        if not messages:
            return False, None

        question = self._get_last_question(messages[-1])
        # Do not forward an empty/None question to the query engine.
        if not question:
            return False, None

        result: GraphStoreQueryResult = self.query_engine.query(question)

        return True, result.answer

    def _get_last_question(self, message: Union[Dict, str]) -> Optional[str]:
        """
        Extract the question text from a single message.

        Args:
            message: Either the raw message string or a message dict.

        Returns:
            The string itself, the dict's "content" value, or None when the
            message is a dict without "content" or of another type.
        """
        if isinstance(message, str):
            return message
        if isinstance(message, dict) and "content" in message:
            return message["content"]
        return None
Loading
Loading