ag2ai · AgentGenie · Dec 6, 2024 · Dec 3, 2024 · Dec 4, 2024 · Dec 4, 2024
diff --git a/.github/workflows/contrib-graph-rag-tests.yml b/.github/workflows/contrib-graph-rag-tests.yml
@@ -64,3 +64,49 @@ jobs:
         with:
           file: ./coverage.xml
           flags: unittests
+
+  GraphRagIntegrationTest-Neo4j-Llmaindex-Ubuntu:
+    runs-on: ubuntu-latest
+    strategy:
+      fail-fast: false
+      matrix:
+        python-version: ["3.10", "3.11"]
+    services:
+      neo4j:
+        image: neo4j:latest
+        ports:
+          - 7687:7687
+          - 7474:7474
+        env:
+          NEO4J_AUTH: neo4j/password
+    steps:
+      - uses: actions/checkout@v4
+      - name: Set up Python ${{ matrix.python-version }}
+        uses: actions/setup-python@v5
+        with:
+          python-version: ${{ matrix.python-version }}
+      - name: Install packages and dependencies for all tests
+        run: |
+          python -m pip install --upgrade pip wheel
+          pip install pytest
+      - name: Install Neo4j and Llama-index when on linux
+        run: |
+          pip install -e .[neo4j_graph_rag]
+      - name: Set AUTOGEN_USE_DOCKER based on OS
+        shell: bash
+        run: |
+          echo "AUTOGEN_USE_DOCKER=False" >> $GITHUB_ENV
+      - name: Coverage
+        env:
+          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
+          AZURE_OPENAI_API_KEY: ${{ secrets.AZURE_OPENAI_API_KEY }}
+          AZURE_OPENAI_API_BASE: ${{ secrets.AZURE_OPENAI_API_BASE }}
+          OAI_CONFIG_LIST: ${{ secrets.OAI_CONFIG_LIST }}
+        run: |
+          pip install pytest-cov>=5
+          pytest test/agentchat/contrib/graph_rag/test_neo4j_graph_rag.py --skip-openai
+      - name: Upload coverage to Codecov
+        uses: codecov/codecov-action@v3
+        with:
+          file: ./coverage.xml
+          flags: unittests
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -48,7 +48,9 @@ repos:
               website/docs/topics/code-execution/custom-executor.ipynb |
               website/docs/topics/non-openai-models/cloud-gemini.ipynb |
               notebook/.* |
-              test/agentchat/contrib/graph_rag/trip_planner_data/.*
+              test/agentchat/contrib/graph_rag/trip_planner_data/.* |
+              test/agentchat/contrib/graph_rag/paul_graham_essay.txt
+
             )$
   # See https://jaredkhan.com/blog/mypy-pre-commit
   - repo: local

diff --git a/autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py b/autogen/agentchat/contrib/graph_rag/neo4j_graph_query_engine.py
@@ -0,0 +1,185 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
+import os
+from typing import Dict, List, Optional, TypeAlias, Union
+
+from llama_index.core import PropertyGraphIndex, SimpleDirectoryReader
+from llama_index.core.base.embeddings.base import BaseEmbedding
+from llama_index.core.indices.property_graph import SchemaLLMPathExtractor
+from llama_index.core.indices.property_graph.transformations.schema_llm import Triple
+from llama_index.core.llms import LLM
+from llama_index.embeddings.openai import OpenAIEmbedding
+from llama_index.graph_stores.neo4j import Neo4jPropertyGraphStore
+from llama_index.llms.openai import OpenAI
+
+from .document import Document
+from .graph_query_engine import GraphQueryEngine, GraphStoreQueryResult
+
+
+class Neo4jGraphQueryEngine(GraphQueryEngine):
+    """
+    This class serves as a wrapper for a Neo4j database-backed PropertyGraphIndex query engine,
+    facilitating the creation, updating, and querying of graphs.
+
+    It builds a PropertyGraph Index from input documents,
+    storing and retrieving data from a property graph in the Neo4j database.
+
+    Using SchemaLLMPathExtractor, it defines schemas with entities, relationships, and other properties based on the input,
+    which are added into the preprty graph.
+
+    For usage, please refer to example notebook/agentchat_graph_rag_neo4j.ipynb
+    """
+
+    def __init__(
+        self,
+        host: str = "bolt://localhost",
+        port: int = 7687,
+        database: str = "neo4j",
+        username: str = "neo4j",
+        password: str = "neo4j",
+        llm: LLM = OpenAI(model="gpt-3.5-turbo", temperature=0.0),
+        embedding: BaseEmbedding = OpenAIEmbedding(model_name="text-embedding-3-small"),
+        entities: Optional[TypeAlias] = None,
+        relations: Optional[TypeAlias] = None,
+        validation_schema: Optional[Union[Dict[str, str], List[Triple]]] = None,
+        strict: Optional[bool] = True,
+    ):
+        """
+        Initialize a Neo4j Property graph.
+        Please also refer to https://docs.llamaindex.ai/en/stable/examples/property_graph/graph_store/
+
+        Args:
+            name (str): Property graph name.
+            host (str): Neo4j hostname.
+            port (int): Neo4j port number.
+            database (str): Neo4j database name.
+            username (str): Neo4j username.
+            password (str): Neo4j password.
+            llm (LLM): Language model to use for extracting tripletss.
+            embedding (BaseEmbedding): Embedding model to use constructing index and query
+            entities (Optional[TypeAlias]): Custom possible entities to include in the graph.
+            relations (Optional[TypeAlias]): Custom poissble relations to include in the graph.
+            validation_schema (Optional[Union[Dict[str, str], List[Triple]]): Custom schema to validate the extracted triplets
+            strict (Optional[bool]): If false, allows for values outside of the schema, useful for using the schema as a suggestion.
+        """
+        self.host = host
+        self.port = port
+        self.database = database
+        self.username = username
+        self.password = password
+        self.llm = llm
+        self.embedding = embedding
+        self.entities = entities
+        self.relations = relations
+        self.validation_schema = validation_schema
+        self.strict = strict
+
+    def init_db(self, input_doc: List[Document] | None = None):
+        """
+        Build the knowledge graph with input documents.
+        """
+        self.input_files = []
+        for doc in input_doc:
+            if os.path.exists(doc.path_or_url):
+                self.input_files.append(doc.path_or_url)
+            else:
+                raise ValueError(f"Document file not found: {doc.path_or_url}")
+
+        self.graph_store = Neo4jPropertyGraphStore(
+            username=self.username,
+            password=self.password,
+            url=self.host + ":" + str(self.port),
+            database=self.database,
+        )
+
+        # delete all entities and relationships in case a graph pre-exists
+        self._clear()
+
+        self.documents = SimpleDirectoryReader(input_files=self.input_files).load_data()
+
+        # Extract paths following a strict schema of allowed entities, relationships, and which entities can be connected to which relationships.
+        # To add more extractors, please refer to https://docs.llamaindex.ai/en/latest/module_guides/indexing/lpg_index_guide/#construction
+        self.kg_extractors = [
+            SchemaLLMPathExtractor(
+                llm=self.llm,
+                possible_entities=self.entities,
+                possible_relations=self.relations,
+                kg_validation_schema=self.validation_schema,
+                strict=self.strict,
+            )
+        ]
+
+        self.index = PropertyGraphIndex.from_documents(
+            self.documents,
+            embed_model=self.embedding,
+            kg_extractors=self.kg_extractors,
+            property_graph_store=self.graph_store,
+            show_progress=True,
+        )
+
+    def add_records(self, new_records: List) -> bool:
+        """
+        Add new records to the knowledge graph. Must be local files.
+
+        Args:
+            new_records (List[Document]): List of new documents to add.
+
+        Returns:
+            bool: True if successful, False otherwise.
+        """
+        if self.graph_store is None:
+            raise ValueError("Knowledge graph is not initialized. Please call init_db first.")
+
+        try:
+            """
+            SimpleDirectoryReader will select the best file reader based on the file extensions, including:
+            [DocxReader, EpubReader, HWPReader, ImageReader, IPYNBReader, MarkdownReader, MboxReader,
+            PandasCSVReader, PandasExcelReader,PDFReader,PptxReader, VideoAudioReader]
+            """
+            new_documents = SimpleDirectoryReader(input_files=[doc.path_or_url for doc in new_records]).load_data()
+
+            for doc in new_documents:
+                self.index.insert(doc)
+
+            return True
+        except Exception as e:
+            print(f"Error adding records: {e}")
+            return False
+
+    def query(self, question: str, n_results: int = 1, **kwargs) -> GraphStoreQueryResult:
+        """
+        Query the knowledge graph with a question.
+
+        Args:
+        question: a human input question.
+        n_results: number of results to return.
+
+        Returns:
+        A GrapStoreQueryResult object containing the answer and related triplets.
+        """
+        if self.graph_store is None:
+            raise ValueError("Knowledge graph is not created.")
+
+        # query the graph to get the answer
+        query_engine = self.index.as_query_engine(include_text=True)
+        response = str(query_engine.query(question))
+
+        # retrieve source triplets that are semantically related to the question
+        retriever = self.index.as_retriever(include_text=False)
+        nodes = retriever.retrieve(question)
+        triplets = []
+        for node in nodes:
+            entities = [sub.split("(")[0].strip() for sub in node.text.split("->")]
+            triplet = " -> ".join(entities)
+            triplets.append(triplet)
+
+        return GraphStoreQueryResult(answer=response, results=triplets)
+
+    def _clear(self) -> None:
+        """
+        Delete all entities and relationships in the graph.
+        TODO: Delete all the data in the database including indexes and constraints.
+        """
+        with self.graph_store._driver.session() as session:
+            session.run("MATCH (n) DETACH DELETE n;")
diff --git a/autogen/agentchat/contrib/graph_rag/neo4j_graph_rag_capability.py b/autogen/agentchat/contrib/graph_rag/neo4j_graph_rag_capability.py
@@ -0,0 +1,83 @@
+# Copyright (c) 2023 - 2024, Owners of https://github.com/ag2ai
+#
+# SPDX-License-Identifier: Apache-2.0
+
+from typing import Any, Dict, List, Optional, Tuple, Union
+
+from autogen import Agent, ConversableAgent, UserProxyAgent
+
+from .graph_query_engine import GraphStoreQueryResult
+from .graph_rag_capability import GraphRagCapability
+from .neo4j_graph_query_engine import Neo4jGraphQueryEngine
+
+
+class Neo4jGraphCapability(GraphRagCapability):
+    """
+    The Neo4j graph capability integrates Neo4j Property graph into a graph rag agent.
+    Ref: https://neo4j.com/labs/genai-ecosystem/llamaindex/#_property_graph_constructing_modules/
+
+
+    For usage, please refer to example notebook/agentchat_graph_rag_neo4j.ipynb
+    """
+
+    def __init__(self, query_engine: Neo4jGraphQueryEngine):
+        """
+        initialize GraphRAG capability with a graph query engine
+        """
+        self.query_engine = query_engine
+
+    def add_to_agent(self, agent: UserProxyAgent):
+        """
+        Add Neo4j GraphRAG capability to a UserProxyAgent.
+        The restriction to a UserProxyAgent to make sure the returned message only contains information retrieved from the graph DB instead of any LLMs.
+        """
+
+        self.graph_rag_agent = agent
+
+        # Validate the agent config
+        if agent.llm_config not in (None, False):
+            raise Exception(
+                "Agents with GraphRAG capabilities do not use an LLM configuration. Please set your llm_config to None or False."
+            )
+
+        # Register method to generate the reply using a Neo4j query
+        # All other reply methods will be removed
+        agent.register_reply(
+            [ConversableAgent, None], self._reply_using_neo4j_query, position=0, remove_other_reply_funcs=True
+        )
+
+    def _reply_using_neo4j_query(
+        self,
+        recipient: ConversableAgent,
+        messages: Optional[List[Dict]] = None,
+        sender: Optional[Agent] = None,
+        config: Optional[Any] = None,
+    ) -> Tuple[bool, Union[str, Dict, None]]:
+        """
+        Query neo4j and return the message. Internally, it queries the Property graph
+        and returns the answer from the graph query engine.
+        TODO: reply with a dictionary including both the answer and semantic source triplets.
+
+        Args:
+            recipient: The agent instance that will receive the message.
+            messages: A list of messages in the conversation history with the sender.
+            sender: The agent instance that sent the message.
+            config: Optional configuration for message processing.
+
+        Returns:
+            A tuple containing a boolean indicating success and the assistant's reply.
+        """
+        question = self._get_last_question(messages[-1])
+
+        result: GraphStoreQueryResult = self.query_engine.query(question)
+
+        return True, result.answer
+
+    def _get_last_question(self, message: Union[Dict, str]):
+        """Retrieves the last message from the conversation history."""
+        if isinstance(message, str):
+            return message
+        if isinstance(message, Dict):
+            if "content" in message:
+                return message["content"]
+        return None