diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/.gitignore b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/.gitignore new file mode 100644 index 0000000000000..b79e9dc84ab00 --- /dev/null +++ b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/.gitignore @@ -0,0 +1,180 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# Poetry +poetry.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json + +# Cognee +.data +.env +.local.env +.prod.env +cognee/.data/ +*.lance/ +.cognee_system/ +.data_storage/ +.anon_id +SWE-bench_testsample/ + +full_run.ipynb + +# Environments +.env +.env.local +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/BUILD b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/Makefile b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). 
+ pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. + sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/ diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/README.md b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/README.md new file mode 100644 index 0000000000000..dfcce56ed606d --- /dev/null +++ b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/README.md @@ -0,0 +1,67 @@ +# LlamaIndex Graph Rag Integration: Cognee + +Cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy. 
+ +For more information, visit [Cognee documentation](https://docs.cognee.ai/) + +## Installation + +```shell +pip install llama-index-graph-rag-cognee +``` + +## Usage + +```python +import os +import pandas as pd +import asyncio + +from llama_index.core import Document +from llama_index.graph_rag.cognee import CogneeGraphRAG + + +async def example_graph_rag_cognee(): + # Gather documents to add to GraphRAG + news = pd.read_csv( + "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv" + )[:5] + news.head() + documents = [ + Document(text=f"{row['title']}: {row['text']}") + for i, row in news.iterrows() + ] + + # Instantiate cognee GraphRAG + cogneeRAG = CogneeGraphRAG( + llm_api_key=os.environ["OPENAI_API_KEY"], + llm_provider="openai", + llm_model="gpt-4o-mini", + graph_db_provider="networkx", + vector_db_provider="lancedb", + relational_db_provider="sqlite", + db_name="cognee_db", + ) + + # Add data to cognee + await cogneeRAG.add(documents, "test") + + # Process data into a knowledge graph + await cogneeRAG.process_data("test") + + # Answer prompt based on knowledge graph + search_results = await cogneeRAG.search("person") + print("\n\nExtracted sentences are:\n") + for result in search_results: + print(f"{result}\n") + + # Search for related nodes + search_results = await cogneeRAG.get_related_nodes("person") + print("\n\nRelated nodes are:\n") + for result in search_results: + print(f"{result}\n") + + +if __name__ == "__main__": + asyncio.run(example_graph_rag_cognee()) +``` diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/BUILD b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/BUILD @@ -0,0 +1 @@ +python_sources() diff --git 
# NOTE: Minimal proposal for a GraphRAG protocol in llama-index. It is intended to be
# extended and moved into llama-index-core so that several different GraphRAG
# libraries can share one interface in the future.
class GraphRAG(Protocol):
    """Interface for graph-based retrieval-augmented generation backends.

    A GraphRAG implementation is responsible for adding, storing, processing
    and retrieving information from knowledge graphs.

    Attributes:
        llm_api_key: str: Api key for desired llm.
        graph_db_provider: str: The graph database provider.
        vector_db_provider: str: The vector database provider.
        relational_db_provider: str: The relational database provider.
        db_name: str: The name of the databases.
    """

    @abstractmethod
    async def add(self, data, dataset_name):
        """Store data in the named dataset.

        The stored data is turned into a knowledge graph later, by
        `process_data`.

        Args:
            data (Any): The data to be added to the graph.
            dataset_name (str): Name of the dataset or node set that receives the data.
        """

    @abstractmethod
    async def process_data(self, dataset_name: str):
        """Structure the data of a dataset and build a knowledge graph from it.

        Args:
            dataset_name (str): Name of the dataset to process.
        """

    @abstractmethod
    async def search(self, query: str):
        """Look up information in the graph that is relevant to a query.

        Args:
            query (str): Query string matched against data from the graph.
        """

    @abstractmethod
    async def get_related_nodes(self, node_id: str):
        """Look up nodes or relationships in the graph related to a node.

        Args:
            node_id (str): Name of the node matched against nodes in the graph.
        """
class CogneeGraphRAG(GraphRAG):
    """Cognee GraphRAG, handles adding, storing, processing and retrieving information from knowledge graphs.

    Unlike traditional RAG models that retrieve unstructured text snippets, graphRAG utilizes knowledge graphs.
    A knowledge graph represents entities as nodes and their relationships as edges, often in a structured semantic format.
    This enables the system to retrieve more precise and structured information about an entity, its relationships, and its properties.

    The constructor arguments are forwarded to `cognee.config`; they are not
    stored on the instance.

    Args:
        llm_api_key (str): Api key for desired llm.
        llm_provider (str): Provider for desired llm.
        llm_model (str): Model for desired llm.
        graph_db_provider (str): The graph database provider.
        vector_db_provider (str): The vector database provider.
        relational_db_provider (str): The relational database provider.
        db_name (str): The name of the databases.
    """

    def __init__(
        self,
        llm_api_key: str,
        llm_provider: str = "openai",
        llm_model: str = "gpt-4o-mini",
        graph_db_provider: str = "networkx",
        vector_db_provider: str = "lancedb",
        relational_db_provider: str = "sqlite",
        db_name: str = "cognee_db",
    ) -> None:
        # NOTE(review): these are module-level cognee.config calls, so the settings
        # presumably apply to every CogneeGraphRAG in the process - confirm.
        cognee.config.set_llm_config(
            {
                "llm_api_key": llm_api_key,
                "llm_provider": llm_provider,
                "llm_model": llm_model,
            }
        )

        cognee.config.set_vector_db_config({"vector_db_provider": vector_db_provider})
        cognee.config.set_relational_db_config(
            {"db_provider": relational_db_provider, "db_name": db_name}
        )
        cognee.config.set_graph_database_provider(graph_db_provider)

        # Both storage roots live next to this module file.
        package_dir = pathlib.Path(__file__).parent
        cognee.config.data_root_directory(
            str((package_dir / ".data_storage").resolve())
        )
        cognee.config.system_root_directory(
            str((package_dir / ".cognee_system").resolve())
        )

    async def add(
        self, data: Union[Document, List[Document]], dataset_name: str
    ) -> None:
        """Add data to the specified dataset.

        This data will later be processed and made into a knowledge graph.
        LlamaIndex documents are reduced to their text before being handed to
        cognee.

        Args:
            data (Union[Document, List[Document]]): A single document or a list of
                documents. Items of a non-empty list that are not `Document`
                instances are skipped; an empty list or any other value is
                forwarded to `cognee.add` unchanged.
            dataset_name (str): Name of the dataset or node set where the data will be added.
        """
        # isinstance (rather than an exact type comparison) so that Document
        # subclasses are converted to text as well.
        if isinstance(data, Document):
            payload: object = [data.text]
        elif isinstance(data, list) and data:
            payload = [item.text for item in data if isinstance(item, Document)]
        else:
            payload = data

        await cognee.add(payload, dataset_name)

    async def process_data(self, dataset_names: str) -> None:
        """Process and structure data in the dataset and make a knowledge graph out of it.

        The datasets are looked up by name for cognee's default user and then
        passed to `cognee.cognify`.

        Args:
            dataset_names (str): The dataset name to process.
        """
        user = await cognee.modules.users.methods.get_default_user()
        datasets = await cognee.modules.data.methods.get_datasets_by_name(
            dataset_names, user.id
        )
        await cognee.cognify(datasets, user)

    async def get_graph_url(
        self,
        graphistry_password: Union[str, None],
        graphistry_username: Union[str, None],
    ) -> str:
        """Retrieve the URL or endpoint for visualizing or interacting with the graph.

        Note the positional order: the password comes first, then the username.

        Args:
            graphistry_password (Union[str, None]): Graphistry account password.
            graphistry_username (Union[str, None]): Graphistry account username.

        Returns:
            str: The URL endpoint of the graph.
        """
        # Imported lazily so graphistry is only required when a graph is rendered.
        from cognee.shared.utils import render_graph
        from cognee.infrastructure.databases.graph import get_graph_engine
        import graphistry

        # Only configure and log in when both credentials are present; logging in
        # with missing credentials cannot succeed.
        if graphistry_password and graphistry_username:
            cognee.config.set_graphistry_config(
                {"username": graphistry_username, "password": graphistry_password}
            )
            graphistry.login(
                username=graphistry_username,
                password=graphistry_password,
            )

        graph_engine = await get_graph_engine()
        graph_url = await render_graph(graph_engine.graph)
        # NOTE(review): prints to stdout from library code; consider logging instead.
        print(graph_url)
        return graph_url

    async def _search(self, search_type, query: str) -> list:  # type: ignore[no-untyped-def]
        """Run a cognee search of the given type as cognee's default user."""
        user = await cognee.modules.users.methods.get_default_user()
        return await cognee.search(search_type, query, user)

    async def search(self, query: str) -> list:
        """Search the graph for relevant information based on a query.

        Runs a cognee search of type `SearchType.SUMMARIES`.

        Args:
            query (str): The query string to match against data from the graph.

        Returns:
            list: The results returned by `cognee.search`.
        """
        return await self._search(cognee.api.v1.search.SearchType.SUMMARIES, query)

    async def get_related_nodes(self, node_id: str) -> list:
        """Search the graph for relevant nodes or relationships based on node id.

        Runs a cognee search of type `SearchType.INSIGHTS`.

        Args:
            node_id (str): The name of the node to match against nodes in the graph.

        Returns:
            list: The results returned by `cognee.search`.
        """
        return await self._search(cognee.api.v1.search.SearchType.INSIGHTS, node_id)
@pytest.mark.asyncio()
async def test_add_data(monkeypatch):
    """Check that `CogneeGraphRAG.add` hands document texts to `cognee.add`."""
    # Instantiate cognee GraphRAG
    cogneeGraphRAG = CogneeGraphRAG(
        llm_api_key="",
        llm_provider="openai",
        llm_model="gpt-4o-mini",
        graph_db_provider="networkx",
        vector_db_provider="lancedb",
        relational_db_provider="sqlite",
        db_name="cognee_db",
    )

    # Replace cognee.add with a recorder so that nothing is actually ingested
    recorded_calls = []

    async def mock_add_return(data, dataset_name):
        recorded_calls.append((data, dataset_name))
        return True

    import cognee

    monkeypatch.setattr(cognee, "add", mock_add_return)

    # Gather documents to add to GraphRAG
    documents = [
        Document(
            text="Jessica Miller, Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams."
        ),
        Document(
            text="David Thompson, Creative Graphic Designer with over 8 years of experience in visual design and branding."
        ),
    ]

    await cogneeGraphRAG.add(documents, "test")
    await cogneeGraphRAG.add(documents[0], "test")

    # Both a list of documents and a single document must arrive as a list of texts
    assert recorded_calls == [
        ([document.text for document in documents], "test"),
        ([documents[0].text], "test"),
    ], "Documents were not converted to text before being added"


if __name__ == "__main__":
    # The test needs pytest's monkeypatch fixture, so it cannot be called directly.
    raise SystemExit(pytest.main([__file__]))
@pytest.mark.asyncio()
async def test_get_graph_url(monkeypatch):
    """Check that `get_graph_url` stores the credentials and returns the rendered URL."""
    # Instantiate cognee GraphRAG
    cogneeRAG = CogneeGraphRAG(
        llm_api_key="",
        llm_provider="openai",
        llm_model="gpt-4o-mini",
        graph_db_provider="networkx",
        vector_db_provider="lancedb",
        relational_db_provider="sqlite",
        db_name="cognee_db",
    )

    # Mock logging in to graphistry
    def mock_graphistry_return(username, password):
        return True

    import graphistry

    monkeypatch.setattr(graphistry, "login", mock_graphistry_return)

    # Mock render of graph
    async def mock_render_return(graph):
        return "link"

    from cognee.shared import utils

    monkeypatch.setattr(utils, "render_graph", mock_render_return)

    # get_graph_url takes the password first, then the username
    graph_url = await cogneeRAG.get_graph_url("password", "username")

    assert graph_url == "link", "Rendered graph URL was not returned"

    from cognee.base_config import get_base_config

    assert (
        get_base_config().graphistry_password == "password"
    ), "Password was not set properly"
    assert (
        get_base_config().graphistry_username == "username"
    ), "Username was not set properly"


if __name__ == "__main__":
    # The test needs pytest's monkeypatch fixture, so it cannot be called directly.
    raise SystemExit(pytest.main([__file__]))
@pytest.mark.skipif(
    os.getenv("OPENAI_API_KEY") is None,
    reason="OPENAI_API_KEY not available to test Cognee integration",
)
@pytest.mark.asyncio()
async def test_graph_rag_cognee():
    """End-to-end run: add documents, build the graph, then search it.

    Skipped under pytest when OPENAI_API_KEY is not set.
    """
    documents = [
        Document(
            text="Jessica Miller, Experienced Sales Manager with a strong track record in driving sales growth and building high-performing teams."
        ),
        Document(
            text="David Thompson, Creative Graphic Designer with over 8 years of experience in visual design and branding."
        ),
    ]

    # Instantiate cognee GraphRAG
    cogneeRAG = CogneeGraphRAG(
        llm_api_key=os.environ["OPENAI_API_KEY"],
        llm_provider="openai",
        llm_model="gpt-4o-mini",
        graph_db_provider="networkx",
        vector_db_provider="lancedb",
        relational_db_provider="sqlite",
        db_name="cognee_db",
    )

    try:
        # Add data to cognee
        await cogneeRAG.add(documents, "test")
        # Process data into a knowledge graph
        await cogneeRAG.process_data("test")

        # Answer prompt based on knowledge graph
        search_results = await cogneeRAG.search("person")

        assert len(search_results) > 0, "No search results found"

        print("\n\nExtracted sentences are:\n")
        for result in search_results:
            print(f"{result}\n")

        # Search for related nodes
        search_results = await cogneeRAG.get_related_nodes("person")
        print("\n\nRelated nodes are:\n")
        for result in search_results:
            print(f"{result}\n")

        assert len(search_results) > 0, "No search results found"
    finally:
        # Clean up even when an assertion above fails, so that a failed run does
        # not leave data behind for the next one.
        await cognee.prune.prune_data()
        await cognee.prune.prune_system(metadata=True)


if __name__ == "__main__":
    asyncio.run(test_graph_rag_cognee())