run-llama · logan-markewich · Dec 20, 2024 · Dec 16, 2024 · Dec 16, 2024 · Dec 16, 2024
diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/.gitignore b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/.gitignore
@@ -0,0 +1,177 @@
+llama_index/_static
+.DS_Store
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+bin/
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+etc/
+include/
+lib/
+lib64/
+parts/
+sdist/
+share/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+.ruff_cache
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+notebooks/
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+pyvenv.cfg
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# Jetbrains
+.idea
+modules/
+*.swp
+
+# VsCode
+.vscode
+
+# pipenv
+Pipfile
+Pipfile.lock
+
+# pyright
+pyrightconfig.json
+
+# Cognee
+.data
+.env
+.local.env
+.prod.env
+cognee/.data/
+*.lance/
+.cognee_system/
+.data_storage/
+.anon_id
+SWE-bench_testsample/
+
+full_run.ipynb
+
+# Environments
+.env
+.env.local
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/BUILD b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/BUILD
@@ -0,0 +1,3 @@
+poetry_requirements(
+    name="poetry",
+)
diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/Makefile b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/Makefile
@@ -0,0 +1,17 @@
+GIT_ROOT ?= $(shell git rev-parse --show-toplevel)
+
+help:	## Show all Makefile targets.
+	@grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}'
+
+format:	## Run code autoformatters (black).
+	pre-commit install
+	git ls-files | xargs pre-commit run black --files
+
+lint:	## Run linters: pre-commit (black, ruff, codespell) and mypy
+	pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files
+
+test:	## Run tests via pytest.
+	pytest tests
+
+watch-docs:	## Build and watch documentation.
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/README.md b/llama-index-integrations/graph_rag/llama-index-graph-rag-cognee/README.md
@@ -0,0 +1,65 @@
+# LlamaIndex Graph RAG Integration: Cognee
+
+Cognee assists developers in introducing greater predictability and management into their Retrieval-Augmented Generation (RAG) workflows through the use of graph architectures, vector stores, and auto-optimizing pipelines. Displaying information as a graph is the clearest way to grasp the content of your documents. Crucially, graphs allow systematic navigation and extraction of data from documents based on their hierarchy.
+
+For more information, visit [Cognee documentation](https://docs.cognee.ai/)
+
+## Installation
+
+```shell
+pip install llama-index-graph-rag-cognee
+```
+
+## Usage
+
+```python
+import os
+import pandas as pd
+import asyncio
+
+from llama_index.core import Document
+from llama_index.graph_rag.cognee import CogneeGraphRAG
+
+
+async def example_graph_rag_cognee():
+    # Gather documents to add to GraphRAG
+    news = pd.read_csv(
+        "https://raw.githubusercontent.com/tomasonjo/blog-datasets/main/news_articles.csv"
+    )[:5]
+    news.head()
+    documents = [
+        Document(text=f"{row['title']}: {row['text']}")
+        for i, row in news.iterrows()
+    ]
+
+    # Instantiate cognee GraphRAG
+    cogneeRAG = CogneeGraphRAG(
+        llm_api_key=os.environ["OPENAI_API_KEY"],
+        graph_db_provider="networkx",
+        vector_db_provider="lancedb",
+        relational_db_provider="sqlite",
+        db_name="cognee_db",
+    )
+
+    # Add data to cognee
+    await cogneeRAG.add(documents, "test")
+
+    # Process data into a knowledge graph
+    await cogneeRAG.process_data("test")
+
+    # Answer prompt based on knowledge graph
+    search_results = await cogneeRAG.search("person")
+    print("\n\nExtracted sentences are:\n")
+    for result in search_results:
+        print(f"{result}\n")
+
+    # Search for related nodes
+    search_results = await cogneeRAG.get_related_nodes("person")
+    print("\n\nRelated nodes are:\n")
+    for result in search_results:
+        print(f"{result}\n")
+
+
+if __name__ == "__main__":
+    asyncio.run(example_graph_rag_cognee())
+```
diff --git a/...ex-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/BUILD b/...ex-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/...egrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/__init__.py b/...egrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/__init__.py
@@ -0,0 +1,3 @@
+from llama_index.graph_rag.cognee.graph_rag import CogneeGraphRAG
+
+__all__ = ["CogneeGraphRAG"]
diff --git a/...-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/base.py b/...-integrations/graph_rag/llama-index-graph-rag-cognee/llama_index/graph_rag/cognee/base.py
@@ -0,0 +1,54 @@
+from abc import abstractmethod
+from typing import Protocol
+
+
+# NOTE: This is a bare-bones suggestion for an abstract protocol to define GraphRAG for llama-index.
+# It should be expanded upon and integrated into llama-index-core to support multiple different GraphRAG
+# libraries in the future.
+class GraphRAG(Protocol):
+    """Abstract graph RAG protocol.
+
+    This protocol defines the interface for a GraphRAG, which is responsible
+    for adding, storing, processing and retrieving information from knowledge graphs.
+
+    All four methods are declared with ``async def`` and marked
+    ``@abstractmethod``; none of them declares a return type.
+
+    NOTE(review): the attributes listed below are described here only. The
+    protocol body does not declare them as members, so confirm whether
+    implementations are actually expected to expose them.
+
+    Attributes:
+        llm_api_key: str: API key for the desired LLM.
+        graph_db_provider: str: The graph database provider.
+        vector_db_provider: str: The vector database provider.
+        relational_db_provider: str: The relational database provider.
+        db_name: str: The name of the databases.
+    """
+
+    @abstractmethod
+    async def add(self, data, dataset_name):
+        """Add data to the specified dataset.
+
+        This data will later be processed and made into a knowledge graph.
+
+        Args:
+            data (Any): The data to be added to the graph.
+            dataset_name (str): Name of the dataset or node set where the data will be added.
+        """
+
+    @abstractmethod
+    async def process_data(self, dataset_name: str):
+        """Process and structure the data in a dataset and build a knowledge graph from it.
+
+        Args:
+            dataset_name (str): The name of the dataset to process.
+        """
+
+    @abstractmethod
+    async def search(self, query: str):
+        """Search the graph for relevant information based on a query.
+
+        Args:
+            query (str): The query string to match against data from the graph.
+        """
+
+    @abstractmethod
+    async def get_related_nodes(self, node_id: str):
+        """Search the graph for relevant nodes or relationships based on node id.
+
+        NOTE(review): the parameter is called ``node_id`` but is described as
+        the node's name; confirm which identifier implementations expect.
+
+        Args:
+            node_id (str): The name of the node to match against nodes in the graph.
+        """
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,3 @@
		from llama_index.graph_rag.cognee.graph_rag import CogneeGraphRAG

		__all__ = ["CogneeGraphRAG"]