Knowledge (#1567)

* initial knowledge * WIP * Adding core knowledge sources * Improve types and better support for file paths * added additional sources * fix linting * update yaml to include optional deps * adding in lorenze feedback * ensure embeddings are persisted * improvements all around Knowledge class * return this * properly reset memory * properly reset memory+knowledge * consolodation and improvements * linted * cleanup rm unused embedder * fix test * fix duplicate * generating cassettes for knowledge test * updated default embedder * None embedder to use default on pipeline cloning * improvements * fixed text_file_knowledge * mypysrc fixes * type check fixes * added extra cassette * just mocks * linted * mock knowledge query to not spin up db * linted * verbose run * put a flag * fix * adding docs * better docs * improvements from review * more docs * linted * rm print * more fixes * clearer docs * added docstrings and type hints for cli --------- Co-authored-by: João Moura <[email protected]> Co-authored-by: Lorenze Jay <[email protected]>
crewAIInc · Nov 20, 2024 · 14a36d3 · 14a36d3
1 parent fde1ee4
commit 14a36d3
Show file tree

Hide file tree

Showing 37 changed files with 2,305 additions and 269 deletions.
diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
@@ -26,7 +26,7 @@ jobs:
         run: uv python install 3.11.9
 
       - name: Install the project
-        run: uv sync --dev
+        run: uv sync --dev --all-extras
 
       - name: Run tests
-        run: uv run pytest tests
+        run: uv run pytest tests -vv
diff --git a/docs/concepts/knowledge.mdx b/docs/concepts/knowledge.mdx
@@ -0,0 +1,75 @@
+---
+title: Knowledge
+description: What is knowledge in CrewAI and how to use it.
+icon: book
+---
+
+# Using Knowledge in CrewAI
+
+## Introduction
+
+The Knowledge class in CrewAI provides a powerful way to manage and query knowledge sources for your AI agents. This guide will show you how to implement knowledge management in your CrewAI projects.
+Additionally, we have specific tools for generating knowledge sources from strings, text files, PDFs, and spreadsheets. You can expand on any source type by extending the `KnowledgeSource` class.
+
+## Basic Implementation
+
+Here's a simple example of how to use the Knowledge class:
+
+```python
+from crewai import Agent, Task, Crew, Process, LLM
+from crewai.knowledge.source.string_knowledge_source import StringKnowledgeSource
+
+# Create a knowledge source
+content = "Users name is John. He is 30 years old and lives in San Francisco."
+string_source = StringKnowledgeSource(
+    content=content, metadata={"preference": "personal"}
+)
+
+
+llm = LLM(model="gpt-4o-mini", temperature=0)
+# Create an agent; the knowledge sources are attached to the crew below
+agent = Agent(
+    role="About User",
+    goal="You know everything about the user.",
+    backstory="""You are a master at understanding people and their preferences.""",
+    verbose=True,
+    allow_delegation=False,
+    llm=llm,
+)
+task = Task(
+    description="Answer the following questions about the user: {question}",
+    expected_output="An answer to the question.",
+    agent=agent,
+)
+
+crew = Crew(
+    agents=[agent],
+    tasks=[task],
+    verbose=True,
+    process=Process.sequential,
+    knowledge={"sources": [string_source], "metadata": {"preference": "personal"}}, # Enable knowledge by adding the sources here. You can also add more sources to the sources list.
+)
+
+result = crew.kickoff(inputs={"question": "What city does John live in and how old is he?"})
+```
+
+
+## Embedder Configuration
+
+You can also configure the embedder for the knowledge store. This is useful if you want to use a different embedder for the knowledge store than the one used for the agents.
+
+```python
+...
+string_source = StringKnowledgeSource(
+    content="Users name is John. He is 30 years old and lives in San Francisco.",
+    metadata={"preference": "personal"}
+)
+crew = Crew(
+    ...
+    knowledge={
+        "sources": [string_source],
+        "metadata": {"preference": "personal"},
+        "embedder_config": {"provider": "openai", "config": {"model": "text-embedding-3-small"}},
+    },
+)
+```
diff --git a/pyproject.toml b/pyproject.toml
@@ -39,6 +39,16 @@ Repository = "https://github.com/crewAIInc/crewAI"
 [project.optional-dependencies]
 tools = ["crewai-tools>=0.14.0"]
 agentops = ["agentops>=0.3.0"]
+fastembed = ["fastembed>=0.4.1"]
+pdfplumber = [
+    "pdfplumber>=0.11.4",
+]
+pandas = [
+    "pandas>=2.2.3",
+]
+openpyxl = [
+    "openpyxl>=3.1.5",
+]
 mem0 = ["mem0ai>=0.1.29"]
 
 [tool.uv]

diff --git a/src/crewai/__init__.py b/src/crewai/__init__.py
@@ -1,7 +1,9 @@
 import warnings
+
 from crewai.agent import Agent
 from crewai.crew import Crew
 from crewai.flow.flow import Flow
+from crewai.knowledge.knowledge import Knowledge
 from crewai.llm import LLM
 from crewai.pipeline import Pipeline
 from crewai.process import Process
@@ -15,4 +17,14 @@
     module="pydantic.main",
 )
 __version__ = "0.80.0"
-__all__ = ["Agent", "Crew", "Process", "Task", "Pipeline", "Router", "LLM", "Flow"]
+__all__ = [
+    "Agent",
+    "Crew",
+    "Process",
+    "Task",
+    "Pipeline",
+    "Router",
+    "LLM",
+    "Flow",
+    "Knowledge",
+]
diff --git a/src/crewai/agent.py b/src/crewai/agent.py
@@ -11,8 +11,8 @@
 from crewai.cli.constants import ENV_VARS
 from crewai.llm import LLM
 from crewai.memory.contextual.contextual_memory import ContextualMemory
-from crewai.tools.agent_tools.agent_tools import AgentTools
 from crewai.tools import BaseTool
+from crewai.tools.agent_tools.agent_tools import AgentTools
 from crewai.utilities import Converter, Prompts
 from crewai.utilities.constants import TRAINED_AGENTS_DATA_FILE, TRAINING_DATA_FILE
 from crewai.utilities.token_counter_callback import TokenCalcHandler
@@ -52,6 +52,7 @@ class Agent(BaseAgent):
             role: The role of the agent.
             goal: The objective of the agent.
             backstory: The backstory of the agent.
+            knowledge: The knowledge base of the agent.
             config: Dict representation of agent configuration.
             llm: The language model that will run the agent.
             function_calling_llm: The language model that will handle the tool calling for this agent, it overrides the crew function_calling_llm.
@@ -272,6 +273,18 @@ def execute_task(
             if memory.strip() != "":
                 task_prompt += self.i18n.slice("memory").format(memory=memory)
 
+        # Integrate the knowledge base
+        if self.crew and self.crew.knowledge:
+            knowledge_snippets = self.crew.knowledge.query([task.prompt()])
+            valid_snippets = [
+                result["context"] 
+                for result in knowledge_snippets 
+                if result and result.get("context")
+            ]
+            if valid_snippets:
+                formatted_knowledge = "\n".join(valid_snippets)
+                task_prompt += f"\n\nAdditional Information:\n{formatted_knowledge}"
+
         tools = tools or self.tools or []
         self.create_agent_executor(tools=tools, task=task)
 

diff --git a/src/crewai/cli/cli.py b/src/crewai/cli/cli.py
@@ -136,24 +136,32 @@ def log_tasks_outputs() -> None:
 @click.option("-l", "--long", is_flag=True, help="Reset LONG TERM memory")
 @click.option("-s", "--short", is_flag=True, help="Reset SHORT TERM memory")
 @click.option("-e", "--entities", is_flag=True, help="Reset ENTITIES memory")
+@click.option("-kn", "--knowledge", is_flag=True, help="Reset KNOWLEDGE storage")
 @click.option(
     "-k",
     "--kickoff-outputs",
     is_flag=True,
     help="Reset LATEST KICKOFF TASK OUTPUTS",
 )
 @click.option("-a", "--all", is_flag=True, help="Reset ALL memories")
-def reset_memories(long, short, entities, kickoff_outputs, all):
+def reset_memories(
+    long: bool,
+    short: bool,
+    entities: bool,
+    knowledge: bool,
+    kickoff_outputs: bool,
+    all: bool,
+) -> None:
     """
     Reset the crew memories (long, short, entity, latest_crew_kickoff_ouputs). This will delete all the data saved.
     """
     try:
-        if not all and not (long or short or entities or kickoff_outputs):
+        if not all and not (long or short or entities or knowledge or kickoff_outputs):
             click.echo(
                 "Please specify at least one memory type to reset using the appropriate flags."
             )
             return
-        reset_memories_command(long, short, entities, kickoff_outputs, all)
+        reset_memories_command(long, short, entities, knowledge, kickoff_outputs, all)
     except Exception as e:
         click.echo(f"An error occurred while resetting memories: {e}", err=True)
 

diff --git a/src/crewai/cli/reset_memories_command.py b/src/crewai/cli/reset_memories_command.py
@@ -5,9 +5,17 @@
 from crewai.memory.long_term.long_term_memory import LongTermMemory
 from crewai.memory.short_term.short_term_memory import ShortTermMemory
 from crewai.utilities.task_output_storage_handler import TaskOutputStorageHandler
+from crewai.knowledge.storage.knowledge_storage import KnowledgeStorage
 
 
-def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
+def reset_memories_command(
+    long,
+    short,
+    entity,
+    knowledge,
+    kickoff_outputs,
+    all,
+) -> None:
     """
     Reset the crew memories.
 
@@ -17,6 +25,7 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
       entity (bool): Whether to reset the entity memory.
       kickoff_outputs (bool): Whether to reset the latest kickoff task outputs.
       all (bool): Whether to reset all memories.
+      knowledge (bool): Whether to reset the knowledge.
     """
 
     try:
@@ -25,6 +34,7 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
             EntityMemory().reset()
             LongTermMemory().reset()
             TaskOutputStorageHandler().reset()
+            KnowledgeStorage().reset()
             click.echo("All memories have been reset.")
         else:
             if long:
@@ -40,6 +50,9 @@ def reset_memories_command(long, short, entity, kickoff_outputs, all) -> None:
             if kickoff_outputs:
                 TaskOutputStorageHandler().reset()
                 click.echo("Latest Kickoff outputs stored has been reset.")
+            if knowledge:
+                KnowledgeStorage().reset()
+                click.echo("Knowledge has been reset.")
 
     except subprocess.CalledProcessError as e:
         click.echo(f"An error occurred while resetting the memories: {e}", err=True)

diff --git a/src/crewai/crew.py b/src/crewai/crew.py
@@ -27,6 +27,7 @@
 from crewai.memory.entity.entity_memory import EntityMemory
 from crewai.memory.long_term.long_term_memory import LongTermMemory
 from crewai.memory.short_term.short_term_memory import ShortTermMemory
+from crewai.knowledge.knowledge import Knowledge
 from crewai.memory.user.user_memory import UserMemory
 from crewai.process import Process
 from crewai.task import Task
@@ -201,6 +202,10 @@ class Crew(BaseModel):
         default=[],
         description="List of execution logs for tasks",
     )
+    knowledge: Optional[Dict[str, Any]] = Field(
+        default=None, description="Knowledge for the crew. Add knowledge sources to the knowledge object."
+    )
+
 
     @field_validator("id", mode="before")
     @classmethod
@@ -275,6 +280,15 @@ def create_crew_memory(self) -> "Crew":
                 self._user_memory = None
         return self
 
+    @model_validator(mode="after")
+    def create_crew_knowledge(self) -> "Crew":  # builds the Knowledge object from the `knowledge` config dict
+        if self.knowledge and isinstance(self.knowledge, dict):  # None, {} and non-dict values are left as-is
+            try:
+                self.knowledge = Knowledge(**self.knowledge)
+            except (TypeError, ValueError) as e:
+                raise ValueError(f"Invalid knowledge configuration: {e}") from e  # same message; cause chained explicitly
+        return self
+
     @model_validator(mode="after")
     def check_manager_llm(self):
         """Validates that the language model is set when using hierarchical process."""

diff --git a/src/crewai/knowledge/__init__.py b/src/crewai/knowledge/__init__.py
diff --git a/src/crewai/knowledge/embedder/__init__.py b/src/crewai/knowledge/embedder/__init__.py
diff --git a/src/crewai/knowledge/embedder/base_embedder.py b/src/crewai/knowledge/embedder/base_embedder.py
@@ -0,0 +1,55 @@
+from abc import ABC, abstractmethod
+from typing import List
+
+import numpy as np
+
+
+class BaseEmbedder(ABC):
+    """
+    Abstract interface for text embedding models; subclasses must implement every member below.
+    """
+
+    @abstractmethod
+    def embed_chunks(self, chunks: List[str]) -> np.ndarray:
+        """
+        Generate embeddings for a list of text chunks.
+
+        Args:
+            chunks: Text chunks to embed.
+
+        Returns:
+            NumPy array of embeddings (presumably one row per chunk; confirm against the concrete embedder).
+        """
+        pass
+
+    @abstractmethod
+    def embed_texts(self, texts: List[str]) -> np.ndarray:
+        """
+        Generate embeddings for a list of texts.
+
+        Args:
+            texts: Texts to embed.
+
+        Returns:
+            NumPy array of embeddings. NOTE(review): same signature as `embed_chunks`; the intended difference is not documented here - confirm.
+        """
+        pass
+
+    @abstractmethod
+    def embed_text(self, text: str) -> np.ndarray:
+        """
+        Generate the embedding for a single text.
+
+        Args:
+            text: Text to embed.
+
+        Returns:
+            NumPy array holding the embedding of `text`.
+        """
+        pass
+
+    @property
+    @abstractmethod
+    def dimension(self) -> int:
+        """Return the dimension of the embeddings as an int."""
+        pass