From 739e4af9e84bd4a06ed753d39843ac3757f20f41 Mon Sep 17 00:00:00 2001 From: Tvrtko Sternak Date: Fri, 20 Dec 2024 12:12:18 +0100 Subject: [PATCH] Polish realtime agent blogpost, Add blogpost and api docs to website --- .../img/realtime_agent_swarm.png | 4 +- .../img/twilio_endpoint_config.png | 3 + .../blog/2024-12-18-RealtimeAgent/index.mdx | 80 ++++++++++++------- website/mint.json | 15 +++- 4 files changed, 71 insertions(+), 31 deletions(-) create mode 100644 website/blog/2024-12-18-RealtimeAgent/img/twilio_endpoint_config.png diff --git a/website/blog/2024-12-18-RealtimeAgent/img/realtime_agent_swarm.png b/website/blog/2024-12-18-RealtimeAgent/img/realtime_agent_swarm.png index 1b8d80aa3b..36bdebfba1 100644 --- a/website/blog/2024-12-18-RealtimeAgent/img/realtime_agent_swarm.png +++ b/website/blog/2024-12-18-RealtimeAgent/img/realtime_agent_swarm.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30bf2c8a7d346604dabf85036328b28657e9472d55087b65869497ceb9142f98 -size 300823 +oid sha256:2940cb19d60b03e70b7e12b3032b94f726cb6b7566f3654bd3b11d7897bd2d28 +size 714662 diff --git a/website/blog/2024-12-18-RealtimeAgent/img/twilio_endpoint_config.png b/website/blog/2024-12-18-RealtimeAgent/img/twilio_endpoint_config.png new file mode 100644 index 0000000000..aae4fae92b --- /dev/null +++ b/website/blog/2024-12-18-RealtimeAgent/img/twilio_endpoint_config.png @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:884c225b1e51966e926de1544d478934c69eb75528dcff5b1c65f64d17e31771 +size 122412 diff --git a/website/blog/2024-12-18-RealtimeAgent/index.mdx b/website/blog/2024-12-18-RealtimeAgent/index.mdx index d41f357bd9..37dd11259b 100644 --- a/website/blog/2024-12-18-RealtimeAgent/index.mdx +++ b/website/blog/2024-12-18-RealtimeAgent/index.mdx @@ -9,16 +9,17 @@ tags: [Realtime API, Voice Agents, Swarm Teams, Twilio, AI Tools] --- -![Realtime Agent Swarm](img/realtime_agent_swarm.png) + + **TL;DR:** -- **RealtimeAgent** is coming in the 
AG2 0.6 release, enabling real-time conversational AI. +- [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) is coming in the AG2 0.6 release, enabling real-time conversational AI. - Features include real-time voice interactions, seamless task delegation to Swarm teams, and Twilio-based telephony integration. -- Learn how to integrate Twilio and RealtimeAgent into your swarm in this blogpost. +- Learn how to integrate [Twilio](https://www.twilio.com/) and [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) into your swarm in this blogpost. ### **Realtime API Support: What's New?** -We're thrilled to announce the release of **RealtimeAgent**, extending AG2's capabilities to support **real-time conversational AI tasks**. This new experimental feature makes it possible for developers to build agents capable of handling voice-based interactions with minimal latency, integrating OpenAI’s Realtime API, Twilio for telephony, and AG2’s Swarm orchestration. +We're thrilled to announce the release of [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent), extending AG2's capabilities to support **real-time conversational AI tasks**. This new experimental feature makes it possible for developers to build agents capable of handling voice-based interactions with minimal latency, integrating [OpenAI’s Realtime API](https://openai.com/index/introducing-the-realtime-api/), [Twilio](https://www.twilio.com/) for telephony, and [AG2’s Swarm orchestration](/docs/topics/swarm). ### **Why Realtime API Support Matters** @@ -36,21 +37,21 @@ Traditionally, conversational AI tasks have focused on asynchronous interactions ### **Key Features of RealtimeAgent** -#### **1. RealtimeAgent** -- Acts as the central interface for handling real-time interactions. -- Bridges voice input/output with AG2’s task-handling capabilities. +1. 
[**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) + - Acts as the central interface for handling real-time interactions. + - Bridges voice input/output with AG2’s task-handling capabilities. -#### **2. RealtimeAgent swarm integration** -- Seamless integration of RealtimeAgent into Swarm +2. RealtimeAgent swarm integration + - Seamless integration of [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) into Swarm -#### **3. TwilioAdapter** -- Connects agents to Twilio for telephony support. -- Simplifies the process of handling voice calls with clear API methods. +3. [**`TwilioAudioAdapter`**](/docs/reference/agentchat/realtime_agent/twilio_observer#twilioaudioadapter) + - Connects agents to Twilio for telephony support. + - Simplifies the process of handling voice calls with clear API methods. ### **Real-World Applications** -Here’s how RealtimeAgent transforms use cases: +Here’s how [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) transforms use cases: - **Customer Support**: Enable agents to answer customer queries in real time while delegating complex tasks to a Swarm team. - **Healthcare**: Facilitate real-time interactions between patients and medical AI assistants with immediate task handoffs. @@ -59,16 +60,37 @@ Here’s how RealtimeAgent transforms use cases: ### **Tutorial: Integrating RealtimeAgent with Swarm Teams** -In this tutorial, we’ll demonstrate how to implement OpenAI's [Airline Customer Service Example](https://github.com/openai/swarm/tree/main/examples/airline) using AG2's new **RealtimeAgent**. By leveraging real-time capabilities, you can create a seamless voice-driven experience for customer service tasks, enhanced with Swarm's task orchestration for efficient problem-solving. 
+In this tutorial, we’ll demonstrate how to implement OpenAI's [Airline Customer Service Example](https://github.com/openai/swarm/tree/main/examples/airline) using AG2's new [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent). By leveraging real-time capabilities, you can create a seamless voice-driven experience for customer service tasks, enhanced with Swarm's task orchestration for efficient problem-solving. This guide will walk you through the setup, implementation, and connection process, ensuring you’re ready to build real-time conversational agents with ease. +#### General overview + +To illustrate the system, the diagram below provides a high-level view of how the Realtime Agent collaborates with a swarm of specialized agents to handle customer service tasks. + +![Realtime Agent Swarm](img/realtime_agent_swarm.png) + +1. **Twilio Integration:** + The user begins the interaction by placing a voice call via Twilio. Twilio handles the audio transmission and forwards the conversation to the Realtime Agent, enabling real-time processing of the user’s request. + +2. **Realtime Agent Coordination:** + At the heart of the system, the Realtime Agent leverages OpenAI’s Realtime API to understand and manage the user’s intent dynamically. It facilitates the conversation and ensures the user’s query is routed correctly within the swarm. + +3. **Specialized Agents in the Swarm:** + - The **Triage Agent** acts as the gateway, identifying the nature of the request—such as a flight change or cancellation—and handing it off to the relevant specialized agent. + - For example, a **Flight Change Agent** processes requests to modify a booking, while a **Flight Cancellation Agent** handles cancellations and refunds. + +4. **Interactive Feedback Loop:** + Throughout the interaction, agents communicate with the user through the Realtime Agent, asking clarifying questions or providing status updates. 
Each agent follows strict policies to ensure consistent and reliable service. + +5. **Task Completion:** + Once the task is complete—such as changing a flight or issuing a refund—the system confirms the resolution with the user and ensures there are no additional queries before closing the case. #### **Prerequisites** Before we begin, ensure you have the following set up: -1. **Ngrok**: Exposing your local service to the web for Twilio integration. -2. **Twilio**: Setting up Twilio for telephony connectivity. +1. [**Ngrok**](https://ngrok.com/): Exposing your local service to the web for Twilio integration. +2. [**Twilio**](https://www.twilio.com/): Setting up Twilio for telephony connectivity. #### Ngrok setup @@ -101,7 +123,7 @@ With your public URL set up, you’re ready to configure Twilio to send requests #### **Twilio Setup** -To connect Twilio with your RealtimeAgent, follow these steps: +To connect Twilio with your [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent), follow these steps: 1. **Create a Twilio Account** If you don’t already have a Twilio account, you can [sign up here](https://login.twilio.com/u/signup). Twilio offers trial accounts, which are perfect for testing purposes. @@ -115,16 +137,17 @@ To connect Twilio with your RealtimeAgent, follow these steps: - Append `/incoming-call` to the URL. For example: - If your ngrok URL is `https://abc123.ngrok.app`, the webhook URL should be: `https://abc123.ngrok.app/incoming-call` +![Twilio endpoint config](img/twilio_endpoint_config.png) 4. **Save Changes** Once the webhook URL is set, save your changes. -You’re now ready to test the integration between Twilio and your RealtimeAgent! +You’re now ready to test the integration between Twilio and your [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent)! 
### **Swarm Implementation for Airline Customer Service** -In this section, we’ll configure a Swarm to handle airline customer service tasks, such as flight changes and cancellations. This implementation builds upon the [original Swarm example notebook](/docs/notebooks/agentchat_swarm), which we’ve adapted to work seamlessly with the RealtimeAgent acting as a `UserProxyAgent`. +In this section, we’ll configure a Swarm to handle airline customer service tasks, such as flight changes and cancellations. This implementation builds upon the [original Swarm example notebook](/notebooks/agentchat_swarm), which we’ve adapted to work seamlessly with the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) acting as a [**`UserProxyAgent`**](/docs/reference/agentchat/user_proxy_agent). -You can explore and run the complete implementation of the RealtimeAgent demonstrated here by visiting [this notebook](/docs/notebooks/agentchat_realtime_swarm). +You can explore and run the complete implementation of the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) demonstrated here by visiting [this notebook](/notebooks/agentchat_realtime_swarm). For the sake of brevity, we’ll focus on the key sections of the Swarm setup in this blog post, highlighting the essential components. @@ -189,7 +212,7 @@ def initiate_refund() -> str: ### **Connecting the Swarm to the RealtimeAgent** -In this section, we will connect the Swarm to the **RealtimeAgent**, enabling real-time voice interaction and task delegation. To achieve this, we use FastAPI to create a lightweight server that acts as a bridge between Twilio and the RealtimeAgent. +In this section, we will connect the Swarm to the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent), enabling real-time voice interaction and task delegation. 
To achieve this, we use FastAPI to create a lightweight server that acts as a bridge between Twilio and the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent). The key functionalities of this implementation include: @@ -211,7 +234,7 @@ async def handle_incoming_call(request: Request): return HTMLResponse(content=str(response), media_type="application/xml") ``` 2. **WebSocket Media Stream** - A WebSocket endpoint, `/media-stream`, is established to manage real-time audio communication between Twilio and a realtime model inference client such as OpenAI's realtime API. This allows audio data to flow seamlessly, enabling the RealtimeAgent to process and respond to user queries. + A WebSocket endpoint, `/media-stream`, is established to manage real-time audio communication between Twilio and a realtime model inference client such as OpenAI's realtime API. This allows audio data to flow seamlessly, enabling the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) to process and respond to user queries. ```python @app.websocket("/media-stream") async def handle_media_stream(websocket: WebSocket): @@ -220,7 +243,7 @@ async def handle_media_stream(websocket: WebSocket): ... ``` 3. **Initializing the RealtimeAgent** - Inside the WebSocket handler, we instantiate the **RealtimeAgent** with the following components: + Inside the WebSocket handler, we instantiate the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) with the following components: - **Name**: The identifier for the agent (e.g., `Customer_service_Bot`). - **LLM Configuration**: The configuration for the underlying realtime model inference client that powers the agent. - **Audio Adapter**: A TwilioAudioAdapter is used to handle audio streaming between Twilio and the agent. @@ -234,7 +257,7 @@ async def handle_media_stream(websocket: WebSocket): ) ``` 4. 
**Registering the Swarm** - The RealtimeAgent is linked to a Swarm of agents responsible for different customer service tasks. + The [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) is linked to a Swarm of agents responsible for different customer service tasks. - `initial_agent`: The first agent to process incoming queries (e.g., a triage agent). - `agents`: A list of specialized agents for handling specific tasks like flight modifications, cancellations, or lost baggage. ```python @@ -244,7 +267,7 @@ async def handle_media_stream(websocket: WebSocket): ) ``` 5. **Running the RealtimeAgent** - The `run()` method is invoked to start the RealtimeAgent, enabling it to handle real-time voice interactions and delegate tasks to the registered Swarm agents. + The `run()` method is invoked to start the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent), enabling it to handle real-time voice interactions and delegate tasks to the registered Swarm agents. ```python await realtime_agent.run() ``` @@ -296,7 +319,7 @@ async def handle_media_stream(websocket: WebSocket): ### **Results: Running the Service** -With everything set up, it’s time to put your RealtimeAgent to the test! Follow these steps to make your first call and interact with the AI system: +With everything set up, it’s time to put your [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) to the test! Follow these steps to make your first call and interact with the AI system: 1. **Ensure Everything is Running** - Verify that your **ngrok** session is still active and providing a public URL. @@ -304,13 +327,14 @@ With everything set up, it’s time to put your RealtimeAgent to the test! Follo 2. **Place a Call** - Use a cell phone or landline to call your **Twilio number**. + - Alternatively, you can use [Twilio Dev Phone](https://www.twilio.com/docs/labs/dev-phone) to call your **Twilio number**. 3. 
**Watch the Magic Happen** - Start speaking! You’ll hear the AI system’s initial message and then be able to interact with it in real-time. #### **Realtime Agent and Swarm Workflow in Action** -The following images showcase the seamless interaction between the **RealtimeAgent** and the Swarm of agents as they work together to handle a live customer request. Here's how the process unfolds: +The following images showcase the seamless interaction between the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) and the Swarm of agents as they work together to handle a live customer request. Here's how the process unfolds: 1. **Service Initialization** Our service starts successfully, ready to handle incoming calls and process real-time interactions. @@ -342,7 +366,7 @@ This flow highlights the power of integrating real-time audio interaction with t ### **Caveats and Future Improvements** -While the RealtimeAgent and Swarm integration is a powerful tool, there are a few things to keep in mind as we continue to refine the system: +While the [**`RealtimeAgent`**](/docs/reference/agentchat/realtime_agent/realtime_agent) and Swarm integration is a powerful tool, there are a few things to keep in mind as we continue to refine the system: - **Work in Progress**: The agent is still evolving, and we’re actively polishing its details for a smoother experience in the coming weeks. - **Transcription Challenges**: Occasionally, the agent may mishear inputs, particularly when dictating complex information like flight numbers or letters. 
diff --git a/website/mint.json b/website/mint.json index 5ee80c8390..cdd050962a 100644 --- a/website/mint.json +++ b/website/mint.json @@ -300,6 +300,17 @@ "docs/reference/agentchat/contrib/web_surfer" ] }, + { + "group": "agentchat.realtime_agent", + "pages": [ + "docs/reference/agentchat/realtime_agent/client", + "docs/reference/agentchat/realtime_agent/function_observer", + "docs/reference/agentchat/realtime_agent/realtime_agent", + "docs/reference/agentchat/realtime_agent/realtime_observer", + "docs/reference/agentchat/realtime_agent/twilio_observer", + "docs/reference/agentchat/realtime_agent/websocket_observer" + ] + }, "docs/reference/agentchat/agent", "docs/reference/agentchat/assistant_agent", "docs/reference/agentchat/chat", @@ -450,6 +461,7 @@ { "group": "Recent posts", "pages": [ + "blog/2024-12-18-RealtimeAgent/index", "blog/2024-12-06-FalkorDB-Structured/index", "blog/2024-12-02-ReasoningAgent2/index", "blog/2024-11-27-Prompt-Leakage-Probing/index", @@ -571,7 +583,8 @@ "notebooks/JSON_mode_example", "notebooks/agentchat_RetrieveChat", "notebooks/agentchat_graph_rag_neo4j", - "notebooks/agentchat_swarm_enhanced" + "notebooks/agentchat_swarm_enhanced", + "notebooks/agentchat_realtime_swarm" ] }, "notebooks/Gallery"