diff --git a/autogen/agentchat/contrib/reasoning_agent.py b/autogen/agentchat/contrib/reasoning_agent.py index c1e3391b89..c8c1c3db93 100644 --- a/autogen/agentchat/contrib/reasoning_agent.py +++ b/autogen/agentchat/contrib/reasoning_agent.py @@ -5,7 +5,12 @@ from typing import Any, Callable, Dict, List, Literal, Optional, Tuple, Union from ..agent import Agent -from ..assistant_agent import AssistantAgent +from ..assistant_agent import AssistantAgent +import random +import math + +EPSILON = 1e-6 + TreeofThought_message = """ Role: Expert Planning AI Assistant @@ -63,11 +68,11 @@ def __init__(self, content: str, parent: Optional["ThinkNode"] = None) -> None: - Providing trajectory utilities to get the full path from root to this node """ self.content = content - self.value = None + self.value = 0 self.parent = parent self.depth = self.parent.depth + 1 if parent else 0 self.children = [] - self.visits = 0 # TODO: remove this line if not used. + self.visits = 0 if self.parent: self.parent.children.append(self) @@ -175,9 +180,101 @@ def add_nodes(node: ThinkNode, node_id: str = "0"): print("Make sure graphviz is installed on your system: https://graphviz.org/download/") + +def extract_sft_dataset(root): + """ + Extract the best trajectory or multiple equally good trajectories + for SFT training. + + Args: + root: The root node of the tree. + + Returns: + List of best trajectories, where each trajectory is a pair of instruction and response. + """ + instruction = root.content + idx = len("# Question: ") + len(root.content) + 1 + + def _find_leaf_nodes(node): + """Recursively find all leaf nodes.""" + if not node.children: + return [node] + leafs = [] + for child in node.children: + leafs.extend(_find_leaf_nodes(child)) + return leafs + + # Step 1: Find all leaf nodes + leaf_nodes = _find_leaf_nodes(root) + + # Step 2: Determine the highest score among leaf nodes + max_value = max(leaf_nodes, key=lambda x: x.value).value + + # Step 3: Collect all leaf nodes with the highest score + best_leafs = [leaf for leaf in leaf_nodes if leaf.value == max_value] + + # Step 4: Collect trajectories for all the best leaf nodes + best_trajectories = [{"instruction": instruction, "response": leaf.trajectory[idx:]} for leaf in best_leafs] + + return best_trajectories + + +def extract_rlhf_preference_dataset(root, contrastive_threshold=0.2): + """ + Extract and generate preference pairs for RLHF training by comparing sibling nodes. + + Args: + root: The root node of the tree. + contrastive_threshold (float): between (0, 1), a distance measure that we are confidence to call + one is positive and another is negative. + + Returns: + A list of preference pairs, where each pair contains two responses and + indicates which one is preferred. 
+ """ + preference_pairs = [] + + assert contrastive_threshold > 0 + assert contrastive_threshold < 1 + + def traverse_tree(node): + """Traverse the tree to compare sibling nodes and collect preferences.""" + if not node.children: + return # Leaf node, no comparisons needed + + # Step 1: Compare all sibling nodes + for i in range(len(node.children)): + for j in range(len(node.children)): + if i == j: + continue + child_a, child_b = node.children[i], node.children[j] + + is_a_better = False + if child_a.visits > 0 and child_b.visits > 0: + # for MCTS + is_a_better = child_a.value / child_a.visits - child_b.value / child_b.visits > contrastive_threshold + else: + # for Beam Search + is_a_better = child_a.value - child_b.value > contrastive_threshold + if is_a_better: + preference_pairs.append({ + "instruction": node.trajectory, + "preferred_response": f"Step {child_a.depth}: {child_a.content}", + "dispreferred_response": f"Step {child_b.depth}: {child_b.content}", + }) + + # Step 2: Recurse into child nodes + for child in node.children: + traverse_tree(child) + + # Start traversal from the root + traverse_tree(root) + + return preference_pairs + class ReasoningAgent(AssistantAgent): def __init__( - self, name, llm_config, max_depth=4, beam_size=3, answer_approach="pool", verbose=True, **kwargs + self, name, llm_config, max_depth=4, beam_size=3, answer_approach="pool", verbose=True, reason_config: dict=None, **kwargs ) -> None: """Initialize a ReasoningAgent that uses tree-of-thought reasoning., @@ -185,8 +282,8 @@ def __init__( name: Name of the agent llm_config: Configuration for the language model max_depth (int): Maximum depth of the reasoning tree - beam_size (int): Number of parallel reasoning paths to maintain - answer_approach (str): Either "pool" or "best" - how to generate final answer + beam_size (int): DEPRECATED. Number of parallel reasoning paths to maintain + answer_approach (str): DEPRECATED. 
Either "pool" or "best" - how to generate final answer verbose (bool): Whether to show intermediate steps """ super().__init__(name=name, llm_config=llm_config, **kwargs) @@ -202,7 +299,19 @@ def __init__( system_message="Rate the thinking trajectories for score 1 - 5 (1: worst, 5: best).", llm_config=llm_config, ) - self.register_reply([Agent, None], ReasoningAgent.generate_response) + + if reason_config: + method = reason_config.get("method", "beam_search") + if method == "beam_search": + self.register_reply([Agent, None], ReasoningAgent.generate_beam_response) + if "beam_size" in reason_config: + self.beam_size = reason_config["beam_size"] + if "answer_approach" in reason_config: + self.answer_approach = reason_config["answer_approach"] + elif method == "mcts": + self.register_reply([Agent, None], ReasoningAgent.generate_mcts_response) + self.mcts_simulations = reason_config.get("nsim", 10) + self.exploration_constant = reason_config.get("exploration_constant", 1.41) self._root = None @@ -216,7 +325,8 @@ def rate_node(self, node: ThinkNode) -> float: float: Normalized score between 0 and 1 indicating trajectory quality """ self.send( - message=f"Rate the trajectory:\n{node.trajectory}", recipient=self.grader, request_reply=True, silent=False + message=f"Rate:\n{node.trajectory}", recipient=self.grader, request_reply=True, + silent=not self.verbose, ) rating = self.grader.last_message()["content"].strip() try: @@ -226,7 +336,7 @@ def rate_node(self, node: ThinkNode) -> float: reward = 0.0 # Default reward if parsing fails return reward - def generate_response(self, messages, sender, config=None): + def generate_beam_response(self, messages, sender, config=None): """Generate a response using tree-of-thought reasoning. Implements beam search through a tree of reasoning steps, using the thinker @@ -257,29 +367,14 @@ def generate_response(self, messages, sender, config=None): while prev_leafs and len(final_answers) < self.beam_size: new_leafs = [] for node in prev_leafs: - if (self.max_depth and node.depth >= self.max_depth) or "TERMINATE" in node.content: + if self.is_terminal(node): # Reached max depth; collect possible answers if node.value is None: node.value = self.rate_node(node) final_answers.add(node) continue - self.thinker.clear_history() - self.send( - message=f"{node.trajectory}\n---\nWhat are the possible next steps?", - recipient=self.thinker, - request_reply=True, - silent=False, - ) - reply = self.thinker.last_message()["content"].strip() - - options = re.findall( - r"Option \d+:(.+?)(?=Option \d+:|$)", reply, re.DOTALL - ) # the options that the thinker provides - for option in options: - new_leafs.append( - ThinkNode(content=option.strip().rstrip(), parent=node) - ) # each option is a new leaf node + new_leafs += self.expand(node) prev_leafs = new_leafs @@ -321,3 +416,113 @@ def generate_response(self, messages, sender, config=None): final_answer = self.chat_messages[self][-1]["content"].strip() return True, final_answer + + def generate_mcts_response(self, messages, sender, config=None): + if sender == self: + return False, "" # Defer the LLM call to next reply functions. + + messages = self._oai_messages[sender] if messages is None else messages + prompt = messages[-1]["content"].strip() + if not prompt: + return True, "TERMINATE" + + # Extract the ground truth for more accurate evaluation. + # TODO: in the future, allow user to pass a callable (func) to calculate reward. 
+ if "GROUND_TRUTH" in prompt: + idx = prompt.find("GROUND_TRUTH") + prompt, ground_truth = prompt[:idx].rstrip(), prompt[idx:] + else: + ground_truth = None + + root = ThinkNode(content=prompt, parent=None) + self._root = root + answer_nodes = [] + + # TODO: future, parallelism with Swarm agent or AsyncOpenAI client. + for _ in range(self.mcts_simulations): + node = root + + # Selection + while not self.is_terminal(node) and len(node.children) > 0: + choices_weights = [ + # exploitation term + + (child.value / (child.visits + EPSILON)) + + # exploration term + self.exploration_constant * math.sqrt((2 * math.log(node.visits + EPSILON) / (child.visits + EPSILON))) + for child in node.children + ] + node = node.children[choices_weights.index(max(choices_weights))] + + # Expansion and Simulation + while not self.is_terminal(node): + if len(node.children) == 0: + self.expand(node) + node = random.choice(node.children) + + # Add answer (leaf) node and evaluate answer + self.send( + message=f"Answer the question {prompt}. Here is my thinking process:\n{node.trajectory}", + recipient=self, + request_reply=True, + silent=not self.verbose) + _answer = self.last_message(self)["content"].strip() + # We add the answer (as a node) to the leaf to help + # future logging and debugging. + _ans_node = ThinkNode(content=_answer, parent=node) + if ground_truth: + # override the system message + self.grader.update_system_message(f"Rate the answer for score 1 - 5 (1: worst, 5: best). The Ground Truth is:\n{ground_truth}") + + reward = self.rate_node(_ans_node) + _ans_node.value = reward + answer_nodes.append(_ans_node) + + # Backpropagation + while node is not None: + node.visits += 1 + if node.value is None: + node.value = reward + else: + node.value += reward + node = node.parent + + # Best action + best_ans_node = max(answer_nodes, key=lambda node: node.value) + return True, best_ans_node.content + + + def expand(self, node: ThinkNode) -> List: + """ + Expand the node by generating possible next steps based on the current trajectory. + + This method sends a message to the thinker agent, asking for possible next steps + that can be taken from the current node's trajectory. It processes the response to + extract the options provided by the thinker and creates new ThinkNode instances + for each option. + + Args: + node (ThinkNode): The node to expand, representing the current state in the reasoning process. + + Returns: + List[ThinkNode]: A list of new ThinkNode instances created from the options provided by the thinker. + """ + self.thinker.clear_history() + self.send( + message=f"{node.trajectory}\n---\nWhat are the possible next steps?", + recipient=self.thinker, + request_reply=True, + silent=not self.verbose) + reply = self.thinker.last_message()["content"].strip() + + # Extract options from reply using regex: + # - Matches text between "Option N:" and either next "Option N:" or end of string + # - (?=...) is a lookahead to match option boundary without including it + # - re.DOTALL allows . 
to match newlines + options = re.findall(r"Option \d+:(.+?)(?=Option \d+:|$)", reply, re.DOTALL) + + return [ThinkNode(content=option.strip().rstrip(), parent=node) for option in options] + + + def is_terminal(self, node): + return node.depth >= self.max_depth or "TERMINATE" in node.content + diff --git a/notebook/tree_of_thoughts.png b/notebook/tree_of_thoughts.png index 57825cdfff..85ffa38c25 100644 --- a/notebook/tree_of_thoughts.png +++ b/notebook/tree_of_thoughts.png @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6049d94ab2659ee43a0e50b2086c5e43a4bab419834a8c80acb1bbb5c780a65 -size 300686 +oid sha256:45a644fa66a8052cf166e9c691e45f2aa60e3ca6e30c629265b9d6d68898279b +size 396536 diff --git a/website/blog/2024-12-02-ReasoningAgent2/index.mdx b/website/blog/2024-12-02-ReasoningAgent2/index.mdx index d194c3f365..a4f747032f 100644 --- a/website/blog/2024-12-02-ReasoningAgent2/index.mdx +++ b/website/blog/2024-12-02-ReasoningAgent2/index.mdx @@ -5,7 +5,6 @@ authors: - BabyCNM - skzhang1 - sonichi - - BabyCNM - qingyunwu tags: [LLM, GPT, research] --- @@ -83,6 +82,7 @@ from autogen import ( visualize_tree ) + # Configure the model config_list = [{"model": "gpt-4", "api_key": os.environ.get("OPENAI_API_KEY")}] @@ -186,6 +186,7 @@ After asking a question to the `ReasoningAgent`, you only need to simply call th ```python import json + data = reasoning_agent._root.to_dict() with open("reasoning_tree.json", "w") as f: json.dump(data, f) @@ -208,43 +209,7 @@ new_node = pickle.load(open("reasoning_tree.pkl", "rb")) This step finds the best trajectory in the thought tree and converts it to a SFT dataset as a sequence of strings. The best trajectory is determined by following the highest-scoring path from root to leaf. ```python -def extract_sft_dataset(root): - """ - Extract the best trajectory or multiple equally good trajectories - for SFT training. - - Args: - root: The root node of the tree. - - Returns: - List of best trajectories, where each trajectory is a pair of instruction and response. - """ - instruction = root.content - idx = len("# Question: ") + len(root.content) + 1 - - def find_leaf_nodes(node): - """Recursively find all leaf nodes.""" - if not node.children: - return [node] - leafs = [] - for child in node.children: - leafs.extend(find_leaf_nodes(child)) - return leafs - - # Step 1: Find all leaf nodes - leaf_nodes = find_leaf_nodes(root) - - # Step 2: Determine the highest score among leaf nodes - max_value = max(leaf_nodes, key=lambda x: x.value).value - - # Step 3: Collect all leaf nodes with the highest score - best_leafs = [leaf for leaf in leaf_nodes if leaf.value == max_value] - - # Step 4: Collect trajectories for all the best leaf nodes - best_trajectories = [{"instruction": instruction, "response": leaf.trajectory[idx:]} for leaf in best_leafs] - - return best_trajectories - +from autogen.agentchat.contrib.reasoning_agent import extract_sft_dataset # Example usage sft_data = extract_sft_dataset(reason_agent._root) @@ -255,52 +220,7 @@ json.dump(sft_data, open("sft_data.json", "w"), indent=2) This step generates preference pairs by comparing sibling nodes in the tree. For each parent node with multiple children, we create training pairs where the higher-scored response is marked as preferred over the lower-scored one. ```python -def extract_rlhf_preference_dataset(root, contrastive_threshold=0.2): - """ - Extract and generate preference pairs for RLHF training by comparing sibling nodes. - - Args: - root: The root node of the tree. 
- contrastive_threshold (float): between (0, 1), a distance measure that we are confidence to call - one is positive and another is negative. - - Returns: - A list of preference pairs, where each pair contains two responses and - indicates which one is preferred. - """ - preference_pairs = [] - - assert contrastive_threshold > 0 - assert contrastive_threshold < 1 - - def traverse_tree(node): - """Traverse the tree to compare sibling nodes and collect preferences.""" - if not node.children: - return # Leaf node, no comparisons needed - - # Step 1: Compare all sibling nodes - for i in range(len(node.children)): - for j in range(len(node.children)): - if i == j: - continue - child_a, child_b = node.children[i], node.children[j] - if child_a.value - child_b.value > contrastive_threshold: - preference_pairs.append({ - "instruction": node.trajectory, - "preferred_response": f"Step {child_a.depth}: {child_a.content}", - "dispreferred_response": f"Step {child_b.depth}: {child_b.content}", - }) - - - # Step 2: Recurse into child nodes - for child in node.children: - traverse_tree(child) - - # Start traversal from the root - traverse_tree(root) - - return preference_pairs - +from autogen.agentchat.contrib.reasoning_agent import extract_rlhf_preference_dataset # Example usage rlhf_data = extract_rlhf_preference_dataset(reason_agent._root) diff --git a/website/blog/2024-12-18-Reasoning-and-MCTS/index.mdx b/website/blog/2024-12-18-Reasoning-and-MCTS/index.mdx new file mode 100644 index 0000000000..819062fe8f --- /dev/null +++ b/website/blog/2024-12-18-Reasoning-and-MCTS/index.mdx @@ -0,0 +1,189 @@ +--- +title: MCTS vs Beam Search in Reasoning Agent to Help LLM Post-Training +authors: + - BabyCNM + - Hk669 + - sonichi + - qingyunwu +tags: [LLM, GPT, research] +--- + +![Tree of Thoughts](img/reasoningagent_1.png) + +**TL;DR:** +* We introduce Monte Carlo Tree Search (MCTS) as an alternative to Beam Search in ReasoningAgent +* MCTS is particularly effective when ground truth evaluation is available or when LLM-based evaluation is expensive +* We provide detailed complexity analysis and comparison between MCTS and Beam Search approaches +* The resulting search trees can be used to generate high-quality training datasets for LLM fine-tuning + +## Introduction + +In our [previous post](/blog/2024-12-02-ReasoningAgent2), we introduced ReasoningAgent with Beam Search for systematic reasoning. Today, we explore an alternative approach using Monte Carlo Tree Search (MCTS) that offers unique advantages in certain scenarios, particularly when: + +1. Ground truth evaluation is available (either from human feedback or labeled data) +2. LLM-based evaluation is expensive or unreliable +3. You want to generate high-quality training data for future LLM fine-tuning + +## MCTS vs Beam Search: Key Differences + +### Search Strategy +- **Beam Search**: Maintains a fixed number (beam size) of most promising paths at each step +- **MCTS**: Dynamically explores the search space, balancing exploitation of known good paths with exploration of new possibilities + +### Evaluation Timing +- **Beam Search**: Evaluates every node at every step +- **MCTS**: Only evaluates leaf nodes during simulation, making it more efficient when evaluation is expensive + +### Memory Usage +- **Beam Search**: Memory usage is bounded by beam size × depth +- **MCTS**: Memory grows with number of simulations but focuses on promising paths + +## Implementation Details + +The MCTS implementation in ReasoningAgent follows four key steps: + +1. 
**Selection**: Choose nodes to explore using UCT (Upper Confidence Bound for Trees)
+```python
+choices_weights = [
+    # exploitation term
+    (child.value / (child.visits + EPSILON)) +
+    # exploration term
+    self.exploration_constant * math.sqrt((2 * math.log(node.visits + EPSILON) / (child.visits + EPSILON)))
+    for child in node.children
+]
+```
+
+2. **Expansion**: Generate possible next steps using the thinker agent
+```python
+# Expansion happens through the expand() method
+new_nodes = self.expand(node)
+```
+
+3. **Simulation**: Run random simulations to leaf nodes
+```python
+while not self.is_terminal(node):
+    if len(node.children) == 0:
+        self.expand(node)
+    node = random.choice(node.children)
+```
+
+4. **Backpropagation**: Update node statistics based on simulation results
+```python
+while node is not None:
+    node.visits += 1
+    if node.value is None:
+        node.value = reward
+    else:
+        node.value += reward
+    node = node.parent
+```
+
+### Ground Truth Evaluation
+
+ReasoningAgent now supports ground truth evaluation by allowing users to include a "GROUND_TRUTH" marker in their prompts. This enables more accurate evaluation of reasoning paths:
+
+```python
+# Example usage with ground truth
+prompt = """What is the expected maximum dice value if you can roll a 6-sided die three times?
+
+GROUND_TRUTH:
+We define X as the highest outcome among the three rolls.
+The probability that X is at least m is 1 - \left(\frac{m-1}{6}\right)^3 for each m from 1 to 6.
+Summing these probabilities gives the expectation E(X) = \sum_{m=1}^{6} [1 - (\frac{m-1}{6})^3].
+Calculating this sum results in E(X) = 6 - \frac{225}{216} = \frac{119}{24}, which approximates to 4.9583.
+Therefore, the expected maximum value when rolling a six-sided die three times is \frac{119}{24} or approximately 4.9583."""
+
+# The agent will use the ground truth to provide more accurate evaluation scores
+ans = user_proxy.initiate_chat(mcts_agent, message=prompt)
+```
+
+When ground truth is provided:
+1. The agent automatically splits the prompt into the question and ground truth
+2. The grader's system message is updated to include the ground truth
+3. Evaluation scores become more reliable since they're based on actual correct answers
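+
+The `mcts_agent` used above is an ordinary `ReasoningAgent` constructed with the new `reason_config` argument. A minimal sketch (the agent names here are illustrative, and `config_list` is assumed to be the one defined in the earlier ReasoningAgent post):
+
+```python
+from autogen import UserProxyAgent
+from autogen.agentchat.contrib.reasoning_agent import ReasoningAgent
+
+# MCTS-based reasoning: 10 simulations, UCT exploration constant of about 1.41
+mcts_agent = ReasoningAgent(
+    name="mcts_agent",
+    llm_config={"config_list": config_list},
+    reason_config={"method": "mcts", "nsim": 10, "exploration_constant": 1.41},
+)
+
+# Beam-search reasoning (the original behavior), for comparison
+beam_agent = ReasoningAgent(
+    name="beam_agent",
+    llm_config={"config_list": config_list},
+    reason_config={"method": "beam_search", "beam_size": 3, "answer_approach": "pool"},
+)
+
+user_proxy = UserProxyAgent(
+    name="user_proxy",
+    human_input_mode="NEVER",
+    code_execution_config=False,
+    max_consecutive_auto_reply=10,
+)
+```
+
+Passing `reason_config={"method": "beam_search", ...}` reproduces the beam-search behavior described in the previous post, so switching between the two strategies is a one-line change.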
+
+Ground truth evaluation is particularly useful for:
+- Training data generation with verified correct answers
+- Educational applications where correct solutions are known
+- Fine-tuning reward models with ground truth supervision
+
+
+## Generating Training Data
+
+Both MCTS and Beam Search can generate valuable training data, but in different ways:
+
+### From MCTS:
+```python
+from autogen.agentchat.contrib.reasoning_agent import extract_sft_dataset, extract_rlhf_preference_dataset
+
+# Get SFT data from successful paths
+sft_data = extract_sft_dataset(reason_agent._root)
+
+# Get preference pairs for RLHF
+rlhf_data = extract_rlhf_preference_dataset(reason_agent._root)
+```
+
+The MCTS approach tends to generate:
+- More diverse reasoning paths
+- Better exploration of alternative solutions
+- Stronger contrast between good and bad paths (useful for RLHF)
+
+## Complexity Analysis
+
+Let's analyze the computational complexity of both approaches, where:
+
+- $d$: maximum depth of the search tree
+- $w$: average branching factor (options per node)
+- $n$: number of Monte Carlo simulations
+- $b$: beam size
+
+### MCTS
+- **Time Complexity**: $O(n \times d)$
+  - Each simulation traverses at most depth $d$
+  - Performs $n$ simulations
+- **Memory Complexity**: $O(n \times d \times w)$ worst case (each simulation expands at most $d$ nodes, each adding up to $w$ children), but typically much lower in practice
+  - The tree grows only along visited paths
+  - Focuses on promising branches
+
+### Beam Search
+- **Time Complexity**: $O(d \times b \times (w + 1))$
+  - At each of the $d$ levels, each of the $b$ beams makes one generation call that proposes up to $w$ options
+  - Each of those options is then rated, giving $b \times (w + 1)$ LLM calls per level
+- **Memory Complexity**: $O(b \times d)$
+  - Maintains $b$ paths
+  - Each path has depth at most $d$
+
+A rough numerical comparison of these call counts is sketched in the appendix at the end of this post.
+
+## When to Use Each Approach
+
+### Use MCTS when:
+1. You have reliable ground truth evaluation
+2. LLM-based evaluation is expensive
+3. You want to generate training data with diverse, high-quality reasoning paths
+4. Exploration of the solution space is important
+
+### Use Beam Search when:
+1. Exploration is less important, because the quality of earlier steps is indicative of the quality of later steps
+2. LLM-based evaluation is cheap and reliable
+3. The problem space is well-structured
+4. Memory constraints are strict
+
+
+## Conclusion
+
+While both MCTS and Beam Search are valuable approaches for ReasoningAgent, they serve different purposes:
+
+- MCTS excels at thorough exploration and generating training data
+- Beam Search is more efficient for quick, direct problem-solving
+
+The choice between them should be based on your specific needs regarding:
+- Evaluation cost and availability
+- Time and resource constraints
+- Intended use of the results
+
+## For Further Reading
+
+* [Original ReasoningAgent with Beam Search](/blog/2024-12-02-ReasoningAgent2)
+* [Documentation about ReasoningAgent](/docs/reference/agentchat/contrib/reasoning_agent)
+* [MCTS in Wikipedia](https://en.wikipedia.org/wiki/Monte_Carlo_tree_search)
+
+*Join our [Discord](https://discord.com/invite/pAbnFJrkgZ) server to discuss your experiences with these approaches and suggest improvements.*
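+
+## Appendix: A Rough Count of LLM Calls
+
+The complexity formulas above translate directly into LLM-call budgets. The sketch below plugs illustrative values of $d$, $w$, $n$, and $b$ (assumptions chosen for the example, not measurements) into the $O(n \times d)$ and $O(d \times b \times (w + 1))$ estimates:
+
+```python
+# Illustrative parameters (assumptions, not benchmarks)
+d = 4   # maximum tree depth
+w = 3   # options proposed per expansion
+n = 10  # MCTS simulations
+b = 3   # beam size
+
+# MCTS: each simulation walks down at most d levels (roughly one thinker call per
+# newly expanded level), then makes one answer call and one grading call at the leaf.
+mcts_calls = n * (d + 2)
+
+# Beam search: at every level, each beam makes one thinker call that proposes w
+# options, and each option is rated once, i.e. b * (w + 1) calls per level.
+beam_calls = d * b * (w + 1)
+
+print(f"MCTS ~ {mcts_calls} LLM calls")         # ~60 with the values above
+print(f"Beam search ~ {beam_calls} LLM calls")  # 48 with the values above
+```
+
+The crossover point depends mostly on how many simulations MCTS is allowed and how expensive the grader is: MCTS rates only leaf answers, while beam search rates every candidate step.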