Skip to content

Commit

Permalink
add n responses support
Browse files Browse the repository at this point in the history
- Add script for RTC eval on arena hard auto
- Add ability to evaluate pass@n for AIME bench
- Return n samples from proxy when n is set
  • Loading branch information
codelion committed Nov 29, 2024
1 parent d5b468c commit 76008bd
Show file tree
Hide file tree
Showing 5 changed files with 501 additions and 96 deletions.
75 changes: 61 additions & 14 deletions optillm.py
Original file line number Diff line number Diff line change
Expand Up @@ -306,6 +306,52 @@ async def run_approach(approach):
responses, tokens = zip(*results)
return list(responses), sum(tokens)

def execute_n_times(n: int, approaches, operation: str, system_prompt: str, initial_query: str, client: Any, model: str) -> Tuple[Union[str, List[str]], int]:
    """
    Execute the pipeline n times and return the collected responses.

    Args:
        n (int): Number of times to run the pipeline
        approaches (list): List of approaches to execute
        operation (str): Operation type ('SINGLE', 'AND', or 'OR')
        system_prompt (str): System prompt
        initial_query (str): Initial query
        client: OpenAI client instance
        model (str): Model identifier

    Returns:
        Tuple[Union[str, List[str]], int]: The response(s) and the total token
        count summed over all runs. A bare response is returned only when
        n == 1 and that run yielded exactly one response; in every other case
        (including n <= 0, which yields an empty list) a flat list is returned.

    Raises:
        ValueError: If a run is attempted with an operation other than
            'SINGLE', 'AND' or 'OR'.
    """
    responses = []
    total_tokens = 0

    for _ in range(n):
        if operation == 'SINGLE':
            response, tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
        elif operation == 'AND':
            response, tokens = execute_combined_approaches(approaches, system_prompt, initial_query, client, model)
        elif operation == 'OR':
            # A fresh loop is created per run; close it even when the
            # coroutine raises so a failed request does not leak the loop.
            loop = asyncio.new_event_loop()
            asyncio.set_event_loop(loop)
            try:
                response, tokens = loop.run_until_complete(
                    execute_parallel_approaches(approaches, system_prompt, initial_query, client, model)
                )
            finally:
                loop.close()
        else:
            raise ValueError(f"Unknown operation: {operation}")

        # A run may hand back a list of responses (the OR operation does);
        # flatten those into the result instead of nesting lists.
        if isinstance(response, list):
            responses.extend(response)
        else:
            responses.append(response)
        total_tokens += tokens

    # Keep the single-response shape for n == 1 so existing callers that
    # expect a plain response keep working.
    if n == 1 and len(responses) == 1:
        return responses[0], total_tokens
    return responses, total_tokens

def generate_streaming_response(final_response, model):
# Yield the final response
if isinstance(final_response, list):
Expand Down Expand Up @@ -393,11 +439,12 @@ def proxy():
stream = data.get('stream', False)
messages = data.get('messages', [])
model = data.get('model', server_config['model'])
n = data.get('n', server_config['n']) # Get n value from request or config

optillm_approach = data.get('optillm_approach', server_config['approach'])
logger.debug(data)
server_config['mcts_depth'] = data.get('mcts_depth', server_config['mcts_depth'])
server_config['mcts_exploration' ] = data.get('mcts_exploration', server_config['mcts_exploration'])
server_config['mcts_exploration'] = data.get('mcts_exploration', server_config['mcts_exploration'])
server_config['mcts_simulations'] = data.get('mcts_simulations', server_config['mcts_simulations'])

system_prompt, initial_query, message_optillm_approach = parse_conversation(messages)
Expand Down Expand Up @@ -428,26 +475,26 @@ def proxy():
contains_none = any(approach == 'none' for approach in approaches)

if operation == 'SINGLE' and approaches[0] == 'none':
# For none approach, return the response directly
result, _ = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
# For none approach with n>1, make n separate calls
if n > 1:
responses = []
completion_tokens = 0
for _ in range(n):
result, tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
responses.append(result)
completion_tokens += tokens
result = responses
else:
result, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
logger.debug(f'Direct proxy response: {result}')
return jsonify(result), 200

elif operation == 'AND' or operation == 'OR':
if contains_none:
raise ValueError("'none' approach cannot be combined with other approaches")

# Handle non-none approaches
if operation == 'SINGLE':
response, completion_tokens = execute_single_approach(approaches[0], system_prompt, initial_query, client, model)
elif operation == 'AND':
response, completion_tokens = execute_combined_approaches(approaches, system_prompt, initial_query, client, model)
elif operation == 'OR':
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
response, completion_tokens = loop.run_until_complete(execute_parallel_approaches(approaches, system_prompt, initial_query, client, model))
else:
raise ValueError(f"Unknown operation: {operation}")
# Handle non-none approaches with n attempts
response, completion_tokens = execute_n_times(n, approaches, operation, system_prompt, initial_query, client, model)

except Exception as e:
logger.error(f"Error processing request: {str(e)}")
Expand Down
141 changes: 75 additions & 66 deletions scripts/eval_aime_benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,8 @@
import logging
import re
import time

from typing import List, Dict, Tuple, Optional
from datetime import datetime

from openai import OpenAI
from datasets import load_dataset
from tqdm import tqdm
Expand All @@ -17,7 +15,7 @@
logger = logging.getLogger(__name__)

# Initialize OpenAI client
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8000/v1")
client = OpenAI(api_key=os.environ.get("OPENAI_API_KEY"), base_url="http://localhost:8888/v1")

SYSTEM_PROMPT = '''You are solving AIME (American Invitational Mathematics Examination) problems.
Expand Down Expand Up @@ -48,50 +46,30 @@ def extract_answer(response: str) -> Optional[int]:
"""
Extract the numerical answer from a math solution response.
Handles various formats of boxed answers and falls back to last number if needed.
Args:
response (str): The complete response text from the model
Returns:
Optional[int]: The extracted answer as an integer, or None if no valid answer found
"""
if not response:
return None

# Clean the response: normalize whitespace and handle potential Unicode
# Clean the response
response = ' '.join(response.split())

# List of regex patterns to try, in order of preference
patterns = [
# $n=\boxed{X}$ format
r'\$n=\\boxed{(\d+)}\$',

# LaTeX display style answer: \[\boxed{X}\] or \[\boxed{X}.\]
r'\\\[\\boxed{(\d+)}\\\]',
r'\\\[\\boxed{(\d+)}\.\\\]',

# Inline LaTeX \boxed{X}
r'\\boxed{(\d+)}',

# Common variations
r'\$\\boxed{(\d+)}\$',
r'boxed{(\d+)}',

# Less strict patterns
r'\\boxed\s*{\s*(\d+)\s*}',
r'\bboxed\s*{\s*(\d+)\s*}',

# Plain text answer indicators
r'final answer is[^\d]*(\d+)',
r'answer is[^\d]*(\d+)',
r'answer:[^\d]*(\d+)',
r'= ?(\d+)$'
]

# Try each pattern in order
for pattern in patterns:
matches = re.finditer(pattern, response, re.IGNORECASE)
# Get the last match for this pattern (in case there are multiple)
last_match = None
for match in matches:
last_match = match
Expand All @@ -102,47 +80,70 @@ def extract_answer(response: str) -> Optional[int]:
except (ValueError, IndexError):
continue

# Fallback: Extract all numbers and take the last one
# This is our last resort, assuming the answer typically comes last
numbers = re.findall(r'(\d+)', response)
if numbers:
try:
# Convert to int and return the last number found
return int(numbers[-1])
except ValueError:
pass

# If all methods fail, return None
return None

def get_llm_response(problem: str, model: str) -> str:
    """
    Get response from the LLM for a given problem.

    Args:
        problem (str): The problem text, appended to SYSTEM_PROMPT in a single user message
        model (str): The model identifier passed to the chat completions call

    Returns:
        str: The stripped content of the first choice, or an empty string when
        the content is missing or the request raises.
    """
    try:
        # Per-request timeout override of 1000 seconds for this call.
        response = client.with_options(timeout=1000.0).chat.completions.create(
            model=model,
            messages=[
                # {"role": "system", "content": SYSTEM_PROMPT},
                {"role": "user", "content": SYSTEM_PROMPT + problem}
            ],
            max_tokens=8192,
            # extra_body={
            #     "decoding": "entropy_decoding",
            # }
        )
        content = response.choices[0].message.content
        # Treat missing content as an empty answer rather than letting
        # None.strip() surface as a misleading "request failed" error.
        return content.strip() if content else ""
    except Exception as e:
        # Best-effort: a failed request counts as an empty answer so the
        # evaluation loop can continue with the next attempt.
        logger.error(f"Error getting LLM response: {e}")
        return ""

def evaluate_response(predicted_answer: Optional[int], correct_answer: int) -> bool:
def make_n_attempts(problem: str, model: str, n: int) -> List[Dict]:
    """
    Query the model n times for one problem and record every attempt.

    Args:
        problem (str): The problem text
        model (str): The model identifier
        n (int): Number of attempts to make

    Returns:
        List[Dict]: One dictionary per attempt, in order, holding the 1-based
        attempt number, the raw response and the answer extracted from it
    """
    def _single_attempt(number: int) -> Dict:
        # Fetch the response first, then parse the answer out of it.
        text = get_llm_response(problem, model)
        answer = extract_answer(text)
        return {
            "attempt_number": number,
            "response": text,
            "predicted_answer": answer,
        }

    return [_single_attempt(number) for number in range(1, n + 1)]

def evaluate_pass_at_n(attempts: List[Dict], correct_answer: int) -> Tuple[bool, Optional[int]]:
    """
    Evaluate if any of the n attempts got the correct answer.

    Args:
        attempts (List[Dict]): List of attempt results; each entry must carry
            "predicted_answer" and "attempt_number"
        correct_answer (int): The correct answer

    Returns:
        Tuple[bool, Optional[int]]: (whether any attempt was correct, first
        correct attempt number). The number is None when no attempt matched,
        which includes an empty attempt list.
    """
    # Attempts are scanned in list order, so the first match is reported.
    # A predicted answer of None never equals an integer answer, so failed
    # extractions are skipped without a special case.
    for attempt in attempts:
        if attempt["predicted_answer"] == correct_answer:
            return True, attempt["attempt_number"]
    return False, None

def load_existing_results(filename: str) -> List[Dict]:
"""Load existing results from file if it exists."""
Expand All @@ -165,76 +166,84 @@ def get_last_processed_index(results: List[Dict]) -> int:
return -1
return max(int(r.get('index', -1)) for r in results)

def analyze_results(results: List[Dict], n: int):
    """
    Analyze and print summary statistics of the results.

    Prints the overall pass@n accuracy, then (when at least one problem was
    solved) the average attempt number of the first correct answer and how
    many problems were first solved on each attempt, and finally the
    predicted answers of every unsolved problem.

    Args:
        results (List[Dict]): List of evaluation results; each entry is read
            for 'is_correct', and additionally 'first_correct_attempt' when
            correct, or 'index', 'correct_answer' and 'attempts' when not
        n (int): Number of attempts per problem
    """
    from collections import Counter

    total = len(results)
    correct = sum(1 for r in results if r['is_correct'])
    # Guard against division by zero when there are no results yet.
    accuracy = correct / total if total > 0 else 0

    print("\n=== Results Summary ===")
    print(f"Evaluation mode: pass@{n}")
    print(f"Total problems: {total}")
    print(f"Correct answers: {correct}")
    print(f"Accuracy: {accuracy:.2%}")

    # Calculate attempt statistics
    successful_attempts = [r['first_correct_attempt'] for r in results if r['is_correct']]
    if successful_attempts:
        avg_attempts = sum(successful_attempts) / len(successful_attempts)
        print("\nFor correct solutions:")
        print(f"Average attempts needed: {avg_attempts:.2f}")
        print("Attempt distribution:")
        # Tally once instead of rescanning the list for every attempt number.
        attempt_counts = Counter(successful_attempts)
        for i in range(1, n + 1):
            print(f" Attempt {i}: {attempt_counts[i]} problems")

    print("\n=== Incorrect Problems ===")
    for r in results:
        if not r['is_correct']:
            print(f"Problem {r['index']}:")
            print(f"Expected: {r['correct_answer']}")
            print("Predicted answers across attempts:", [
                attempt['predicted_answer'] for attempt in r['attempts']
            ])
            print("---")

def main(model: str, n_attempts: int):
    """
    Main evaluation function.

    Runs every not-yet-processed problem of the dataset through n_attempts
    model calls, stores one result record per problem, and prints a summary
    of all stored results at the end.

    Args:
        model (str): The model identifier to evaluate
        n_attempts (int): Number of attempts per problem (pass@n)

    Raises:
        ValueError: If n_attempts is smaller than 1
    """
    # With zero attempts every problem would be recorded as failed without
    # ever querying the model, so reject it before doing any work.
    if n_attempts < 1:
        raise ValueError(f"n_attempts must be at least 1, got {n_attempts}")

    # Create results directory if it doesn't exist
    os.makedirs("results", exist_ok=True)

    # Include n_attempts in filename to keep separate results for different n values
    # NOTE(review): this is a bare filename; whether it ends up inside the
    # "results" directory depends on load_existing_results/save_result - confirm.
    results_file = f"evaluation_results_{model.replace('/', '_')}_pass_at_{n_attempts}.json"

    dataset = load_2024_dataset()
    existing_results = load_existing_results(results_file)
    last_processed_index = get_last_processed_index(existing_results)

    for idx, item in enumerate(tqdm(dataset, desc="Evaluating problems")):
        # Resume support: skip everything up to the highest stored index.
        if idx <= last_processed_index:
            continue

        problem_text = item['problem']
        correct_answer = int(item['answer'])

        # Make n attempts for each problem
        attempts = make_n_attempts(problem_text, model, n_attempts)
        is_correct, first_correct = evaluate_pass_at_n(attempts, correct_answer)

        result = {
            "index": idx,
            "problem": problem_text,
            "attempts": attempts,
            "correct_answer": correct_answer,
            "is_correct": is_correct,
            "first_correct_attempt": first_correct
        }
        # Persist after every problem so an interrupted run can be resumed.
        save_result(results_file, result)

        # Optional: Add delay between requests if needed
        # (fixed 300-second pause after each processed problem)
        time.sleep(300)

    final_results = load_existing_results(results_file)
    analyze_results(final_results, n_attempts)

# Command-line entry point: parse the model name and the pass@n attempt
# count, then run the evaluation.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Evaluate LLM performance on AIME 2024 problems")
    parser.add_argument("--model", type=str, required=True, help="OpenAI model to use (e.g., gpt-4, gpt-3.5-turbo)")
    parser.add_argument("--n", type=int, default=1, help="Number of attempts per problem (for pass@n evaluation)")
    args = parser.parse_args()

    # main requires both arguments; --n defaults to 1 (plain pass@1).
    main(args.model, args.n)
Loading

0 comments on commit 76008bd

Please sign in to comment.