# prompt_evaluator.py
import asyncio

import backoff
import pandas as pd
from tqdm.asyncio import tqdm_asyncio
from vertexai.generative_models import GenerativeModel


class ReviewModelError(Exception):
    """Custom exception for review model errors."""
    pass

class PromptEvaluator:
    def __init__(self, df_train, target_model_name, target_model_config,
                 review_model_name, review_model_config, safety_settings,
                 review_prompt_template_path):
        self.df_train = df_train
        self.target_model_name = target_model_name
        self.target_model_config = target_model_config
        self.review_model_name = review_model_name
        self.review_model_config = review_model_config
        self.safety_settings = safety_settings
        self.review_prompt_template_path = review_prompt_template_path
        # The review model is reused across calls; the target model is re-instantiated
        # per evaluation so the candidate prompt can be passed as its system instruction
        # (see generate_target_model_response).
        self.target_model = GenerativeModel(self.target_model_name)
        self.review_model = GenerativeModel(self.review_model_name)

    @backoff.on_exception(backoff.expo, Exception, max_tries=5)
    async def generate_target_model_response(self, question, prompt):
        # Build a fresh target model so the candidate prompt under evaluation
        # is supplied as the system instruction for this request.
        target_model = GenerativeModel(
            self.target_model_name,
            generation_config=self.target_model_config,
            safety_settings=self.safety_settings,
            system_instruction=prompt
        )
        response = await target_model.generate_content_async(
            question,
            stream=False,
        )
        return response.text

    @backoff.on_exception(backoff.expo, Exception, max_tries=5)
    async def generate_review_model_response(self, review_prompt):
        review_response = await self.review_model.generate_content_async(
            [review_prompt],
            generation_config=self.review_model_config,
            safety_settings=self.safety_settings,
            stream=False,
        )
        return review_response.text.strip().lower()

    async def generate_and_review(self, row, prompt):
        try:
            model_response = await self.generate_target_model_response(row["question"], prompt)

            # Check that the target model returned a valid response before reviewing it
            if not model_response or not isinstance(model_response, str):
                raise ReviewModelError("Target model did not return a valid response.")

            # Load the review prompt template from the text file
            with open(self.review_prompt_template_path, 'r') as f:
                review_prompt_template = f.read().strip()

            # Fill in the review prompt with the model response and the ground truth
            review_prompt = review_prompt_template.format(
                model_response=model_response, ground_truth=row['answer']
            )

            # Use the review model to compare the model response with the ground truth
            review_result = await self.generate_review_model_response(review_prompt)

            # The review model must answer either 'true' or 'false'
            if review_result not in ['true', 'false']:
                raise ReviewModelError("Review model did not return a valid response.")

            is_correct = review_result == 'true'  # Correct when the review model answers 'true'
            return row.name, model_response, is_correct
        except ReviewModelError as e:
            print(f"Error: {e} Terminating the program.")
            raise  # Re-raise the exception to be caught in main()
        except Exception as e:
            print(f"An error occurred: {e}. Terminating the program.")
            raise  # Re-raise the exception to be caught in main()

    async def evaluate_prompt(self, prompt):
        tasks = [self.generate_and_review(row, prompt) for _, row in self.df_train.iterrows()]

        # Create a tqdm progress bar that is updated as each task completes
        with tqdm_asyncio(total=len(tasks), desc="Evaluating Prompt") as pbar:
            async def wrapped_task(task):
                result = await task
                pbar.update(1)  # Update the progress bar after task completion
                return result

            # Run all generate-and-review tasks concurrently
            results = await asyncio.gather(*[wrapped_task(task) for task in tasks])

        # Prepare results for saving
        evaluation_results = []
        for index, model_response, is_correct in results:
            if index is not None:  # Check that the index is valid
                self.df_train.loc[index, 'model_response'] = model_response
                self.df_train.loc[index, 'is_correct'] = is_correct
                evaluation_results.append({
                    'question': self.df_train.loc[index, 'question'],
                    'ground_truth': self.df_train.loc[index, 'answer'],
                    'model_response': model_response,
                    'is_correct': is_correct
                })

        overall_accuracy = sum(self.df_train["is_correct"]) / len(self.df_train)

        # Save the per-question results to CSV
        results_df = pd.DataFrame(evaluation_results)
        results_csv_path = 'evaluation_results.csv'
        results_df.to_csv(results_csv_path, index=False)

        return overall_accuracy

    async def main(self, prompt):
        try:
            accuracy = await self.evaluate_prompt(prompt)
            print(f"Overall accuracy for the prompt: {accuracy:.2f}")
        except ReviewModelError:
            print("The program has terminated due to an invalid response from the review model.")
        except Exception as e:
            print(f"The program has terminated due to an unexpected error: {e}")