import os
from typing import List, Tuple, Union

import evaluate
import numpy as np
import torch
from loguru import logger
from rouge_score import rouge_scorer
from sentence_transformers import SentenceTransformer

sacrebleu = None
sentence_transformer_model_cache = {}


def calculate_per_sample_accuracy(prediction: int, truth: int) -> bool:
    """
    Computes the accuracy of a single prediction.

    This function checks if a given prediction matches the ground truth.

    Parameters:
    - prediction (int): The predicted value.
    - truth (int): The actual ground truth value.

    Returns:
    - bool: True if the prediction matches the truth, False otherwise.
    """
    return prediction == truth


def calculate_hit_rate_3(retrieved_int: List[int], truth: List[int]) -> float:
    """
    Calculates the hit rate within the top 3 retrieved integers.

    This function assesses how many of the truth integers are present
    within the first three elements of the retrieved list of integers.

    Parameters:
    - retrieved_int (List[int]): The list of retrieved integers, ordered by relevance.
    - truth (List[int]): The list of ground truth integers.

    Returns:
    - float: The hit rate, calculated as the proportion of truth integers found
      in the top 3 retrieved integers, relative to the total number of truth integers.
    """
    # Count how many truth integers appear among the top 3 retrieved integers
    hit = len(set(truth).intersection(set(retrieved_int[:3])))
    # Normalize the hit count by the total number of truth integers to get the hit rate
    hit_rate = hit / len(truth)
    return hit_rate
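
# Illustrative example (assumed inputs, not part of the original module): with
# retrieved_int = [4, 1, 7, 9] and truth = [1, 7, 5], the top-3 slice [4, 1, 7]
# contains 2 of the 3 truth integers, so calculate_hit_rate_3 returns 2 / 3 (about 0.667).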


def calculate_rougel(generation: str, truth: str) -> float:
    """
    Calculates the ROUGE-L F-measure score between a generated string and the truth string.

    ROUGE-L measures the longest common subsequence between the generated text and the truth text,
    considering both the precision and recall of the sequences. It is widely used in evaluating
    the quality of text generation systems.

    Parameters:
    - generation (str): The generated text to evaluate.
    - truth (str): The ground truth text to compare against.

    Returns:
    - float: The ROUGE-L F-measure score, indicating the quality of the generated text.
    """
    # Initialize the ROUGE scorer with the ROUGE-L metric
    scorer = rouge_scorer.RougeScorer(["rougeL"], use_stemmer=True)
    # Calculate the ROUGE scores between the generated text and the truth text
    scores = scorer.score(generation, truth)
    # Extract and return the ROUGE-L F-measure score
    return scores["rougeL"].fmeasure
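
# Illustrative example (assumed inputs, not part of the original module): for
# generation = "the cat sat on the mat" and truth = "the cat lay on the mat", the longest
# common subsequence is "the cat ... on the mat" (5 of the 6 tokens in each string), so
# precision = recall = 5/6 and the returned F-measure is about 0.833.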


def load_sentence_transformer_model(model_name: str) -> SentenceTransformer:
    """
    Loads a Sentence Transformer model by its name and moves it to the appropriate device.

    Parameters:
    - model_name (str): The name of the model to load.

    Returns:
    - SentenceTransformer: The loaded SentenceTransformer model.
    """
    global sentence_transformer_model_cache
    # The model cache ensures we do not reload the model on every call
    if model_name not in sentence_transformer_model_cache:
        device = "cuda" if torch.cuda.is_available() else "cpu"
        model = SentenceTransformer(model_name).to(device)
        sentence_transformer_model_cache[model_name] = model
    return sentence_transformer_model_cache[model_name]


def calculate_cosine_similarity(generated_text: str, reference_texts: Union[str, List[str]], model_name: str) -> float:
    """
    Computes the cosine similarity score(s) between a generated text and reference text(s) using a sentence embedding model.

    This function calculates the cosine similarity between the embedding of the generated text and the embedding(s)
    of reference text(s). The embeddings are generated using a specified sentence embedding model. The cosine similarity
    score is a measure of similarity between two vectors, ranging from -1 (completely different) to 1 (exactly the same).

    Parameters:
    - generated_text (str): The text generated by the model.
    - reference_texts (Union[str, List[str]]): The reference text(s) for comparison. Can be a single string or a list of strings.
    - model_name (str): The name of the sentence embedding model used to generate text embeddings.

    Returns:
    - float: The average cosine similarity score between the generated text and the reference text(s). If reference_texts is a single
      string, a single score is returned. If reference_texts is a list of strings, the average score across all references is returned.
      The score is bounded between 0 (no similarity) and 1 (identical), with negative scores clipped to 0.
    """
    # Load the sentence embedding model (cached across calls)
    model = load_sentence_transformer_model(model_name)
    # Embedding for the generated text
    generated_embedding = model.encode([generated_text])[0]
    # Handling a single reference text
    if isinstance(reference_texts, str):
        # Embedding for the single reference text
        reference_embedding = model.encode([reference_texts])[0]
        # Compute cosine similarity
        similarity_score = np.dot(generated_embedding, reference_embedding) / (
            np.linalg.norm(generated_embedding) * np.linalg.norm(reference_embedding)
        )
        # Ensure non-negative score
        return max(similarity_score, 0)
    # Handling multiple reference texts
    else:
        similarity_scores = []
        for reference_text in reference_texts:
            # Embedding for each reference text
            reference_embedding = model.encode([reference_text])[0]
            # Compute cosine similarity for each reference
            individual_score = np.dot(generated_embedding, reference_embedding) / (
                np.linalg.norm(generated_embedding) * np.linalg.norm(reference_embedding)
            )
            similarity_scores.append(individual_score)
        # Calculate and ensure non-negative average score
        return max(np.mean(similarity_scores), 0)
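
# Illustrative usage (not part of the original module; assumes network access and the
# hypothetical choice of the public "all-MiniLM-L6-v2" SentenceTransformer checkpoint,
# which this module does not itself reference):
#
#     score = calculate_cosine_similarity(
#         "The quick brown fox",
#         ["A fast brown fox", "An unrelated sentence"],
#         "all-MiniLM-L6-v2",
#     )
#     # score is the mean of the two per-reference cosine similarities, clipped at 0.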


def calculate_true_positive_false_positives_false_negatives(
    extracted_entities: List[str], ground_truth_entities: List[str]
) -> Tuple[int, int, int]:
    """
    Calculates true positives, false positives, and false negatives for entity extraction.

    This function compares a list of extracted entities against a list of ground truth entities
    to determine the count of true positives (correctly extracted entities), false positives
    (incorrectly extracted entities), and false negatives (missed entities).

    The comparison is case-insensitive, and leading/trailing spaces in extracted entities are ignored.

    Parameters:
    - extracted_entities (List[str]): The list of entities extracted by the model.
    - ground_truth_entities (List[str]): The list of actual entities (ground truth).

    Returns:
    - Tuple[int, int, int]: A tuple containing the counts of true positives, false positives, and false negatives.
    """
    # Normalize the extracted entities by lowercasing and stripping leading/trailing spaces
    normalized_extracted_entities = [entity.lower().strip() for entity in extracted_entities]
    # Normalize the ground truth entities by lowercasing
    normalized_ground_truth_entities = [entity.lower() for entity in ground_truth_entities]
    # True positives: extracted entities that appear in the ground truth
    true_positives = len(set(normalized_extracted_entities).intersection(set(normalized_ground_truth_entities)))
    # False positives: extracted entities not in the ground truth
    false_positives = len(normalized_extracted_entities) - true_positives
    # False negatives: ground truth entities that were not extracted
    false_negatives = len(normalized_ground_truth_entities) - true_positives
    return true_positives, false_positives, false_negatives
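
# Illustrative example (assumed inputs, not part of the original module): with
# extracted_entities = ["Paris ", "london", "Berlin"] and
# ground_truth_entities = ["Paris", "London"], normalization yields
# {"paris", "london", "berlin"} vs. {"paris", "london"}, so the function returns
# (2, 1, 0): 2 true positives, 1 false positive ("berlin"), and 0 false negatives.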


def calculate_f1_score(metrics_list: List[Tuple[int, int, int]]) -> float:
    """
    Calculates the F1 score from a list of tuples containing true positives, false positives, and false negatives.

    Parameters:
    - metrics_list (List[Tuple[int, int, int]]): A list of tuples, where each tuple contains counts of true positives,
      false positives, and false negatives in that order for various classifications or entity extractions.

    Returns:
    - float: The computed F1 score, ranging from 0 to 1.
    """
    total_tp, total_fp, total_fn = 0, 0, 0
    # Aggregate total true positives, false positives, and false negatives
    for tp, fp, fn in metrics_list:
        total_tp += tp
        total_fp += fp
        total_fn += fn
    # Calculate precision and recall
    precision = total_tp / (total_tp + total_fp) if (total_tp + total_fp) > 0 else 0
    recall = total_tp / (total_tp + total_fn) if (total_tp + total_fn) > 0 else 0
    # Calculate F1 score, handling the case where precision + recall equals 0
    if precision + recall == 0:
        return 0
    else:
        return 2 * precision * recall / (precision + recall)
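
# Illustrative example (assumed counts, not part of the original module): pooling the
# per-sample counts [(2, 1, 0), (1, 0, 2)] gives totals tp=3, fp=1, fn=2, so
# precision = 3/4 = 0.75, recall = 3/5 = 0.6, and the micro-averaged
# F1 = 2 * 0.75 * 0.6 / (0.75 + 0.6) = 0.9 / 1.35, about 0.667.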


def calculate_ndcg(predicted_relevance_scores: List[int], true_relevance_weights: List[float]) -> float:
    """
    Calculates the Normalized Discounted Cumulative Gain (NDCG) score from predicted relevance scores
    against true relevance weights. The predicted scores are trimmed, if necessary, to match the length
    of the true relevance weights, and the result is normalized against the ideal ranking.

    Parameters:
    - predicted_relevance_scores (List[int]): Indices of items ranked by the algorithm, expected to be integers starting from 1.
    - true_relevance_weights (List[float]): Actual relevance weights for the items, with higher values indicating greater relevance.

    Returns:
    - float: The NDCG score, normalized against the ideal ranking, ranging from 0 to 1.
    """
    # Trim the predicted scores to match the true scores length if necessary
    if len(predicted_relevance_scores) > len(true_relevance_weights):
        predicted_relevance_scores = predicted_relevance_scores[: len(true_relevance_weights)]
    dcg, idcg = 0.0, 0.0
    # Calculate DCG for the predicted ranking
    for i, score_index in enumerate(predicted_relevance_scores, start=1):
        if score_index - 1 < len(true_relevance_weights):
            relevance = true_relevance_weights[score_index - 1]
        else:
            relevance = 0
        dcg += (np.power(2, relevance) - 1) / np.log2(i + 1)
    # Calculate IDCG using sorted true relevance weights
    for i, weight in enumerate(sorted(true_relevance_weights, reverse=True), start=1):
        idcg += (np.power(2, weight) - 1) / np.log2(i + 1)
    # Avoid division by zero
    return 0 if idcg == 0 else dcg / idcg
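
# Illustrative example (assumed inputs, not part of the original module): with
# predicted_relevance_scores = [2, 1] (1-based item indices) and
# true_relevance_weights = [3.0, 1.0]:
#   DCG  = (2**1 - 1) / log2(2) + (2**3 - 1) / log2(3)  ~ 1.00 + 4.42 = 5.42
#   IDCG = (2**3 - 1) / log2(2) + (2**1 - 1) / log2(3)  ~ 7.00 + 0.63 = 7.63
# so calculate_ndcg returns roughly 5.42 / 7.63, about 0.71.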


def calculate_bleu_score(generated_text: str, reference_text: str, is_japanese: bool = False) -> float:
    """
    Calculates the BLEU score for a generated text compared to a reference truth text. This function supports
    both general text and Japanese-specific evaluation by using the sacrebleu library.

    Parameters:
    - generated_text (str): The generated text to be evaluated.
    - reference_text (str): The reference truth text.
    - is_japanese (bool, optional): Flag to indicate whether the text is in Japanese, requiring special tokenization.

    Returns:
    - float: The BLEU score on a 0 to 1 scale for the generated text against the reference truth.
    """
    global sacrebleu
    # Lazily load the sacrebleu metric so importing this module stays cheap
    if sacrebleu is None:
        sacrebleu = evaluate.load("sacrebleu")
    # Preprocess the generated text: strip surrounding newlines and keep only the first line
    generated_text = generated_text.strip("\n").split("\n")[0]
    candidate = [generated_text]
    reference = [[reference_text]]
    # Compute BLEU score with or without Japanese-specific tokenization
    bleu_args = {"predictions": candidate, "references": reference, "lowercase": True}
    if is_japanese:
        bleu_args["tokenize"] = "ja-mecab"
    # sacrebleu reports BLEU on a 0-100 scale; rescale it to 0-1
    score = sacrebleu.compute(**bleu_args)["score"] / 100
    return score
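

# Minimal self-check (added for illustration, not part of the original evaluation
# pipeline). It exercises only the pure-Python metrics, so it runs without any model
# downloads; run the module directly to execute it.
if __name__ == "__main__":
    # Hit rate: 2 of the 3 truth items appear in the top-3 retrieved items
    assert abs(calculate_hit_rate_3([4, 1, 7, 9], [1, 7, 5]) - 2 / 3) < 1e-9
    # Entity extraction counts: 2 true positives, 1 false positive, 0 false negatives
    assert calculate_true_positive_false_positives_false_negatives(
        ["Paris ", "london", "Berlin"], ["Paris", "London"]
    ) == (2, 1, 0)
    # Micro-averaged F1 over two samples: precision 0.75, recall 0.6 -> F1 of about 0.667
    assert abs(calculate_f1_score([(2, 1, 0), (1, 0, 2)]) - 2 / 3) < 1e-9
    # NDCG for a slightly shuffled ranking is below 1 but above 0
    assert 0 < calculate_ndcg([2, 1], [3.0, 1.0]) < 1
    print("metrics.py self-check passed")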