-
Notifications
You must be signed in to change notification settings - Fork 8
/
fbd_score.py
105 lines (95 loc) · 3.88 KB
/
fbd_score.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
# Frechet Bert Distance (FBD)
import copy
from utils import (
read_data,
get_model_configs,
get_embeddings,
calculate_feature_statistics,
calculate_frechet_distance,
transform_qa_pairs,
read_dialogue
)
def get_statistics(
    querys,
    answers,
    tokenizer,
    model,
    batch_size,
    use_cuda=True
):
    """Embed query/answer pairs and summarise the resulting features.

    All arguments are forwarded unchanged, in the order listed, to
    ``get_embeddings``.  The features it produces are passed to
    ``calculate_feature_statistics`` and that result is returned as-is
    (``calculate_fbd`` unpacks it as a ``(mu, sigma)`` pair).
    """
    embeddings = get_embeddings(querys, answers, tokenizer, model, batch_size, use_cuda)
    statistics = calculate_feature_statistics(embeddings)
    return statistics
def calculate_fbd(
    source_querys,
    source_answers,
    target_querys,
    target_answers,
    is_chinese,
    pretrained_model_path,
    batch_size,
    device
):
    """Compute the FBD score between a source and a target set of QA pairs.

    A tokenizer and model are obtained from ``get_model_configs`` using
    ``pretrained_model_path`` and ``is_chinese``.  Statistics for the source
    pairs and for the target pairs are computed with ``get_statistics`` and
    then compared with ``calculate_frechet_distance``, whose result is
    returned.  Progress messages are printed before each stage.

    ``device`` enables CUDA only when it equals the string ``'gpu'``; any
    other value runs with ``use_cuda=False``.
    """
    tokenizer, model = get_model_configs(pretrained_model_path, is_chinese)
    on_gpu = device == 'gpu'

    print('get statistics from source data ...')
    source_mu, source_sigma = get_statistics(
        source_querys,
        source_answers,
        tokenizer,
        model,
        batch_size,
        use_cuda=on_gpu,
    )

    print('get statistics from target data ...')
    target_mu, target_sigma = get_statistics(
        target_querys,
        target_answers,
        tokenizer,
        model,
        batch_size,
        use_cuda=on_gpu,
    )

    print('calculate FBD score ...')
    return calculate_frechet_distance(
        source_mu, source_sigma, target_mu, target_sigma
    )
def fbd_score(args):
    """Load the QA pairs described by ``args`` and return their FBD score.

    Input selection:
      * If both ``args.source_path`` and ``args.target_path`` are set, source
        and target pairs are read from those files with ``read_data``.
      * Otherwise, if ``args.data_path`` is set, the pairs are read with
        ``read_dialogue`` and the target set starts as a deep copy of the
        source set.
      * Otherwise a ``ValueError`` is raised.

    If ``args.transform`` is truthy, the target pairs are passed through
    ``transform_qa_pairs`` together with ``args.ratio``, ``args.noise_dict``
    and ``args.repeat_dict`` before scoring.

    The remaining attributes read from ``args`` are ``is_chinese``,
    ``model_type`` (used as the pretrained model path), ``batch_size`` and
    ``device``.

    Returns:
        The value returned by ``calculate_fbd``.

    Raises:
        ValueError: if neither a source/target path pair nor a data path was
            provided.
    """
    if args.source_path is not None and args.target_path is not None:
        source_querys, source_answers = read_data(args.source_path)
        target_querys, target_answers = read_data(args.target_path)
    elif args.data_path is not None:
        source_querys, source_answers, _, _ = read_dialogue(args.data_path)
        # Deep copies so transforming the target can never alter the source.
        target_querys = copy.deepcopy(source_querys)
        target_answers = copy.deepcopy(source_answers)
    else:
        # Without this guard the function would fail further down with an
        # UnboundLocalError that gives no hint about the missing options.
        raise ValueError(
            'no input data: provide both --source_path and --target_path, '
            'or --data_path'
        )

    # Report how many queries and answers were loaded.
    print(len(source_querys), len(source_answers))

    if args.transform:
        target_querys, target_answers = transform_qa_pairs(
            target_querys,
            target_answers,
            args.transform,
            args.ratio,
            args.noise_dict,
            args.repeat_dict
        )

    return calculate_fbd(
        source_querys,
        source_answers,
        target_querys,
        target_answers,
        is_chinese=args.is_chinese,
        # args.model_type is used here; args.pretrained_model_path is not read.
        pretrained_model_path=args.model_type,
        batch_size=args.batch_size,
        device=args.device
    )
if __name__ == '__main__':
    # Command-line entry point: parse the options, compute the FBD score and
    # print it.
    from argparse import ArgumentDefaultsHelpFormatter, ArgumentParser

    parser = ArgumentParser(formatter_class=ArgumentDefaultsHelpFormatter)
    # Input data: either a source/target file pair, or one dialogue
    # annotation file.
    parser.add_argument('--source_path', type=str, help='path to the file of question answer pair')
    parser.add_argument('--target_path', type=str, help='path to the file of question answer pair')
    parser.add_argument('--data_path', type=str, help='path to dialogue annotation data')
    # Model selection.
    parser.add_argument('--model_type', type=str, default='', help='pretrained model type or path to pretrained model')
    parser.add_argument('--is_chinese', type=int, default=0, help='Is Chinese corpus or not')
    # Accepted on the command line, but fbd_score() passes --model_type as the
    # model path and does not read this option.
    parser.add_argument('--pretrained_model_path', type=str, default=None, help='path to pretrained model path')
    parser.add_argument('--batch_size', type=int, default=32)
    # Optional transformation applied to the target pairs before scoring.
    parser.add_argument('--transform', type=str, default=None,
                        help='transformation type for target pairs: [noise | mismatch | permutate | repeat]')
    parser.add_argument('--ratio', type=float, default=0.5, help='ratio of transformed pairs')
    parser.add_argument('--noise_dict', type=str, default=None, help='path to the noise dictionary')
    parser.add_argument('--repeat_dict', type=str, default=None, help='path to the repeatition dictionary')
    parser.add_argument('--device', type=str, default='cpu', help='[cpu | gpu]')
    args = parser.parse_args()

    # The returned score used to be discarded, so the script never displayed
    # its result; print it here.
    score = fbd_score(args)
    print('FBD score is {}'.format(score))