import os
import glob
import re
import textwrap
import torch
from unidecode import unidecode
from create_dataset import get_text_from_epub, get_text_from_html
from find_existing_attributions import get_attr_from_line, split_first_clause
from patch_model import get_patched_distilbert
from utils import tqdm, get_implications_ids, get_female_character_ids, get_character_blacklist_ids, zip_equal, config, get_implications
# It was a sunny day.
# "Hi Applejack!" screamed Rainbow Dash. "How are you?"
# "I'm fine, Rainbow Dash"
# narrator: It was a sunny day.
# [NEW_LINE]
# rainbow dash: (screamed) Hi Applejack!
# narrator: screamed Rainbow Dash.
# rainbow dash: How are you?
# [NEW_LINE]
# applejack: I'm fine, Rainbow Dash
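# Note: the functions below repeatedly rely on str.split('"'), which alternates
# narration and speech: even indices fall outside quotes, odd indices inside.
# For example:
#   '"Yes" Applejack blinked. "I\'m sure"'.split('"')
#   -> ['', 'Yes', ' Applejack blinked. ', "I'm sure", '']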
def load_text_for_model(path):
    """Load text from a txt, epub or html file and pre-process it."""
    try:
        if path.endswith('.txt'):
            text = open(path, 'r', encoding='utf8').read()
        elif path.endswith('.epub'):
            text = get_text_from_epub(path)
        elif path.endswith('.html'):
            text = get_text_from_html(path)
        else:
            raise ValueError("Invalid file type: " + path)
    except Exception as e:
        print(f'Error loading {path}: {e}')
        raise e
    # remove/normalize special characters
    text = unidecode(text)
    # collapse runs of spaces
    text = re.sub(' {2,}', ' ', text)
    # remove blank lines
    text = "\n".join([t for t in text.splitlines() if t.strip()])
    return text

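# A minimal usage sketch (the path below is illustrative, not part of the repo):
#   text = load_text_for_model('my_story.epub')
#   lines = [line.strip() for line in text.splitlines()]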
def get_model_input(tokenizer, lines, verb, line_num, max_seq_length=512, mask_mode=1, use_existing_verbs=False, default_verb='said'):
    """
    Returns text_ids, attention_mask and the masked line, which can be batched and fed into the model.
    Valid mask_modes are 10, 11, 20, 21, 30, 31 and 40 (see the branches below).
    """
    sep_id = tokenizer.sep_token_id
    mask_id = tokenizer.mask_token_id
    mask_token = tokenizer.mask_token
    # get line (which will be modified with a [MASK])
    line = lines[line_num]
    # e.g: '"Hi Applejack!" she said.'
    # e.g: '"Rarity," She replied flatly, "You have a store in Canterlot."'
    # e.g: '"Yes"'
    # In some cases, 'she' or 'She' needs to be replaced with '[MASK]'.
    # In other cases, adding a new clause is necessary.
    # TODO: check if this func improves results or not.
    line_masked = None
    if use_existing_verbs and verb is not None:
        # if there is an existing speech verb, replace its subject with [MASK]
        # e.g: '"Yes" she said.' -> '"Yes" [MASK] said.'
        # e.g: '"Yes" She replied flatly.' -> '"Yes" [MASK] replied flatly.'
        line_split = line.split('"')
        # -> [before quote 1, quote 1, after quote 1, quote 2, after quote 2, ...]
        for i, part in enumerate(line_split):
            if i % 2 == 1 or i == 0:
                continue  # skip quoted speech and the text before the first quote
            if verb not in part:
                continue
            # replace the first clause with f' {mask_token} {verb}.'
            part1, part2 = split_first_clause(part)
            part1 = f' {mask_token} {verb}.'
            line_split[i] = part1 + part2
            break
        line_masked = '"'.join(line_split)
    if line_masked is None:
        # if no existing speech verb, add a new clause or replace an old one
        # e.g: '"Yes"' -> '"Yes" [MASK] said.'
        line_split = line.split('"')
        # -> [before quote 1, quote 1, after quote 1, quote 2, after quote 2, ...]
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"Yes" [MASK] said. Applejack blinked. "I'm sure"'
        if mask_mode == 10:
            line_split[2] = f' {mask_token} {default_verb}.' + line_split[2]
            line_masked = '"'.join(line_split)
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"Yes" Applejack blinked. "I'm sure" [MASK] said.'
        elif mask_mode == 11:
            line_split[-1] = f' {mask_token} {default_verb}.' + line_split[-1]
            line_masked = '"'.join(line_split)
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"Yes" [MASK] said. "I'm sure"'
        elif mask_mode == 20:
            line_split[2] = f' {mask_token} {default_verb}. '
            line_masked = '"'.join(line_split).strip()
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"Yes" Applejack blinked. "I'm sure" [MASK] said.'
        elif mask_mode == 21:
            line_split[-1] = f' {mask_token} {default_verb}. '
            line_masked = '"'.join(line_split).strip()
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"Yes" [MASK] said.'
        elif mask_mode == 30:
            line_masked = f'"{line_split[1]}" {mask_token} {default_verb}.'
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"I'm sure" [MASK] said.'
        elif mask_mode == 31:
            line_masked = f'"{line_split[-2]}" {mask_token} {default_verb}.'
        # e.g: '"Yes" Applejack blinked. "I'm sure"' -> '"Yes" Applejack blinked. "I'm sure" [MASK] said.'
        elif mask_mode == 40:
            line_masked = f'{line} {mask_token} {default_verb}.'
        else:
            raise ValueError(f'Invalid mask_mode: {mask_mode}')
    assert line_masked is not None, 'line_masked is None'
    assert mask_token in line_masked, f'"{mask_token}" (mask token) not found in line_masked below:\n{line_masked}'
    text_ids = tokenizer(line_masked)['input_ids'][1:-1]  # list[int]
    assert mask_id in text_ids, f'mask_id ({mask_id}) not found in text_ids\n{text_ids}\n{line_masked}'
    # add surrounding lines as context (previous or next, whichever side is shorter) until the target length is reached
    prev_lines = lines[:line_num]
    next_lines = lines[line_num + 1:]
    prev_line_len = 0
    next_line_len = 0
    while len(text_ids) < max_seq_length:
        if prev_lines and (prev_line_len <= next_line_len or not next_lines):
            prev_line = prev_lines.pop(-1)
            text_ids = tokenizer(prev_line)['input_ids'][1:-1] + [sep_id] + text_ids
            prev_line_len += len(prev_line)
        elif next_lines:
            next_line = next_lines.pop(0)
            text_ids = text_ids + [sep_id] + tokenizer(next_line)['input_ids'][1:-1]
            next_line_len += len(next_line)
        else:
            break
    # trim to max_seq_length, dropping from whichever side has more added context
    if len(text_ids) > max_seq_length:
        text_ids = text_ids[:max_seq_length] if next_line_len > prev_line_len else text_ids[-max_seq_length:]
        assert mask_id in text_ids, f'mask_id ({mask_id}) not found in text_ids after trim operation.'
    # convert to tensor
    text_ids = torch.tensor(text_ids, dtype=torch.long)  # [seq_length]
    # attention mask
    attention_mask = torch.ones_like(text_ids)  # [seq_length]
    # pad to max_seq_length if needed
    if len(text_ids) < max_seq_length:
        pad = max_seq_length - len(text_ids)
        text_ids = torch.nn.functional.pad(text_ids, (0, pad), value=0)
        attention_mask = torch.nn.functional.pad(attention_mask, (0, pad), value=0)
    return {
        'text_ids': text_ids,
        'attention_mask': attention_mask,
        'line_masked': line_masked,
        'line_num': line_num,
    }

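# Sketch of what the default CLI setting (mask_mode=21) produces for the masked
# line, assuming DistilBERT's '[MASK]' mask token and default_verb='said':
#   '"Yes" Applejack blinked. "I\'m sure"'
#   -> '"Yes" Applejack blinked. "I\'m sure" [MASK] said.'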
def collate_fn(batch):
    """Collate function for batching."""
    return {
        'text_ids': torch.stack([b['text_ids'] for b in batch]),
        'attention_mask': torch.stack([b['attention_mask'] for b in batch]),
        'lines_masked': [b['line_masked'] for b in batch],
        'line_nums': [b['line_num'] for b in batch],
    }

def apply_implications(probs: torch.Tensor):
    """
    Take a [B, n_vocab] tensor of probabilities and merge the probabilities of specified token pairs.
    """
    for (src_id, tgt_id) in get_implications_ids():
        probs[:, tgt_id] += probs[:, src_id]
        probs[:, src_id] = 0

def apply_blacklist(logits: torch.Tensor):
    """
    Take a [B, n_vocab] tensor of logit scores and set the logit scores of specified tokens to -inf.
    """
    for id in get_character_blacklist_ids():
        logits[:, id] = -float('inf')

def apply_female_blacklist(logits: torch.Tensor):
    """Set the logit scores of all female character tokens to -inf."""
    for id in get_female_character_ids():
        logits[:, id] = -float('inf')

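# Toy example of apply_implications (token ids and values are illustrative):
# if get_implications_ids() returned [(0, 2)], then
#   probs = torch.tensor([[0.2, 0.3, 0.5]])
#   apply_implications(probs)  # probs is now [[0.0, 0.3, 0.7]]
# i.e. the probability of an alias token is folded into its canonical token.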
def infer_speaker_of_line(tokenizer, model, text_ids, attention_mask, batch_lines_masked):
    """
    Takes batched model inputs and the corresponding masked lines.
    Returns the predicted speakers, their confidences and the lines with [MASK] filled in.
    """
    mask_id = tokenizer.mask_token_id
    mask_token = tokenizer.mask_token
    # run model
    with torch.no_grad():
        device = next(model.parameters()).device
        logits = model(
            text_ids.to(device=device),
            attention_mask.to(device=device)
        )['logits'].float()
    # extract logits for the [MASK] token of each item
    # (move logits to the CPU first so the boolean mask, which lives on the CPU, can index them)
    logits = logits.cpu()
    logits = logits[text_ids == mask_id, :]  # [B, seq_length, vocab_size] -> [B, vocab_size]
    B, n_vocab = logits.shape
    assert B == len(batch_lines_masked), f'Mismatch! Got {B} masked tokens and {len(batch_lines_masked)} batch size.'
    # get top prediction
    apply_blacklist(logits)
    probs = torch.softmax(logits, dim=1)  # [B, vocab_size]
    apply_implications(probs)
    confidences, indexes = probs.max(dim=1)  # -> [B], [B]
    # convert to speaker name
    speakers = []
    batch_lines_predicted = []
    for line, index in zip(batch_lines_masked, indexes.unbind(0)):
        speaker = tokenizer._convert_id_to_token(index.item())
        batch_lines_predicted.append(line.replace(mask_token, speaker))
        speakers.append(speaker)
    return speakers, confidences, batch_lines_predicted

def infer(lines, tokenizer, model, search_length=8, batch_size=64, fast_fill_threshold=0.99, max_seq_length=256, mask_mode=1, use_existing_verbs=False, default_verb='said', use_pbar=True):
    """
    Takes text and returns an ordered list of (speaker, speech_verb, line_num) tuples.
    Uses a neural network to infer the speaker when it is ambiguous.
    """
    lines = lines.copy()
    line_attrs = []  # [[speaker, speech_verb, line_num], ...]
    # split into (speaker, speech_verb, line) tuples
    for line_num, line in enumerate(lines):
        character, verbs = get_attr_from_line(line)  # get speaker and speech_verb (if speaker is obvious)
        if character in config['character_blacklist_infer']:
            character = None
        verb = ([v for v in verbs if v] + [None])[0]  # grab first verb (if any)
        line_attrs.append([character, verb, line_num])
    # assign "Narrator" or "UnknownSpeaker" to None speakers
    for i, (character, verb, line_num) in enumerate(line_attrs):
        if character is None:
            has_speech = lines[line_num].count('"') > 1
            line_attrs[i][0] = "UnknownSpeaker" if has_speech else "Narrator"
    # identify the speaker for lines with "UnknownSpeaker"
    # (use the neural network to infer the speaker. start with the easiest lines and repeat until all lines are assigned)
    missing_lines = [t for t in line_attrs if t[0] == "UnknownSpeaker"]
    if use_pbar:
        pbar = tqdm(total=len(missing_lines), desc='Inferring speakers', leave=False)
    confidences = []
    for _ in range(len(missing_lines)):
        if len(missing_lines) == 0:
            break
        # infer every line with a missing speaker and find the highest confidence prediction
        results = []
        batch = []
        for i, (character, verb, line_num) in enumerate(missing_lines[:search_length]):
            is_last_item = i == len(missing_lines[:search_length]) - 1
            item = get_model_input(tokenizer, lines, verb, line_num,
                                   max_seq_length=max_seq_length, mask_mode=mask_mode,
                                   use_existing_verbs=use_existing_verbs, default_verb=default_verb)
            batch.append(item)
            if len(batch) == batch_size or is_last_item:
                batch = collate_fn(batch)
                speaker_batch, confidence_batch, lines_pred_batch = infer_speaker_of_line(
                    tokenizer, model, batch['text_ids'], batch['attention_mask'], batch['lines_masked'])
                for result in zip_equal(speaker_batch, confidence_batch, batch['line_nums'], lines_pred_batch):
                    results.append(result)
                batch = []
        # update with results
        results = sorted(results, key=lambda x: x[1], reverse=True)  # sort highest confidence first
        new_line_nums = []
        for i, (speaker, conf, line_num, line_pred) in enumerate(results):
            is_first_item = bool(i == 0)
            # always accept the most confident prediction; accept the rest only above the fast-fill threshold
            if not is_first_item and conf < fast_fill_threshold:
                break
            #tqdm.write(f"Labelled line {line_num:>4} with {speaker} (confidence: {conf:2.0%})\n{lines[line_num]}\n{line_pred}\n")
            if use_pbar: pbar.update(1)
            line_attrs[line_num][0] = speaker
            lines[line_num] = line_pred
            new_line_nums.append(line_num)
            confidences.append(conf)
        # remove labelled lines from missing_lines
        missing_lines = list(filter(lambda x: x[2] not in new_line_nums, missing_lines))
    if use_pbar: pbar.close()
    implications = {k: v for k, v in get_implications()}
    for i, (character, verb, line_num) in enumerate(line_attrs):
        line_attrs[i][0] = implications.get(character, character)
    # avoid ZeroDivisionError when no lines needed inference
    mean_confidence = sum(confidences) / len(confidences) if confidences else 1.0
    return line_attrs, mean_confidence

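# A minimal sketch of calling infer directly (model/tokenizer come from
# load_model below; the kwargs mirror the CLI defaults in __main__):
#   attrs, mean_conf = infer(lines, tokenizer, model, mask_mode=21)
#   # attrs: [[speaker, speech_verb, line_num], ...] in original line order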
def get_script_from_line_attrs(line_attrs, lines):
    """
    Takes line_attrs and lines and returns a play-script formatted string.
    """
    out_lines = []
    for character, verb, line_num in line_attrs:
        line = lines[line_num].strip()
        line_split = line.split('"')
        # -> [before quote 1, quote 1, after quote 1, quote 2, after quote 2, ...]
        for i, part in enumerate(line_split):
            part = part.strip()
            if not part:
                continue
            if i % 2 == 0:
                # even indices are narration outside the quotes
                out_lines.append(f'Narrator (None) : {part}')
            else:
                out_lines.append(f'{character} ({verb}) : {part}')
        out_lines.append('')
    return '\n'.join(out_lines)

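# Example output for the line '"Hi Applejack!" screamed Rainbow Dash.' once the
# speaker has been attributed (speaker and verb values are illustrative):
#   rainbow dash (screamed) : Hi Applejack!
#   Narrator (None) : screamed Rainbow Dash.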
def load_model(model_path, device, dtype):
    model, tokenizer = get_patched_distilbert()[:2]
    # drop the first 6 characters of each key (likely a 'model.' prefix added by the training wrapper)
    model.load_state_dict({k[6:]: v for k, v in torch.load(model_path)['state_dict'].items()})
    model.eval().to(device).to(dtype=getattr(torch, dtype))
    return model, tokenizer

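# Usage sketch (the checkpoint path is illustrative; dtype must name a torch
# attribute such as 'half' or 'float'):
#   model, tokenizer = load_model('checkpoints/last.ckpt', 'cuda', 'half')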
if __name__ == '__main__':
    import argparse
    parser = argparse.ArgumentParser()
    bool_fn = lambda x: x.lower() in ['true', '1', 't', 'y', 'yes']
    parser.add_argument('--model_path', type=str)
    parser.add_argument('--from_file', type=str, default=None)
    parser.add_argument('--from_dir', type=str, default=None)
    parser.add_argument('--output_dir', type=str, default=None)
    parser.add_argument('--skip_existing', type=bool_fn, default=True)
    parser.add_argument('--label_with_conf', type=bool_fn, default=False)
    parser.add_argument('--device', type=str, default='cuda')
    parser.add_argument('--dtype', type=str, default='half')
    args = parser.parse_args()
    infer_kwargs = dict(
        search_length=32,
        batch_size=64,
        fast_fill_threshold=0.99,
        max_seq_length=256,
        mask_mode=21,
        use_existing_verbs=False,
        default_verb='said',
    )
    model, tokenizer = load_model(args.model_path, args.device, args.dtype)
    files = []
    if args.from_file is not None:
        files.append(args.from_file)
    if args.from_dir is not None:
        files.extend(glob.glob(os.path.join(args.from_dir, '*.txt')))
        files.extend(glob.glob(os.path.join(args.from_dir, '*.epub')))
    os.makedirs(args.output_dir, exist_ok=True)
    file_sizes = [os.path.getsize(f) // 1024 for f in files]
    pbar = tqdm(total=sum(file_sizes), desc='Processing files', unit='KB', smoothing=0.0)
    confidences = []
    for i, file in enumerate(files):
        filename = os.path.basename(file)
        filename_no_ext = os.path.splitext(filename)[0]
        output_file = os.path.join(args.output_dir, filename_no_ext + '.txt')
        if args.skip_existing and os.path.exists(output_file):
            tqdm.write(f"Skipping {filename} (already exists)")
            pbar.update(file_sizes[i])
            continue
        try:
            text = load_text_for_model(file)
            lines = [line.strip() for line in text.splitlines()]
            attrs, mean_confidence = infer(lines, tokenizer, model, **infer_kwargs)
            script_text = get_script_from_line_attrs(attrs, lines)
        except Exception as e:
            tqdm.write(f"Error processing {filename}: {e}")
            raise e
        open(output_file, 'w', encoding='utf8').write(script_text)
        tqdm.write(f"Saved {filename} with mean confidence {mean_confidence:2.0%}")
        confidences.append(mean_confidence)
        pbar.update(file_sizes[i])
    if len(confidences):
        tqdm.write(f"Mean confidence of all files: {sum(confidences) / len(confidences):2.1%}")