tokenizer.py
import typing
import torch
import transformers
from transformers import T5Tokenizer


def tokenize_function(
examples: typing.Dict[str, typing.Any],
tokenizer: transformers.PreTrainedTokenizer,
input_column_name: str = 'sentence',
target_column_name: str = 'label',
input_max_length: int = 512,
target_max_length: int = 512,
) -> typing.Dict[str, torch.Tensor]:
"""
Tokenizes batches of examples for an encoder-decoder model.
Args:
        examples: A batch in the form of a dictionary mapping column names to their respective values.
tokenizer: A function which converts string tokens into input_ids and other model inputs.
input_column_name: Name of the column within the input dictionary that contains the text which will be
tokenized.
target_column_name: Name of the column within the input dictionary that contains the labels which will be
tokenized.
input_max_length: The maximum length of the input sequence.
target_max_length: The maximum length of the target sequence.
Returns:
        A dictionary mapping model input names (`input_ids`, `attention_mask`, and `labels`) to their
        tensor values (e.g., the tensor corresponding to the input IDs of the model).
"""
inputs = examples[input_column_name]
encoding = tokenizer(
inputs,
padding='max_length',
max_length=input_max_length,
truncation=True,
return_tensors="pt",
)
results = {'input_ids': encoding.input_ids, 'attention_mask': encoding.attention_mask}
    # Tokenize the targets; T5 shifts labels internally to create decoder inputs, so no further preprocessing is needed.
outputs = examples[target_column_name]
labels = tokenizer(
outputs,
padding='max_length',
max_length=target_max_length,
truncation=True,
return_tensors="pt",
)['input_ids']
# Replace the padding token with -100 to ignore it for loss computation
labels[labels == tokenizer.pad_token_id] = -100
results['labels'] = labels
return results
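

# Illustrative usage sketch (not part of the original module): a minimal call to
# `tokenize_function` on an in-memory batch. Note the assumption that the target
# column already contains strings; integer class labels must be mapped to text
# first (see `tokenizer_function_one_input` below). The 't5-small' checkpoint is
# an arbitrary choice for the sketch.
def _example_tokenize_function() -> typing.Dict[str, torch.Tensor]:
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    batch = {
        'sentence': ['translate English to German: The house is wonderful.'],
        'label': ['Das Haus ist wunderbar.'],
    }
    return tokenize_function(batch, tokenizer=tokenizer)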


def tokenizer_function_one_input(
examples: typing.Dict[str, typing.Any],
tokenizer: T5Tokenizer,
label_names: typing.Dict[int, str],
prefix: str,
text_column_name: str = 'sentence',
label_column_name: str = 'label',
input_max_length: int = 512,
target_max_length: int = 512,
) -> typing.Dict[str, torch.Tensor]:
"""
Tokenizes batches of examples with only a single textual input for an encoder-decoder model.
Args:
        examples: A batch in the form of a dictionary mapping column names to their respective values.
tokenizer: A function which converts string tokens into input_ids and other model inputs.
label_names: A dictionary mapping from the integer representation of the label to the string representation.
        prefix: The task-specific string prefix prepended to each textual example.
text_column_name: Name of the column within the input dictionary that contains the text which will be tokenized.
label_column_name: Name of the column within the input dictionary that contains the labels which will be
tokenized.
input_max_length: The maximum length of the input sequence.
target_max_length: The maximum length of the target sequence.
Returns:
        A dictionary mapping model input names (`input_ids`, `attention_mask`, and `labels`) to their
        tensor values (e.g., the tensor corresponding to the input IDs of the model).
"""
inputs = [f"{prefix}{sentence}" for sentence in examples[text_column_name]]
encoding = tokenizer(
inputs,
padding='max_length',
max_length=input_max_length,
truncation=True,
return_tensors="pt",
)
results = {'input_ids': encoding.input_ids, 'attention_mask': encoding.attention_mask}
    # Tokenize the targets; T5 shifts labels internally to create decoder inputs, so no further preprocessing is needed.
outputs = [label_names[label] for label in examples[label_column_name]]
labels = tokenizer(
outputs,
padding='max_length',
max_length=target_max_length,
truncation=True,
return_tensors="pt",
)['input_ids']
# Replace the padding token with -100 to ignore it for loss computation
labels[labels == tokenizer.pad_token_id] = -100
results['labels'] = labels
return results
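

# Illustrative usage sketch (not part of the original module): single-input
# classification in the style of SST-2. The prefix and label names below are
# assumptions chosen for the sketch, not values mandated by this module.
def _example_tokenizer_function_one_input() -> typing.Dict[str, torch.Tensor]:
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    batch = {'sentence': ['A thoroughly enjoyable film.'], 'label': [1]}
    return tokenizer_function_one_input(
        batch,
        tokenizer=tokenizer,
        label_names={0: 'negative', 1: 'positive'},
        prefix='sst2 sentence: ',
    )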


def tokenizer_function_two_input(
examples: typing.Dict[str, typing.Any],
tokenizer: T5Tokenizer,
label_names: typing.Dict[int, str],
prefix_1: str,
prefix_2: str,
text_column_name_1: str = 'sentence1',
text_column_name_2: str = 'sentence2',
label_column_name: str = 'label',
is_regression: bool = False,
input_max_length: int = 512,
target_max_length: int = 512,
) -> typing.Dict[str, torch.Tensor]:
"""
    Tokenizes batches of examples with two textual inputs for an encoder-decoder model.
This tokenizer function merges two sentences along with their corresponding prefixes. For example, given the first
sentence "I love NLP." and the second sentence "You too?", the combination would be:
"stsb sentence1: I love NLP. sentence2: You too?"
Args:
        examples: A batch in the form of a dictionary mapping column names to their respective values.
tokenizer: A function which converts string tokens into input_ids and other model inputs.
label_names: A dictionary mapping from the integer representation of the label to the string representation.
        prefix_1: The task-specific string prefix prepended to the first sentence of each example.
        prefix_2: The string prefix prepended to the second sentence of each example.
text_column_name_1: Name of the first column within the input dictionary that contains the text which will be
tokenized.
text_column_name_2: Name of the second column within the input dictionary that contains the text which will be
tokenized.
label_column_name: Name of the column within the input dictionary that contains the labels which will be
tokenized.
        is_regression: True if the task is a regression task, False if it is a classification task.
input_max_length: The maximum length of the input sequence.
target_max_length: The maximum length of the target sequence.
Returns:
        A dictionary mapping model input names (`input_ids`, `attention_mask`, and `labels`) to their
        tensor values (e.g., the tensor corresponding to the input IDs of the model).
"""
inputs_1 = [f"{prefix_1}{sentence}" for sentence in examples[text_column_name_1]]
inputs_2 = [f"{prefix_2}{sentence}" for sentence in examples[text_column_name_2]]
inputs = [f"{sent1} {sent2}" for sent1, sent2 in zip(inputs_1, inputs_2)]
encoding = tokenizer(
inputs,
padding='max_length',
max_length=input_max_length,
truncation=True,
return_tensors="pt",
)
results = {'input_ids': encoding.input_ids, 'attention_mask': encoding.attention_mask}
if is_regression: # Training task involves predicting continuous values
outputs = [str(round(example, 1)) for example in examples[label_column_name]]
else: # Training task involves predicting a label from a predefined set of possible labels.
outputs = [label_names[example] for example in examples[label_column_name]]
    # Seq2seq models expect labels as tokenized text, so both regression and classification targets are strings.
labels = tokenizer(
outputs,
padding='max_length',
max_length=target_max_length,
truncation=True,
return_tensors="pt",
)['input_ids']
# Replace the padding token with -100 to ignore it for loss computation
labels[labels == tokenizer.pad_token_id] = -100
results['labels'] = labels
return results
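

# Illustrative usage sketch (not part of the original module): two-input
# regression in the style of STS-B, mirroring the prefix format from the
# docstring above. `label_names` is unused on the regression path, so an empty
# dictionary is passed.
def _example_tokenizer_function_two_input() -> typing.Dict[str, torch.Tensor]:
    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    batch = {
        'sentence1': ['I love NLP.'],
        'sentence2': ['You too?'],
        'label': [4.2],
    }
    return tokenizer_function_two_input(
        batch,
        tokenizer=tokenizer,
        label_names={},
        prefix_1='stsb sentence1: ',
        prefix_2='sentence2: ',
        is_regression=True,
    )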


def tokenizer_function_t5_pre_training(
examples: typing.Dict[str, typing.List[str]],
tokenizer: T5Tokenizer,
text_column_name: str = 'text',
) -> transformers.tokenization_utils_base.BatchEncoding:
"""
Tokenizes batches of examples for pre-training a T5 model.
Args:
        examples: A batch in the form of a dictionary mapping column names to their respective values.
tokenizer: A function which converts string tokens into input_ids and other model inputs.
text_column_name: Name of the column within the input dictionary that contains the text which will be
tokenized.
Returns:
        A `BatchEncoding` mapping model input names (e.g., `input_ids`) to padded and truncated tensors. No
        labels are produced here; span-corruption targets are typically constructed later by a data collator.
"""
batch_encoding = tokenizer(
text=examples[text_column_name],
max_length=tokenizer.model_max_length,
padding='max_length',
truncation=True,
return_tensors='pt',
)
return batch_encoding
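

# Illustrative usage sketch (not part of the original module): mapping the
# pre-training tokenizer over a Hugging Face `datasets` corpus with a 'text'
# column. The tiny in-memory corpus is an assumption for the sketch; labels for
# span corruption are expected to be produced later by a separate data collator.
def _example_tokenizer_function_t5_pre_training():
    import functools

    import datasets

    tokenizer = T5Tokenizer.from_pretrained('t5-small')
    corpus = datasets.Dataset.from_dict(
        {'text': ['T5 is pre-trained with a span-corruption objective.']}
    )
    return corpus.map(
        functools.partial(tokenizer_function_t5_pre_training, tokenizer=tokenizer),
        batched=True,
    )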