# This is a template for the config file.
# Rename this file by removing the '.template' suffix to activate the config.
[Basic]
## Random seed. It applies to random, numpy, and pytorch.
random_seed = 0
## CUDA device ID. It overrides the environment variable CUDA_VISIBLE_DEVICES. Default: 0.
cuda_id = 0
## Where to store the trial data, including log files, model checkpoints, and test results.
## All data will be placed at {base_path}/{folder_name}. If the folder does not exist, it will be created.
base_path = "../models/task-name"
folder_name = "model-name"
[Corpus]
## Tag type for prediction. It depends on the task and dataset. Custom names are allowed.
## See the flair documentation (https://flairnlp.github.io/docs/tutorial-training/how-to-train-sequence-tagger) for more details.
## Examples: "upos", "pos", "ner", "mlm", "class".
tag_type = "pos"
## Below defines the corpus to use for training/testing. Please use only one corpus for each trial.
## Supports all datasets provided by flair (https://github.com/flairNLP/flair), and the following.
## The Penn Treebank 3 dataset ====================================================================================
## - Description: The PTB dataset from LDC.
## - Source: https://catalog.ldc.upenn.edu/LDC99T42
## - Supported Tag Types: "mlm", "pos"
##
## :param splits: Section splits for Penn Treebank.
## A split is composed of 3 datasets (train|dev|test), separated by '|'. Each dataset
## is represented by numbers separated by commas. You can use '-' to represent a continuous
## range of numbers (e.g.: "0,1,3-5,9|7|22-24"). These numbers are the section numbers in
## Penn Treebank, from 0 to 24. Default: "2-21|22|23"
## :param base_path: Base path for flair storage. The default setting is usually fine.
## :param in_memory: If set to True, keeps full dataset in memory, otherwise does disk reads
##
# [Corpus.PennTreebankCorpus]
# splits = "0-18|19-21|22-24"
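## For reference, the default split documented above (WSJ sections 2-21 for train, 22 for dev, 23 for test)
## can also be spelled out explicitly:
# splits = "2-21|22|23"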
## The Standard Penn Treebank dataset =============================================================================
## - Description: The standard Penn Treebank corpus with a 10,000-word vocabulary.
## - Source: http://www.fit.vutbr.cz/~imikolov/rnnlm/
## Mikolov Tomáš: Statistical Language Models based on Neural Networks. PhD thesis, Brno University of Technology, 2012.
## - Supported Tag Types: "mlm"
##
## :param base_path: Base path for flair storage. The default setting is usually fine.
## :param in_memory: If True, keeps dataset in memory giving speedups in training.
##
# [Corpus.StandardPTBCorpus]
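## A minimal commented-out example using the documented in_memory parameter (optional; the default is usually fine):
# in_memory = true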
## The BLLIP 1987-89 WSJ Corpus Release 1 dataset =================================================================
## - Description: The BLLIP dataset from LDC. You have to manually download the dataset from LDC and set base_path as
## the path to the tar file.
## - Source: https://catalog.ldc.upenn.edu/LDC2000T43
## - Supported Tag Types: "mlm", "pos"
##
## :param base_path: Path to the tar file downloaded from LDC.
## :param in_memory: If True, keeps the dataset in memory, giving speedups in training. If the memory is not
## large enough to hold the entire dataset, set it to False.
## :param mode: Which split to use. Default: 'XS'. Options:
## - XS: train 40k, dev 20k, test 20k
## - SM: train 200k, dev 20k, test 20k
## - MD: train 600k, dev 20k, test 20k
## - LG: train 1756k, dev 20k, test 20k
## - FULL: train 81%, dev 9%, test 10%
## - CUSTOM: defined with parameters.
## :param train_set_size: Number of sentences in the randomly split training set. Useful only if mode is CUSTOM.
## :param dev_set_size: Number of sentences in the randomly split dev set. Useful only if mode is CUSTOM.
## :param test_set_size: Number of sentences in the randomly split test set. Useful only if mode is CUSTOM.
## :param use_cache: Cache the dataset so that subsequent reads are much faster. Default: True
##
# [Corpus.BLLIPCorpus]
# base_path = "/path/to/bliip_87_89_wsj_LDC2000T43.tgz"
# in_memory = true
# mode = 'XS'
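## A sketch of the CUSTOM mode described above; the sentence counts below are purely illustrative:
# mode = 'CUSTOM'
# train_set_size = 40000
# dev_set_size = 20000
# test_set_size = 20000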
## The BLLIP 1987-89 WSJ Corpus Release 1 dataset =================================================================
## - Description: The BLLIP dataset from LDC, with options to do subword tokenization. You have to manually
## download the dataset from LDC and set base_path as the path to the tar file.
## - Source: https://catalog.ldc.upenn.edu/LDC2000T43
## - Supported Tag Types: "mlm"
##
## :param base_path: Path to the tar file downloaded from LDC. The dataset is preprocessed and cached on the
## first load, so feel free to remove this parameter after the first run.
## :param in_memory: If True, keeps the dataset in memory, giving speedups in training. If the memory is not
## large enough to hold the entire dataset, set it to False.
## :param mode: Which split to use. Options: 'XS', 'SM', 'MD', 'LG', 'FULL'.
## :param use_subtoken: Use GPT2Tokenizer to do subword tokenization. Default: True.
## :param min_freq: The minimum frequency for a token to be included in the vocabulary. Default: 1.
##
# [Corpus.BLLIPTextCorpus]
# base_path = "/path/to/bliip_87_89_wsj_LDC2000T43.tgz"
# in_memory = true
# mode = 'XS'
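## The subword options documented above can also be set explicitly; the values below are the documented defaults:
# use_subtoken = true
# min_freq = 1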
## The BLLIP 1987-89 WSJ Corpus Release 1 dataset =================================================================
## - Description: The BLLIP corpus following the UDGN (https://aclanthology.org/2022.acl-long.327/) setting. You have to
## manually download the dataset from LDC and set base_path as the path to the tar file.
## - Source: https://catalog.ldc.upenn.edu/LDC2000T43
## - Supported Tag Types: "mlm"
##
## :param base_path: Path to the tar file downloaded from LDC. The dataset is preprocessed and cached on the
## first load, so feel free to remove this parameter after the first run.
## :param in_memory: If True, keeps the dataset in memory, giving speedups in training. If the memory is not
## large enough to hold the entire dataset, set it to False.
## :param mode: Which split to use. Options: 'XS', 'SM', 'MD', 'LG', 'FULL'.
## :param min_freq: The minimum frequency for a token to be included in the vocabulary. Following UDGN, the
## default value is 28.
##
# [Corpus.UDGNBLLIPTextCorpus]
# base_path = "/path/to/bliip_87_89_wsj_LDC2000T43.tgz"
# in_memory = true
# mode = 'XS'
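## min_freq can be overridden; the value below is the documented UDGN default:
# min_freq = 28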
## The Universal Dependency dataset ===============================================================================
## - Description: Flair dataset (https://github.com/flairNLP/flair)
## - Source: https://universaldependencies.org/
## - Supported Tag Types: "pos", "upos", ...
##
[Corpus.UD_ENGLISH]
## The CoNLL-03 dataset ===========================================================================================
## - Description: Flair dataset (https://github.com/flairNLP/flair), with added auto-download support. (Now flair
## 0.12.0 supports auto-download too.)
## - Source: https://www.clips.uantwerpen.be/conll2003/ner/
## - Supported Tag Types: "ner", ...
##
# [Corpus.AUTO_CONLL_03]
## The Stanford sentiment treebank dataset of SentEval ============================================================
## - Description: Flair dataset (https://github.com/flairNLP/flair). Sentences are classified as NEGATIVE or POSITIVE sentiment.
## - Source: https://github.com/facebookresearch/SentEval
## - Supported Tag Types: "class"
##
# [Corpus.SENTEVAL_SST_BINARY]
## The Stanford sentiment treebank dataset of SentEval ============================================================
## - Description: Flair dataset (https://github.com/flairNLP/flair). Sentences are classified into 5 sentiment classes.
## - Source: https://github.com/facebookresearch/SentEval
## - Supported Tag Types: "class"
##
# [Corpus.SENTEVAL_SST_GRANULAR]
## The COGS dataset ===============================================================================================
## - Description: COGS is a semantic parsing dataset based on a fragment of English. Originally proposed by
## Kim and Linzen (https://aclanthology.org/2020.emnlp-main.731/). This dataset is a sequence
## labeling version proposed by Ontanón et al. (https://arxiv.org/abs/2108.04378).
## - Source: https://github.com/google-research/google-research/tree/master/compositional_transformers
## - Supported Tag Types: "pos"
##
## :param dev_split: Which dev set to use. Options:
## - 'dev': Use the original in-vocab dev set.
## - 'gen': Use the out-of-vocab generated dev set.
## :param test_split: Which test set to use. Options:
## - 'test': Use the original in-vocab test set.
## - 'gen': Use the out-of-vocab generated test set.
## :param mapping: Which prediction scheme to use. For more details, refer to the paper (https://arxiv.org/abs/2108.04378).
## Options: 'classifier', 'relative', 'attention'.
##
# [Corpus.COGS_SequenceLabeling]
# mapping = 'relative'
# test_split = 'gen'
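## dev_split (documented above) can be set in the same way, e.g. to also validate on the generalization set:
# dev_split = 'gen'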
## The CFQ dataset ================================================================================================
## - Description: A dependency parsing version of the Compositional Freebase Questions (CFQ) challenge.
## CFQ was proposed by Keysers et al. (https://arxiv.org/abs/1912.09713). The dependency version was
## proposed in the paper Compositional Generalization in Dependency Parsing (Goodwin et al.,
## https://aclanthology.org/2022.acl-long.448/). Notice that this dataset is quite large.
## - Source: https://github.com/emilygoodwin/CFQ-dependencies
## - Supported Tag Types: "dependency"
##
## :param split: Dataset split. Options: 'mcd1', 'mcd2', 'mcd3', 'random'. Default: 'random'.
## * Hidden options: 'question_complexity', 'question_pattern', 'query_complexity', 'query_pattern'.
## Not recommended in this setting.
## :param use_dev: Use the official dev set. In the paper above, the authors argue that the
## original dev split has the same distribution as the test set, which leaks information
## about the test distribution. If set to False, 20% of the samples are randomly picked
## from the train set as the dev set. Notice that this may result in different splits on
## different devices, though the results should not be affected much.
## :param in_memory: If True, keeps the dataset in memory, giving speedups in training. If the memory is not
## large enough to hold the entire dataset, set it to False.
##
# [Corpus.CFQ_Dependency]
# split = "mcd1"
# use_dev = true
# in_memory = false
## Uncomment the following lines to down-sample the corpus.
# [CorpusDownSample]
# percentage = 0.1
# downsample_train = true
# downsample_dev = true
# downsample_test = false
[Embeddings]
## Embedding provided by flair; each token has a vector representation.
## Tokens whose frequency in the train set is at least min_freq form the vocabulary. All other tokens are treated as <unk>.
##
[Embeddings.OneHotEmbeddings]
embedding_length = 128
min_freq = 1
## Embedding for the MLM task. Similar to flair OneHotEmbeddings, but adds an additional <MASK> token to the vocabulary.
## :param init_strategy: Use torch.nn.init.<init_strategy> to init the embedding.
##
# [Embeddings.MLMOneHotEmbeddings]
# embedding_length = 128
# min_freq = 1
# init_strategy = "xavier_uniform_"
## Embedding for the MLM task. Uses the vocabulary provided by the corpus setting (PTB, BLLIP).
## :param with_mask: Add an additional <MASK> token to the vocabulary.
## :param init_strategy: Use torch.nn.init.<init_strategy> to init the embedding.
##
# [Embeddings.AutoMLMOneHotEmbeddings]
# embedding_length = 384
# with_mask = true
# init_strategy = "xavier_uniform_"
[SequenceTagger]
## Which task specific model to use for training. Options:
## - "CustomSequenceTagger": Sequence Labeling task. (tag_type: pos, ner)
## - "CustomTextClassifier": Text classification task. (tag_type: class)
## - "MaskedLanguageModel": Masked Language Model task. (tag_type: mlm)
## - "MultiLabelSequenceTagger": Multiple Sequence Labeling task. (COGS dataset)
## - "DependencyParser": Dependency Parsing task. (CFQ dataset)
##
tagger = "CustomSequenceTagger"
## (Optional) Only valid if tagger = "CustomTextClassifier"
## Add a [CLS] token at the front of each sentence, and predict the label using its representation.
## Otherwise predict the label of a sentence using the first token of the sentence.
##
# add_cls = false
## (Optional) If True, adds a trainable linear map on top of the embedding layer. If False, no map is added.
## If you set this to an integer, it controls the dimensionality of the reprojection layer.
## Used in ALBERT (Lan, Z. et al., https://arxiv.org/abs/1909.11942).
##
# reproject_embeddings = 768
## (Optional) Only valid if tagger = "MaskedLanguageModel"
## Use the embedding matrix as the final projection layer. A common regularization for the MLM task.
##
# reuse_embedding_weight = true
## Below defines the core encoder module (Probabilistic Transformer, Transformer, Universal Transformer, etc.)
## See src/models/modules for more details. All modules registered in src/models/modules/__init__.py are ready to use.
## For each module, refer to its doc string for the usage.
## Identity encoder. The word embedding is the final representation.
# [SequenceTagger.module]
# name = "Identity"
## Probabilistic Transformer. Slightly optimized for speed.
## :param d_model: dimensions of Z nodes.
## :param n_head: number of heads.
## :param n_iter: number of iterations.
## :param damping_H: damping of H nodes update. 0 means no damping is applied.
## :param damping_Z: damping of Z nodes update. 0 means no damping is applied.
## :param stepsize_H: step size of H nodes update. 1 means full update is applied.
## :param stepsize_Z: step size of Z nodes update. 1 means full update is applied.
## :param regularize_H: regularization for updating H nodes.
## :param regularize_Z: regularization for updating Z nodes.
## 'regularize_H' and 'regularize_Z' are regularizations for MFVI. See
## 'Regularized Frank-Wolfe for Dense CRFs: Generalizing Mean Field and
## Beyond' (Đ. Khuê Lê-Huu, 2021) for details.
## :param norm: normalization method. Options: 'softmax', 'relu'. Default: 'softmax'.
## :param dists: distance pattern. Each distance group will use different factors.
## dists should be groups of numbers separated by ','. Each number represents
## a separate point. Empty means all ternary factors share the same parameters.
## Note that the minimum separate point you input should be 2. Default: "".
## E.g. "" -> [1, +oo)
## "3" -> [1, 3), [3, +oo)
## "2, 4" -> [1, 2), [2, 4), [4, +oo)
## i.e. {1}, {2, 3}, [4, +oo)
## :param async_update: update the q values asynchronously (H first, then Z). Default: True.
## :param output_prob: If True, output a normalized probability distribution. Otherwise
## output unnormalized scores.
## :param use_td: control tensor decomposition. Options:
## - 'no': no tensor decomposition;
## - 'uv:{rank}': each 'head' decomposes into 2 matrices, W = U @ V. Use a
## number to set the rank, e.g. 'uv:64';
## - 'uvw:{rank}': decomposes into a sum of outer products of 3 vectors,
## W = \sum U * V * W, where * is the outer product. Use a number to set
## the rank, e.g. 'uvw:64'.
## Default: 'no'.
## :param dropout: dropout for training. Default: 0.1.
## :param block_msg: block the message passed to Z_j in factor (H_i=k, Z_i=a, Z_j=b). Default: False.
## :param use_projection: project the output using a feed-forward block, as in the Transformer. Default: False.
##
[SequenceTagger.module]
name = "HalfLazyHeadProbEncoder"
d_model = 128
n_head = 18
n_iter = 2
damping_H = 0
damping_Z = 0
stepsize_H = 1
stepsize_Z = 1
regularize_H = 1
regularize_Z = 1
dists = "2, 3, 4"
async_update = true
use_td = "no"
dropout = 0.1
block_msg = false
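## The remaining documented options can be overridden as well; for example (the values below are the defaults
## stated above):
# norm = "softmax"
# use_projection = false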
## Vanilla Transformer.
## The hyperparameter names should be easy to recognize...
## :param pos_embed: Positional Encoding. Options: 'cos', 'add'.
##
# [SequenceTagger.module]
# name = "TransformerEncoder"
# d_model = 256
# d_ff = 2048
# n_layers = 4
# n_head = 14
# d_qkv = 128
# dropout = 0.15
# pos_embed = "add"
## Transformer with RPE.
## Following Self-Attention with Relative Position Representations (https://arxiv.org/abs/1803.02155).
## :param e_length: Create (2 * e_length - 1) bins for RPE.
## :param mode: Add RPE on Q, K or V. Options: 'q', 'k', 'v', 'qk', 'kv', 'qv', 'qkv'
##
# [SequenceTagger.module]
# name = "RelativeTransformerEncoder"
# d_model = 64
# d_ff = 256
# n_layers = 2
# n_head = 4
# d_qkv = 16
# dropout = 0.1
# pos_embed = "none"
# e_length = 9
# mode = "k"
[Trainer]
## Which task specific trainer to use for training. Options:
## - "MaskedLanguageModelTrainer": Masked Language Model task. (tag_type: mlm)
## - "CustomModelTrainer": All other tasks.
##
trainer = "CustomModelTrainer"
## All the following options are consistent with a flair trainer.
## See more details in https://github.com/flairNLP/flair/blob/v0.8/flair/trainers/trainer.py#L63
##
learning_rate = 0.0062
mini_batch_size = 64
mini_batch_chunk_size = 16
max_epochs = 100
shuffle = true
anneal_factor = 0.5
patience = 5
min_learning_rate = 0.0000001
embeddings_storage_mode = 'none'
## Optimizer. torch.optim.Adam is used by default.
## Notice that changing the optimizer affects which optimizer parameters are valid. For example, torch.optim.SGD
## does not have an 'eps' parameter.
##
# optimizer = "torch.optim.SGD"
eps = 1e-9
weight_decay = 2.2e-6
## Only valid if trainer = "MaskedLanguageModelTrainer"
## Fix the masked words in the dev/test sets. Put sentences with similar lengths into the same batch to accelerate training.
##
# fix_dev_mask = true
# fix_test_mask = true
# use_bucket = false
## Add an additional regularization term to the training loss.
## It depends on the module implementation.
##
# [Trainer.add_norm]
## In Probabilistic Transformer, we sometimes add a regularization term to the ternary score tensor.
## We empirically find that it helps improve performance on MLM tasks.
##
# [Trainer.add_norm.getTernaryNorm]
# p = 2
# lambda = 4e-4