Configure the following variables for training and translation:
# the group ID for the experiment
group_id = "std"
# an ID for your experiment. Model, log and state files will be saved in: expm/data_id/group_id/run_id
run_id = "base"
# the ID of the dataset to use
data_id = "w14ende"
# training, validation and test sets, created by mktrain.sh and mktest.sh respectively.
train_data = "cache/"+data_id+"/train.h5"
dev_data = "cache/"+data_id+"/dev.h5"
test_data = "cache/"+data_id+"/test.h5"
# the saved model file to fine tune with.
fine_tune_m = None
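# e.g. (hypothetical file name, following the expm/data_id/group_id/run_id layout described above):
# fine_tune_m = "expm/w14ende/std/base/checkpoint.h5"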
# indexes that do not exist in (i.e. are forbidden for) the classifier.
# "<pad>":0, "<sos>":1, "<eos>":2, "<unk>":3
# add 3 to forbidden_indexes if there are <unk> tokens in the data
# must be None if use_fast_loss is set in cnfg/hyp.py
forbidden_indexes = None  # [0, 1]
# automatically remove the previous best train/validation model when saving the new best.
save_auto_clean = True
# allow the best performing model on the training set to overwrite the best performing model on the development set.
overwrite_eva = False
# save a checkpoint, from which you can fine-tune, every this many steps.
save_every = 1500
# maximum number of checkpoint models saved, useful for averaging or ensembling.
num_checkpoint = 4
# start saving checkpoints only after this epoch
epoch_start_checkpoint_save = 3
# perform an optimization step after the number of trained tokens exceeds "tokens_optm"; designed to effectively support large batch sizes on a single GPU.
tokens_optm = 25000
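# illustrative arithmetic: with max_tokens_gpu = 6144 (configured in the hyp.py settings below), roughly 25000 / 6144, i.e. 4 to 5, forward/backward passes are accumulated before each optimization step.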
# number of consecutive epochs without a smaller validation loss before training is stopped early.
earlystop = 8
# maximum training epochs.
maxrun = 128
# number of training steps, 300000 for transformer big.
training_steps = 100000
# report the training loss after this many optimization steps, and whether to report the evaluation result or not.
batch_report = 2000
report_eva = False
# run on GPU or not, and GPU device(s) to use. Data Parallel multi-GPU support can be enabled with values like: 'cuda:0, 1, 3'. Set gpuid to None to use all GPUs.
use_cuda = True
gpuid = 'cuda:0'
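# e.g. Data Parallel multi-GPU training, or using all GPUs (as described above):
# gpuid = 'cuda:0, 1, 3'
# gpuid = None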
# use mixed precision (FP16)
use_amp = False
# use the multi-GPU optimizer, which may bring a slight acceleration for the training of large models (e.g. deep/big Transformers) with complex optimizers (e.g. Adam).
multi_gpu_optimizer = True
# bind the embedding matrix with the classifier weight in the decoder
bindDecoderEmb = True
# share the embedding between the encoder and the decoder or not.
share_emb = False
# size of the embeddings.
isize = 512
# hidden size for the feed-forward neural networks.
ff_hsize = isize * 4
# number of heads for multi-head attention.
nhead = max(1, isize // 64)
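# with the defaults above (isize = 512): ff_hsize = 2048 and nhead = 8;
# a Transformer Big style setup would instead assume isize = 1024, giving ff_hsize = 4096 and nhead = 16.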
# hidden size for the attention model.
attn_hsize = None
# number of layers for encoder and decoder.
nlayer = 6
# dropout rate for hidden states.
drop = 0.1
# dropout rate applied to multi-head attention.
attn_drop = drop
# dropout rate applied to the activation of FFN.
act_drop = drop
# False for Hier/Incept Models
norm_output = True
# warm up steps for the training.
warm_step = 8000
# scaling factor for the learning rate
lr_scale = 1.0
# label smoothing settings for the KL divergence.
label_smoothing = 0.1
# L2 regularization; 1e-5 for datasets that are not very large, following The Best of Both Worlds: Combining Recent Advances in Neural Machine Translation
weight_decay = 0
# beam size for generating translations. Decoding of batches of data is supported, but requires more memory. Set to 1 for greedy decoding.
beam_size = 4
# length penalty applied during translation
length_penalty = 0.0
# use multiple GPUs for translation or not. When multi_gpu_decoding is set to False, "predict.py" uses the last GPU rather than the first to avoid potential out-of-memory failures, since the first GPU is the main device by default and takes on more work.
multi_gpu_decoding = False
# random seed
seed = 666666
# save a model for every epoch regardless of whether a lower loss/error rate has been reached. Useful for ensembling.
epoch_save = True
# accelerate training through data sampling; 0.8 and 0.1 in: Dynamic Sentence Sampling for Efficient Training of Neural Machine Translation
dss_ws = None
dss_rm = None
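# e.g. the values reported in the cited paper (presumably mapping to the variables in the order listed):
# dss_ws = 0.8
# dss_rm = 0.1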
# apply AMSGrad for Adam or not.
use_ams = False
# load embeddings retrieved with tools/check/ext_emb.py, and whether to update them or not
src_emb = None
freeze_srcemb = False
tgt_emb = None
freeze_tgtemb = False
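# e.g. (hypothetical paths) load pre-extracted source embeddings and keep them frozen:
# src_emb = "cache/w14ende/src_emb.h5"
# freeze_srcemb = True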
# scale down the loaded embedding by sqrt(isize) or not; defaults to True to make the positional embedding meaningful at the beginning of training.
scale_down_emb = True
# training state file to resume training.
train_statesf = None
# save the training state or not
save_train_state = False
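A minimal usage sketch (assuming the cnfg/base.py module layout implied by the references in this document): a script can import these settings and derive the save directory described above.
import os

import cnfg.base as cnfg

# model, log and state files are saved under expm/data_id/group_id/run_id (see the run_id comment above)
save_dir = os.path.join("expm", cnfg.data_id, cnfg.group_id, cnfg.run_id)
print(save_dir)  # expm/w14ende/std/base with the defaults above
print(cnfg.train_data)  # cache/w14ende/train.h5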
Configure the following variables in cnfg/hyp.py:
# reducing the optimization difficulty of models
ease_optimization = True
# using Lipschitz constrained parameter initialization from [Lipschitz Constrained Parameter Initialization for Deep Transformers](https://aclanthology.org/2020.acl-main.38/)
lipschitz_initialization = True
# using advanced activation function, choices: None, "GeLU", "Swish", "Sigmoid", "NormSwish"
advance_activation_function = None
# using a GLU activation function for the FFN, choices: None, "GLU", or one of the activation functions above.
use_glu_ffn = None
# computation order in Transformer sub-layers, choices: "v1", "v2"
computation_order = "v2"
# default cached sequence length (for positional embedding, etc.)
cache_len_default = 256
# window size (one side) of relative positional embeddings, 0 to disable. 8 and 16 are used in [Self-Attention with Relative Position Representations](https://aclanthology.org/N18-2074/) for Transformer Base and Big respectively.
use_k_relative_position = 0
# maximum bucket distance for the bucketed relative positional encoding used by T5, [Exploring the Limits of Transfer Learning with a Unified Text-to-Text Transformer](https://www.jmlr.org/papers/v21/20-074.html), which slightly hampers the performance on WMT 14 En-De.
relative_position_max_bucket_distance = 0
# set disable_std_pemb to disable the standard positional embedding when using relative positions, or to a tuple (False, True,) to disable only the decoder side, useful for AAN.
disable_std_pemb = False
# using the fast implementation of the label smoothing loss, which cannot exclude the negative impact of special tokens, like <pad>, on training. `forbidden_indexes` in `cnfg/base.py` must be set to None to enable it.
use_fast_loss = True
# configure the maximum batch size w.r.t. GPU memory
max_tokens_gpu = 6144
max_sentences_gpu = max_tokens_gpu // 6
max_pad_tokens_sentence = 32
normal_tokens_vs_pad_tokens = 4
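# with the defaults above: max_sentences_gpu = 6144 // 6 = 1024 sentences per batch at most.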
# for BPE (using the full vocabulary), the special <unk> token will never appear and thus can be removed from the vocabulary; otherwise, use_unk should be set to True.
use_unk = True
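# e.g. with a full BPE vocabulary (no <unk> in the data, as described above):
# use_unk = False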
# learning rate, overridden by the GoogleLR scheduler in most cases
init_lr = 1e-4
# enable tqdm progress bar.
enable_tqdm = True
# trade CPU for IO and disk space, see [gzip](https://docs.python.org/3/library/gzip.html) and [h5py](http://docs.h5py.org/en/stable/high/dataset.html) for details.
raw_cache_compression_level = 9
# choices: None, "gzip", "lzf"
hdf5_data_compression = "gzip"
# choices: 0 to 9, default is 4. None for lzf.
hdf5_data_compression_level = 9
hdf5_model_compression = None
hdf5_model_compression_level = 0
# using the latest HDF5 version for its advantages even though this forgoes compatibility, see [h5py.File](https://docs.h5py.org/en/stable/high/file.html#version-bounding) for details.
hdf5_perf_over_camp = True
# whether to track creation order.
hdf5_track_order = False
# whether model parameter names exist in saved/loaded model files (i.e. save `named_parameters` or `parameters`)
hdf5_load_parameter_name = hdf5_save_parameter_name = False
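# a minimal sketch (not the toolkit's own code) of how the HDF5 settings above map onto h5py calls;
# the file path, dataset name and data are hypothetical.
import numpy as np
import h5py

with h5py.File("example.h5", "w", libver="latest" if hdf5_perf_over_camp else None, track_order=hdf5_track_order) as f:
    f.create_dataset("data", data=np.zeros((4, 4), dtype=np.int32), compression=hdf5_data_compression, compression_opts=hdf5_data_compression_level)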
# prune with length penalty in each beam decoding step
clip_beam_with_lp = True
# optimize for speed even if it sacrifices reproducibility
performance_over_reproduction = True
# use torch.inference_mode if supported
use_inference_mode = True
# enable torch checks; currently only anomaly detection for the autograd engine is supported.
enable_torch_check = True
# accelerate the optimizer by using contiguous parameters and gradients. Disabling it leads to better performance.
contiguous_parameters = False
# the number of checkpoints kept for `cnfg.save_auto_clean`
n_keep_best = 1
# use C backend. Disabling it leads to better performance.
use_c_backend = False
Additional configurations for dynamic batch sizes:
# If the angle change is greater than or equal to the minimum value in the history * dyn_tol_alpha, perform an optimization step.
dyn_tol_alpha = 1.1
# If no smaller angle change is obtained after this number of steps, perform an optimization step.
dyn_tol_amin = 3
# override the maximum tokens per batch configuration in `cnfg/base.py`. If there are no fewer than this number of tokens in a batch, an optimization step will be performed.
tokens_optm = tokens_optm * 10
# perform optimization step only in case the angle change is smaller than update_angle.
update_angle = 90.0 / dyn_tol_alpha
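# with the defaults above: tokens_optm = 25000 * 10 = 250000 and update_angle = 90.0 / 1.1 ≈ 81.8 degrees.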
# number of records of the angle change reduction.
num_dynb_his = 50
# hyper-parameter for parameter sampling. Ignored when using softmax over the normalized angle change reduction (the default). Uncomment the corresponding lines in `utils/dynbatch.py` to enable it.
select_alpha = 3.0
Additional configurations for context-aware models:
# number of previous context sentences utilized
num_prev_sent = 2
# freeze the loaded sentence-level model
freeze_load_model = True
# unfreeze the bias and the weight matrix of the classifier of the sentence-level model
unfreeze_bias = True
unfreeze_weight = False
# number of layers for context encoding
num_layer_context = 1
Configuration of special token IDs:
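# the IDs referenced elsewhere in this document (see the forbidden_indexes comment above):
# "<pad>": 0, "<sos>": 1, "<eos>": 2, "<unk>": 3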