diff --git a/examples/aquila/conf/train/train_aquila_7b.yaml b/examples/aquila/conf/train/train_aquila_7b.yaml
index 0c7fe2c5..1210deef 100644
--- a/examples/aquila/conf/train/train_aquila_7b.yaml
+++ b/examples/aquila/conf/train/train_aquila_7b.yaml
@@ -1,11 +1,11 @@
 system:
   tensor_model_parallel_size: 1
-  pipeline_model_parallel_size: 2
+  pipeline_model_parallel_size: 1
   disable_bias_linear: True
   use_flash_attn: True
   use_distributed_optimizer: True
   precision:
-    bf16: True
+    fp16: True
     initial_loss_scale: 522893
     min_loss_scale: 1.0
     attention_softmax_in_fp32: True
@@ -19,11 +19,11 @@ system:
     save_interval: 2000
 
 model:
-  num_layers: 12
-  hidden_size: 2048
+  num_layers: 32
+  hidden_size: 4096
   num_attention_heads: 32
-  seq_length: 4096
-  max_position_embeddings: 4096
+  seq_length: 2048
+  max_position_embeddings: 2048
   norm_epsilon: 1e-5
   use_rotary_position_embeddings: true
   no_position_embedding: true
@@ -37,10 +37,10 @@ model:
   hidden_dropout: 0.0
   weight_decay: 0.1
   clip_grad: 1.0
-  train_samples: 1000
+  train_samples: 1002539063
   eval_iters: 0
   micro_batch_size: 2
-  global_batch_size: 16
+  global_batch_size: 1728
   seed: 1234
 
   optimizer:
@@ -50,14 +50,15 @@ model:
     lr_scheduler:
       lr: 2.0e-5
      min_lr: 2.0e-6
-      lr_warmup_samples: 10
+      lr_warmup_samples: 3076172
       lr_decay_style: cosine
 
 data:
-  data_path: ./build/data/pile_wikipedia_demo
+  data_path: ${data_path:??}
   split: 1
   tokenizer:
-    tokenizer_type: QwenTokenizerFS
-    tokenizer_path: examples/aquila/qwentokenizer
-    vocab_size: 151851
-    make_vocab_size_divisible_by: 64
\ No newline at end of file
+    tokenizer_type: AquilaTokenizerFS
+    vocab_file: ./examples/aquila/tokenizer/vocab.json
+    merge_file: ./examples/aquila/tokenizer/merges.txt
+    special_tokens_file: ./examples/aquila/tokenizer/special_tokens.txt
+    vocab_size: 100008
\ No newline at end of file