From 9ffba0f180bc9969cd6e2193b465111700e4cef1 Mon Sep 17 00:00:00 2001
From: lzy-dev
Date: Thu, 26 Dec 2024 11:15:28 +0800
Subject: [PATCH] reset yaml

---
 examples/aquila/conf/config.yaml                |  4 +--
 .../aquila/conf/train/train_aquila_3b.yaml      | 30 +++++++++----------
 .../aquila/conf/train/train_aquila_7b.yaml      |  4 ---
 3 files changed, 16 insertions(+), 22 deletions(-)

diff --git a/examples/aquila/conf/config.yaml b/examples/aquila/conf/config.yaml
index 5d0cf7f54..d6ec0c544 100644
--- a/examples/aquila/conf/config.yaml
+++ b/examples/aquila/conf/config.yaml
@@ -9,8 +9,8 @@ experiment:
     type: train
     backend: megatron
     entrypoint: ./flagscale/train/train_aquila.py
-  cmds:
-    before_start: source /root/miniconda3/bin/activate flagscale
+  # cmds:
+  #   before_start: source /root/miniconda3/bin/activate flagscale
   runner:
     backend: torchrun
     nnodes: 1
diff --git a/examples/aquila/conf/train/train_aquila_3b.yaml b/examples/aquila/conf/train/train_aquila_3b.yaml
index 38cec612e..4d4e1ef61 100644
--- a/examples/aquila/conf/train/train_aquila_3b.yaml
+++ b/examples/aquila/conf/train/train_aquila_3b.yaml
@@ -26,20 +26,19 @@ system:
     ckpt_format: torch
     save_interval: 2385
-  hetero:
-    enable_hetero: True
-    hetero_use_cpu_communication: False
-    use_partional_reduce_for_shared_embedding: False
-    # mesh format [tp1,cp1,ep1,dp1,pp1,(tp2,cp2...)]
+  # hetero:
+  #   enable_hetero: True
+  #   hetero_use_cpu_communication: False
+  #   use_partional_reduce_for_shared_embedding: True
+  #   # mesh format [tp1,cp1,ep1,dp1,pp1,(tp2,cp2...)]
 
-    hetero_pipeline_layer_split: [12,12]
-    hetero_process_meshes: [1,1,1,4,1, 1,1,1,4,1]
-    hetero_device_types: ["A800", "A800"]
+  #   hetero_pipeline_layer_split: [12,12]
+  #   hetero_process_meshes: [1,1,1,4,1, 1,1,1,4,1]
+  #   hetero_device_types: ["A800", "A800"]
 
-    standalone_embedding_stage: False
-    hetero_current_device_type: "A800"
+  #   standalone_embedding_stage: False
+  #   hetero_current_device_type: "A800"
 
 model:
-  # use_mcore_models: True # deprecated
   transformer_impl: transformer_engine
   num_layers: 24
   hidden_size: 1024
@@ -63,10 +62,10 @@ model:
   hidden_dropout: 0.0
   weight_decay: 0.1
   clip_grad: 1.0
-  train_samples: 6400 #120B tokens
+  train_samples: 29297664 #120B tokens
   eval_iters: 0
-  micro_batch_size: 4
-  global_batch_size: 64
+  micro_batch_size: 2
+  global_batch_size: 1024
   seed: 42
 
   optimizer:
@@ -80,8 +79,7 @@ model:
     lr_decay_style: cosine
 
 data:
-  # data_path: [40,/mnt/share/hetero_data/datasets/fineweb-edu-CC-3_5_text_document,28,/mnt/share/hetero_data/datasets/dclm-baseline-1.0-top_pct5_text_document,3,/mnt/share/hetero_data/datasets/k73_edu_qwen_text_document,5,/mnt/share/hetero_data/datasets/wxb_edu_qwen_text_document,20,/mnt/share/hetero_data/datasets/cosmopedia-v2-full_text_document,4,/mnt/share/hetero_data/datasets/infinst-kg-0712_text_document]
-  data_path: build/data/pile_wikipedia_demo
+  data_path: ${data_path:??}
   split: 1
   no_mmap_bin_files: true
   tokenizer:
diff --git a/examples/aquila/conf/train/train_aquila_7b.yaml b/examples/aquila/conf/train/train_aquila_7b.yaml
index 75bda1a77..1210deef4 100644
--- a/examples/aquila/conf/train/train_aquila_7b.yaml
+++ b/examples/aquila/conf/train/train_aquila_7b.yaml
@@ -1,8 +1,6 @@
 system:
   tensor_model_parallel_size: 1
   pipeline_model_parallel_size: 1
-  context_parallel_size: 1
-  ulysses_sp_parallel_size: 4
   disable_bias_linear: True
   use_flash_attn: True
   use_distributed_optimizer: True
@@ -26,7 +24,6 @@ model:
   num_attention_heads: 32
   seq_length: 2048
   max_position_embeddings: 2048
-  max_position_embeddings: 2048
   norm_epsilon: 1e-5
   use_rotary_position_embeddings: true
   no_position_embedding: true
@@ -58,7 +55,6 @@ model:
 
 data:
   data_path: ${data_path:??}
-  data_path: ./build/data/pile_wikipedia_demo
   split: 1
   tokenizer:
     tokenizer_type: AquilaTokenizerFS
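
Note: after this reset, neither train config ships a concrete dataset; `data_path: ${data_path:??}` appears to be a mandatory placeholder the caller must fill in at launch time. A minimal sketch of pinning the 3b config back to a local dataset instead, reusing the pre-reset demo path this patch removes (shown for illustration only, not part of the patch):

  # examples/aquila/conf/train/train_aquila_3b.yaml (data section)
  data:
    data_path: ./build/data/pile_wikipedia_demo   # pre-reset demo value; replace with your own dataset prefix
    split: 1
    no_mmap_bin_files: true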