# llama3-70b-fp8.yaml
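# Training configuration (NeMo framework launcher layout: run / trainer /
# exp_manager / model) for Llama 3 70B pre-training with FP8 via Transformer
# Engine: 80 layers, hidden size 8192, GQA with 8 KV groups, 8192-token
# sequences, bf16 base precision, 50-step benchmark-style run.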
run:
  name: llama3-70b-fp8
  results_dir:
  time_limit: 01:00:00
  dependency: singleton
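# PyTorch Lightning trainer settings: 8 GPUs per node, bf16 at the trainer
# level (FP8 is enabled per-layer under model.fp8 below), 50 training steps,
# gradient clipping at 1.0, checkpointing and the default logger disabled.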
trainer:
  devices: 8
  accelerator: gpu
  precision: bf16
  logger: false
  enable_checkpointing: false
  use_distributed_sampler: false
  max_epochs: null
  max_steps: 50
  max_time: 05:23:30:00
  log_every_n_steps: 1
  val_check_interval: 400
  limit_val_batches: 5
  limit_test_batches: 5
  accumulate_grad_batches: 1
  gradient_clip_val: 1.0
  enable_model_summary: false
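# NeMo experiment manager: W&B and checkpoint saving are disabled for this
# benchmark run; per-step timings are logged through dllogger to stdout.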
exp_manager:
  exp_dir:
  explicit_log_dir:
  name: megatron_gpt
  create_wandb_logger: false
  create_dllogger_logger: true
  dllogger_logger_kwargs:
    verbose: true
    stdout: true
  resume_if_exists: false
  resume_ignore_no_checkpoint: true
  create_checkpoint_callback: false
  checkpoint_callback_params:
    monitor: val_loss
    save_top_k: 10
    mode: min
    always_save_nemo: false
    save_nemo_on_train_end: false
    filename: megatron_llama--{val_loss:.2f}-{step}-{consumed_samples}
    model_parallel_size: 4
  log_step_timing: true
  step_timing_kwargs:
    sync_cuda: true
    buffer_size: 5
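# Model: Llama 3 70B architecture (80 layers, hidden 8192, FFN 28672, 64
# attention heads with 8 KV groups/GQA, RoPE base 1e6, 8192-token context).
# Parallelism layout: TP=4, PP=4 with virtual PP=5, CP=2, sequence parallelism
# on; global batch 1024 at micro batch 1.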
model:
  mcore_gpt: true
  moe_grouped_gemm: true
  moe_token_dispatcher_type: alltoall
  moe_aux_loss_coeff: 0.01
  micro_batch_size: 1
  global_batch_size: 1024
  rampup_batch_size: null
  tensor_model_parallel_size: 4
  pipeline_model_parallel_size: 4
  virtual_pipeline_model_parallel_size: 5
  context_parallel_size: 2
  encoder_seq_length: 8192
  max_position_embeddings: 8192
  num_layers: 80
  hidden_size: 8192
  ffn_hidden_size: 28672
  num_attention_heads: 64
  init_method_std: 0.008944
  use_scaled_init_method: true
  hidden_dropout: 0.0
  attention_dropout: 0.0
  ffn_dropout: 0
  kv_channels: null
  apply_query_key_layer_scaling: true
  normalization: rmsnorm
  layernorm_epsilon: 1.0e-05
  do_layer_norm_weight_decay: false
  make_vocab_size_divisible_by: 128
  pre_process: true
  post_process: true
  persist_layer_norm: true
  bias: false
  activation: fast-swiglu
  headscale: false
  transformer_block_type: pre_ln
  openai_gelu: false
  normalize_attention_scores: true
  position_embedding_type: rope
  apply_rope_fusion: true
  rotary_percentage: 1.0
  rotary_base: 1000000.0
  attention_type: multihead
  share_embeddings_and_output_weights: false
  overlap_p2p_comm: true
  batch_p2p_comm: false
  seq_len_interpolation_factor: null
  num_query_groups: 8
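  # Tokenizer: GPT-2 BPE via Megatron (matching the pre-tokenized Wikipedia
  # corpus under model.data below), not the Llama tokenizer.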
  tokenizer:
    library: megatron
    type: GPT2BPETokenizer
    model: null
    delimiter: null
    vocab_file: gpt2-vocab.json
    merge_file: gpt2-merges.txt
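  # Mixed precision and kernel fusions: megatron_amp_O2 keeps model weights in
  # bf16 with fp32 main parameters in the optimizer; gradient-accumulation,
  # bias+activation and bias+dropout+add fusions are enabled, and activation
  # checkpointing is left off (all activations_checkpoint_* set to null).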
  native_amp_init_scale: 4294967296
  native_amp_growth_interval: 1000
  hysteresis: 2
  fp32_residual_connection: false
  fp16_lm_cross_entropy: false
  megatron_amp_O2: true
  grad_allreduce_chunk_size_mb: 32
  grad_div_ar_fusion: true
  gradient_accumulation_fusion: true
  bias_activation_fusion: true
  bias_dropout_add_fusion: true
  masked_softmax_fusion: false
  get_attention_mask_from_fusion: true
  seed: 1234
  resume_from_checkpoint: null
  use_cpu_initialization: false
  onnx_safe: false
  apex_transformer_log_level: 30
  gradient_as_bucket_view: true
  sync_batch_comm: false
  activations_checkpoint_granularity: null
  activations_checkpoint_method: null
  activations_checkpoint_num_layers: null
  num_micro_batches_with_partial_activation_checkpoints: null
  activations_checkpoint_layers_per_pipeline: null
  sequence_parallel: true
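  # Transformer Engine FP8: fp8_hybrid selects the E4M3-forward / E5M2-backward
  # recipe; scaling factors come from a 1024-step amax history with the max
  # algorithm. Flash attention is on; userbuffer TP-comm overlap is off.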
  transformer_engine: true
  fp8: true
  fp8_e4m3: false
  fp8_hybrid: true
  fp8_margin: 0
  fp8_interval: 1
  fp8_amax_history_len: 1024
  fp8_amax_compute_algo: max
  reduce_amax: true
  use_emha: false
  ub_tp_comm_overlap: false
  ub_tp_comm_overlap_cfg: null
  use_flash_attention: true
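  # Nsight Systems profiling hook (disabled): would capture steps 41-43 on rank 0.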
  nsys_profile:
    enabled: false
    start_step: 41
    end_step: 43
    ranks:
      - 0
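  # Optimizer: Megatron-core distributed optimizer (lr 1.5e-4, weight decay 0.1,
  # Adam-style betas 0.9/0.95) with cosine-annealing schedule; bf16 gradient
  # sync and parameter sync overlapped with compute, 128 MiB DDP buckets.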
  optim:
    name: mcore_distributed_optim
    lr: 0.00015
    weight_decay: 0.1
    betas:
      - 0.9
      - 0.95
    sched:
      name: CosineAnnealing
      warmup_steps: 636
      constant_steps: 11873
      min_lr: 1.0e-05
    grad_sync_dtype: bf16
    overlap_grad_sync: true
    overlap_param_sync: true
    contiguous_grad_buffer: true
    contiguous_param_buffer: true
    ddp_bucket_size: 134217728
  gc_interval: 0
  precision: bf16
  mcore_customization_config:
    new_decoder_architecture: false
    parallel_attention: false
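  # Data: mmap-backed GPT dataset with 90/8/2 train/val/test splits. The
  # pre-tokenized Wikipedia corpus is staged from the GCS bucket to a local SSD
  # cache; data_prefix pairs a blend weight (1.0) with the cached dataset path.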
  data:
    data_impl: mmap
    splits_string: 90,8,2
    seq_length: 8192
    skip_warmup: true
    num_workers: 2
    dataloader_type: single
    reset_position_ids: false
    reset_attention_mask: false
    eod_mask_loss: false
    index_mapping_dir: "/gcs/index_mapping_dir"
    data_cache_gcs_dir: gs://nemo-megatron-demo/training-data/tokenized/bpe2gpt/wikipedia/
    data_cache_local_dir: /ssd/.cache
    data_prefix:
      - 1.0
      - /ssd/.cache/wikipedia-tokenized-for-gpt2