An error is raised when running SFT fine-tuning of Llama3-8B:
Traceback (most recent call last):
File "/home/LAB/huangjx/new/PaddleNLP/llm/run_finetune.py", line 730, in <module>
main()
File "/home/LAB/huangjx/new/PaddleNLP/llm/run_finetune.py", line 570, in main
train_result = trainer.train(resume_from_checkpoint=checkpoint)
File "/home/LAB/huangjx/.local/lib/python3.10/site-packages/paddlenlp/trainer/trainer.py", line 829, in train
return self._inner_training_loop(
File "/home/LAB/huangjx/.local/lib/python3.10/site-packages/paddlenlp/trainer/trainer.py", line 1203, in _inner_training_loop
self._maybe_log_save_evaluate(tr_loss, model, epoch, ignore_keys_for_eval, inputs=inputs)
File "/home/LAB/huangjx/.local/lib/python3.10/site-packages/paddlenlp/trainer/trainer.py", line 1478, in _maybe_log_save_evaluate
self._save_checkpoint(model, metrics=metrics)
File "/home/LAB/huangjx/.local/lib/python3.10/site-packages/paddlenlp/trainer/trainer.py", line 2460, in _save_checkpoint
metric_value = metrics[metric_to_check]
KeyError: 'eval_accuracy'
However, if I delete "metric_for_best_model": "accuracy" from the config, the error no longer occurs, so it looks like "metric_for_best_model": "accuracy" is not supported. Pipeline parallelism (pp) and tensor parallelism (tp) were both enabled for this run.
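For context, this matches the lookup in the traceback: `_save_checkpoint` reads `metrics[metric_to_check]`, and the configured name "accuracy" is resolved to the key "eval_accuracy", which the evaluation loop never produced. Below is a minimal sketch of that failure mode; the helper name `pick_best_metric` and the `eval_` prefixing shown here are simplifications for illustration, not the actual PaddleNLP source.

```python
# Sketch of the metric lookup that fails during checkpoint saving.
# Assumption: the trainer prefixes the configured metric with "eval_" before
# indexing the metrics dict, as suggested by KeyError: 'eval_accuracy'.
def pick_best_metric(metrics: dict, metric_for_best_model: str) -> float:
    metric_to_check = metric_for_best_model
    if not metric_to_check.startswith("eval_"):
        metric_to_check = f"eval_{metric_to_check}"
    # Raises KeyError when evaluation never reported that key.
    return metrics[metric_to_check]

# Example: the eval loop only reported a loss, so "accuracy" cannot be found.
eval_metrics = {"eval_loss": 2.31}
try:
    pick_best_metric(eval_metrics, "accuracy")
except KeyError as err:
    print("missing metric:", err)  # missing metric: 'eval_accuracy'
```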
Software environment
Duplicate issue check
Error description
Steps to reproduce & code
{
"model_name_or_path": "meta-llama/Meta-Llama-3-8B",
"dataset_name_or_path": "./data",
"output_dir": "./checkpoints/llama_sft_ckpts",
"per_device_train_batch_size": 1,
"gradient_accumulation_steps": 1,
"per_device_eval_batch_size": 1,
"eval_accumulation_steps": 1,
"num_train_epochs": 3,
"learning_rate": 3e-05,
"warmup_steps": 30,
"max_steps": 20,
"max_evaluate_steps": 3,
"logging_steps": 1,
"evaluation_strategy": "epoch",
"save_strategy": "epoch",
"src_length": 1024,
"max_length": 200,
"do_train": true,
"do_eval": true,
"disable_tqdm": true,
"load_best_model_at_end": true,
"eval_with_do_generation": false,
"metric_for_best_model": "accuracy",
"recompute": true,
"save_total_limit": 1,
"tensor_parallel_degree": 2,
"pipeline_parallel_degree": 2,
"pipeline_parallel_config": "disable_p2p_cache_shape",
"sharding": "stage2",
"zero_padding": false,
"unified_checkpoint": false,
"use_flash_attention": false
}
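For reference, the single change to the config above that avoided the crash, as described in the error description, is dropping the metric selection line (whether best-model selection then falls back to a default metric is not verified here):

```diff
-    "metric_for_best_model": "accuracy",
```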