Fine-tuning bge-m3 hangs and makes no progress #1302

Open
yichuxue opened this issue Dec 26, 2024 · 0 comments
My training script:

CUDA_VISIBLE_DEVICES=0,1 torchrun --nproc_per_node 2 \
    -m FlagEmbedding.finetune.embedder.encoder_only.m3 \
    --model_name_or_path BAAI/bge-m3 \
    --train_data /mnt/sda/app/embedding/train_data/embedding/20241024 \
    --output_dir /mnt/sda/app/embedding/train_out/embedding/20241024 \
    --train_group_size 8 \
    --query_max_len 256 \
    --passage_max_len 2048 \
    --pad_to_multiple_of 8 \
    --knowledge_distillation True \
    --same_dataset_within_batch True \
    --small_threshold 0 \
    --drop_threshold 0 \
    --overwrite_output_dir \
    --learning_rate 1e-5 \
    --fp16 \
    --num_train_epochs 2 \
    --per_device_train_batch_size 2 \
    --dataloader_drop_last True \
    --warmup_ratio 0.1 \
    --gradient_checkpointing \
    --logging_steps 1 \
    --save_steps 1000 \
    --negatives_cross_device \
    --temperature 0.02 \
    --sentence_pooling_method cls \
    --normalize_embeddings True \
    --kd_loss_type m3_kd_loss \
    --unified_finetuning True \
    --use_self_distill True \
    --fix_encoder False \
    --self_distill_start_step 0
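
For context, here is a minimal sketch of what one line of the training jsonl is assumed to look like for this run. The query/pos/neg keys follow the usual FlagEmbedding fine-tuning data format; since --knowledge_distillation True is set, per-passage teacher scores (pos_scores/neg_scores) are assumed to be present as well.

import json

# Sketch of a single training record (assumed format, not taken from the actual data).
# With --knowledge_distillation True, teacher scores are expected alongside each
# positive and negative passage.
record = {
    "query": "example query text",
    "pos": ["a relevant passage"],
    "neg": ["an irrelevant passage 1", "an irrelevant passage 2"],
    "pos_scores": [0.95],        # one teacher score per positive passage
    "neg_scores": [0.10, 0.05],  # one teacher score per negative passage
}

with open("train.jsonl", "a", encoding="utf-8") as f:
    f.write(json.dumps(record, ensure_ascii=False) + "\n")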

Training log:

W1226 10:36:50.837000 713 site-packages/torch/distributed/run.py:793] 
W1226 10:36:50.837000 713 site-packages/torch/distributed/run.py:793] *****************************************
W1226 10:36:50.837000 713 site-packages/torch/distributed/run.py:793] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W1226 10:36:50.837000 713 site-packages/torch/distributed/run.py:793] *****************************************
12/26/2024 10:36:53 - WARNING - FlagEmbedding.abc.finetune.embedder.AbsRunner -   Process rank: 0, device: cuda:0, n_gpu: 1, distributed training: True, 16-bits training: True
12/26/2024 10:36:53 - INFO - FlagEmbedding.abc.finetune.embedder.AbsRunner -   Training/evaluation parameters EncoderOnlyEmbedderM3TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=True,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,
eval_on_start=False,
eval_steps=None,
eval_strategy=no,
eval_use_gather_object=False,
evaluation_strategy=None,
fix_encoder=False,
fix_position_embedding=False,
fp16=True,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'min_num_params': 0, 'xla': False, 'xla_fsdp_v2': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=True,
gradient_checkpointing_kwargs=None,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_always_push=False,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=<HUB_TOKEN>,
ignore_data_skip=False,
include_inputs_for_metrics=False,
include_num_input_tokens_seen=False,
include_tokens_per_second=False,
jit_mode_eval=False,
kd_loss_type=m3_kd_loss,
label_names=None,
label_smoothing_factor=0.0,
learning_rate=1e-05,
length_column_name=length,
load_best_model_at_end=False,
local_rank=0,
log_level=passive,
log_level_replica=warning,
log_on_each_node=True,
logging_dir=/mnt/sda/app/embedding/train_out/embedding/20241024/runs/Dec26_10-36-53_d03ea0239d59,
logging_first_step=False,
logging_nan_inf_filter=True,
logging_steps=1.0,
logging_strategy=steps,
lr_scheduler_kwargs={},
lr_scheduler_type=linear,
max_grad_norm=1.0,
max_steps=-1,
metric_for_best_model=None,
mp_parameters=,
neftune_noise_alpha=None,
negatives_cross_device=True,
no_cuda=False,
normalize_embeddings=True,
num_train_epochs=2.0,
optim=adamw_torch,
optim_args=None,
optim_target_modules=None,
output_dir=/mnt/sda/app/embedding/train_out/embedding/20241024,
overwrite_output_dir=True,
past_index=-1,
per_device_eval_batch_size=8,
per_device_train_batch_size=2,
prediction_loss_only=False,
push_to_hub=False,
push_to_hub_model_id=None,
push_to_hub_organization=None,
push_to_hub_token=<PUSH_TO_HUB_TOKEN>,
ray_scope=last,
remove_unused_columns=True,
report_to=[],
restore_callback_states_from_checkpoint=False,
resume_from_checkpoint=None,
run_name=/mnt/sda/app/embedding/train_out/embedding/20241024,
save_on_each_node=False,
save_only_model=False,
save_safetensors=True,
save_steps=1000,
save_strategy=steps,
save_total_limit=None,
seed=42,
self_distill_start_step=0,
sentence_pooling_method=cls,
skip_memory_metrics=True,
split_batches=None,
sub_batch_size=None,
temperature=0.02,
tf32=None,
torch_compile=False,
torch_compile_backend=None,
torch_compile_mode=None,
torch_empty_cache_steps=None,
torchdynamo=None,
tpu_metrics_debug=False,
tpu_num_cores=None,
unified_finetuning=True,
use_cpu=False,
use_ipex=False,
use_legacy_prediction_loop=False,
use_mps_device=False,
use_self_distill=True,
warmup_ratio=0.1,
warmup_steps=0,
weight_decay=0.0,
)
12/26/2024 10:36:53 - INFO - FlagEmbedding.abc.finetune.embedder.AbsRunner -   Model parameters EncoderOnlyEmbedderM3ModelArguments(model_name_or_path='BAAI/bge-m3', config_name=None, tokenizer_name=None, cache_dir=None, trust_remote_code=False, token=None, colbert_dim=-1)
12/26/2024 10:36:53 - INFO - FlagEmbedding.abc.finetune.embedder.AbsRunner -   Data parameters AbsEmbedderDataArguments(train_data=['/mnt/sda/app/embedding/train_data/embedding/20241024'], cache_path=None, train_group_size=8, query_max_len=256, passage_max_len=2048, pad_to_multiple_of=8, max_example_num_per_dataset=100000000, query_instruction_for_retrieval=None, query_instruction_format='{}{}', knowledge_distillation=True, passage_instruction_for_retrieval=None, passage_instruction_format='{}{}', shuffle_ratio=0.0, same_dataset_within_batch=True, small_threshold=0, drop_threshold=0)
12/26/2024 10:36:53 - WARNING - FlagEmbedding.abc.finetune.embedder.AbsRunner -   Process rank: 1, device: cuda:1, n_gpu: 1, distributed training: True, 16-bits training: True
12/26/2024 10:36:55 - INFO - FlagEmbedding.finetune.embedder.encoder_only.m3.runner -   Config: XLMRobertaConfig {
  "_name_or_path": "BAAI/bge-m3",
  "architectures": [
    "XLMRobertaModel"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "classifier_dropout": null,
  "eos_token_id": 2,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 1024,
  "id2label": {
    "0": "LABEL_0"
  },
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "label2id": {
    "LABEL_0": 0
  },
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8194,
  "model_type": "xlm-roberta",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_past": true,
  "pad_token_id": 1,
  "position_embedding_type": "absolute",
  "torch_dtype": "float32",
  "transformers_version": "4.44.2",
  "type_vocab_size": 1,
  "use_cache": true,
  "vocab_size": 250002
}

12/26/2024 10:36:55 - INFO - FlagEmbedding.finetune.embedder.encoder_only.m3.runner -   loading existing colbert_linear and sparse_linear---------
12/26/2024 10:36:55 - INFO - FlagEmbedding.abc.finetune.embedder.AbsDataset -   loading data from /mnt/sda/app/embedding/train_data/embedding/20241024/train.jsonl ...
12/26/2024 10:36:56 - INFO - FlagEmbedding.abc.finetune.embedder.AbsDataset -   -- Rank 0: refresh data --

It just hangs at this point and never moves on; it has been stuck for nearly half an hour, and the dataset is only about 2 MB.
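
One quick way to rule out a data problem before digging into the distributed setup is to scan every jsonl file under the train_data directory and check that each line parses and carries the expected keys. A rough diagnostic sketch (the directory path and key names are assumptions based on the arguments above):

import json, glob, os

# Rough diagnostic sketch: verify that every line of every .jsonl file under the
# training data directory parses as JSON and has the keys this run assumes
# (query/pos/neg, plus pos_scores/neg_scores because knowledge distillation is on).
data_dir = "/mnt/sda/app/embedding/train_data/embedding/20241024"
required = {"query", "pos", "neg", "pos_scores", "neg_scores"}

for path in glob.glob(os.path.join(data_dir, "*.jsonl")):
    with open(path, encoding="utf-8") as f:
        for i, line in enumerate(f, 1):
            try:
                rec = json.loads(line)
            except json.JSONDecodeError as e:
                print(f"{path}:{i} invalid JSON: {e}")
                continue
            missing = required - rec.keys()
            if missing:
                print(f"{path}:{i} missing keys: {sorted(missing)}")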
