diff --git a/examples/summarization/run_summarization.py b/examples/summarization/run_summarization.py
old mode 100644
new mode 100755
index 23be70e213..db7a4913c9
--- a/examples/summarization/run_summarization.py
+++ b/examples/summarization/run_summarization.py
@@ -764,6 +764,9 @@ def compute_metrics(eval_preds):
     else:
         training_args.generation_config.max_length = data_args.val_max_target_length
     if data_args.num_beams is not None:
+        if data_args.num_beams == 1:
+            training_args.generation_config.length_penalty = None
+            training_args.generation_config.early_stopping = False
         training_args.generation_config.num_beams = data_args.num_beams
     elif training_args.generation_num_beams is not None:
         training_args.generation_config.num_beams = training_args.generation_num_beams
diff --git a/examples/text-generation/README.md b/examples/text-generation/README.md
old mode 100644
new mode 100755
index cac7bbbe50..e020e72a79
--- a/examples/text-generation/README.md
+++ b/examples/text-generation/README.md
@@ -443,7 +443,9 @@
 More information on usage of the unifier script can be found in fp8 Habana docs:
 
 Some models can fit in HPU DRAM but not in CPU RAM. When running a model on a single card without DeepSpeed, the `--disk_offload` flag allows offloading weights to disk during model quantization in HQT. When this flag is set, each weight is first loaded from disk to CPU RAM, then brought to HPU DRAM and quantized there. This way, only one weight at a time resides in CPU RAM instead of the whole model. To enable this weight offload mechanism, add the `--disk_offload` flag to the topology command line.
-Here is an example of using disk_offload in quantize command. Please make sure to run the measurement first.
+Here is an example of using `--disk_offload` in the quantize command.
+Please follow the "Running FP8 models on single device" section before running the command below.
+
 ```bash
 QUANT_CONFIG=./quantization_config/maxabs_quant.json TQDM_DISABLE=1 \
 python run_generation.py \
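
For context on the `run_summarization.py` change: `length_penalty` and `early_stopping` only affect beam-based decoding, so leaving them set while `num_beams == 1` (greedy search) can trigger validation warnings from `transformers`. Below is a minimal sketch of the same guard using only the public `GenerationConfig` API; the helper name `sanitize_for_greedy` is hypothetical, not part of the patch.

```python
from transformers import GenerationConfig

def sanitize_for_greedy(gen_config: GenerationConfig, num_beams: int) -> GenerationConfig:
    """Drop beam-search-only knobs when greedy decoding (num_beams == 1) is requested."""
    if num_beams == 1:
        # length_penalty and early_stopping are only meaningful for beam-based
        # decoding, so clear them to avoid warnings about unused generation flags.
        gen_config.length_penalty = None
        gen_config.early_stopping = False
    gen_config.num_beams = num_beams
    return gen_config

cfg = sanitize_for_greedy(GenerationConfig(length_penalty=2.0, early_stopping=True), num_beams=1)
print(cfg.num_beams, cfg.length_penalty, cfg.early_stopping)  # 1 None False
```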
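To make the `--disk_offload` flow concrete, here is a conceptual sketch of the one-weight-at-a-time pipeline the README paragraph describes. This is not the HQT implementation: `stream_quantize` and `toy_quantize` are invented names, the real quantizer targets FP8 rather than int8, and a safetensors checkpoint is assumed.

```python
import torch
from safetensors import safe_open

def toy_quantize(w: torch.Tensor) -> torch.Tensor:
    # Toy max-abs symmetric int8 quantization; a stand-in for HQT's real FP8 quantizer.
    scale = w.abs().max().clamp(min=1e-8) / 127.0
    return (w / scale).round().clamp(-127, 127).to(torch.int8)

def stream_quantize(checkpoint_path: str, device: str = "cpu") -> dict:
    """Quantize a checkpoint one weight at a time so the full model never sits in CPU RAM."""
    quantized = {}
    with safe_open(checkpoint_path, framework="pt", device="cpu") as f:
        for name in f.keys():
            weight = f.get_tensor(name)   # disk -> CPU RAM (one tensor at a time)
            weight = weight.to(device)    # CPU RAM -> accelerator ("hpu" on Gaudi)
            quantized[name] = toy_quantize(weight)
            del weight                    # free the copy before loading the next weight
    return quantized
```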