diff --git a/docs/source/smooth_quant.md b/docs/source/smooth_quant.md
index cae9a384d75..f5741fe98cf 100644
--- a/docs/source/smooth_quant.md
+++ b/docs/source/smooth_quant.md
@@ -324,7 +324,7 @@ IPEX (Intel Extension for PyTorch): 2.0/2.1
 
 Dataset: lambada_openai
 
-Task: text-generation
+Task: text-generation provided by [ITREX](https://github.com/intel/intel-extension-for-transformers/tree/main/examples/huggingface/pytorch/text-generation/quantization)
 
 alpha [0.4, 0.6] is sweet spot region in SmoothQuant paper.
 
@@ -370,6 +370,14 @@ A list of models that achieved a <1% accuracy drop is shown below.
 | databricks/dolly-v2-3b* | 0.6297 | 0.6247 | alpha=0.5, Ipex 2.1 |
 | tiiuae/falcon-7b-instruct | 0.6437 | 0.6392 | alpha=0.7, Pytorch |
 
+The results listed below were achieved by applying IPEX `optimize_transformers` during model initialization for better performance. Please refer to the step-by-step [instructions](../../examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md) for details.
+
+| Model/Last token accuracy | FP32 Accuracy | INT8 (w/ SmoothQuant) | Notes |
+|:----------:|:------:|:------:|-----------------------------------|
+| LLaMa-2-7b-hf* | 0.7392 | 0.7332 | alpha=Auto, Ipex 2.1 |
+| LLaMa-2-13b-hf* | 0.7677 | 0.7632 | alpha=Auto, Ipex 2.1 |
+
+
 Please note that for models with asterisk(*), we have set all add ops to FP32 during quantization step to achieve desirable results.
 
 ## Example
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md
new file mode 100644
index 00000000000..bb91d20beab
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/README.md
@@ -0,0 +1,30 @@
+Step-by-Step
+============
+This document provides step-by-step instructions for running Llama-2 SmoothQuant quantization with Intel® Neural Compressor and Intel® Extension for PyTorch.
+
+# Prerequisite
+```bash
+# Install dependencies
+pip install -r requirements.txt
+```
+
+# Run Quantization
+
+## Llama-2-7b
+```bash
+python run_llama2_sq.py \
+    --model-id meta-llama/Llama-2-7b-hf \
+    --batch-size 56 \
+    --sq-recipes "llama2-7b"
+```
+## Llama-2-13b
+```bash
+python run_llama2_sq.py \
+    --model-id meta-llama/Llama-2-13b-hf \
+    --batch-size 56 \
+    --sq-recipes "llama2-13b" \
+    --padding
+```
+> Notes:
+> - The INT8 model will be saved to "./saved_results" as "./saved_results/best_configure.json" and "./saved_results/best_model.pt", which can be loaded and evaluated by IPEX.
+> - The "--sq-recipes" parameter selects the SmoothQuant recipe used for quantization; details can be found in the script.
\ No newline at end of file
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/requirements.txt b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/requirements.txt
new file mode 100644
index 00000000000..9d1ad09e95f
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/requirements.txt
@@ -0,0 +1,10 @@
+neural-compressor==2.4
+transformers==4.32.0
+datasets
+accelerate
+sentencepiece
+protobuf
+--extra-index-url https://download.pytorch.org/whl/cpu
+torch==2.1.0+cpu
+intel-extension-for-pytorch==2.1.0
+
diff --git a/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/run_llama2_sq.py b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/run_llama2_sq.py
new file mode 100644
index 00000000000..230644e492e
--- /dev/null
+++ b/examples/pytorch/nlp/huggingface_models/language-modeling/quantization/llm/ipex/run_llama2_sq.py
@@ -0,0 +1,240 @@
+import argparse
+
+from datasets import load_dataset
+from transformers import LlamaForCausalLM, LlamaTokenizer, AutoConfig
+
+import torch
+from torch.nn.functional import pad
+from torch.utils.data import DataLoader
+
+import intel_extension_for_pytorch as ipex
+
+parser = argparse.ArgumentParser('LLaMA generation script (int8 path)', add_help=False)
+
+parser.add_argument(
+    "-m", "--model-id", default=None, type=str, required=True, help="name or path of the Llama-2 model"
+)
+parser.add_argument(
+    "--sq-recipes", default=None, type=str, required=True, help="llama2-7b or llama2-13b"
+)
+parser.add_argument(
+    "--max-new-tokens", default=32, type=int, help="maximum number of new tokens to generate"
+)
+parser.add_argument("--dataset", nargs="?", default="NeelNanda/pile-10k")
+parser.add_argument("--output-dir", nargs="?", default="./saved_results")
+
+parser.add_argument(
+    "--int8-bf16-mixed",
+    action="store_true",
+    help="by default int8-fp32 mixed precision is used; set this flag to enable int8 with AMP bf16 (works on platforms like SPR)",
+)
+parser.add_argument("--input-tokens", default="32", type=str)
+parser.add_argument("--prompt", default=None, type=str)
+parser.add_argument("--padding", action="store_true", help="whether to pad inputs in the calibration dataloader")
+parser.add_argument("--batch-size", default=1, type=int, help="batch size")
+parser.add_argument("--alpha", default=0.8, type=float, help="alpha value for smoothquant")
+parser.add_argument("--greedy", action="store_true")
+
+args = parser.parse_args()
+
+try:
+    ipex._C.disable_jit_linear_repack()
+except Exception:
+    pass
+
+# amp autocast
+if args.int8_bf16_mixed:
+    amp_enabled = True
+    amp_dtype = torch.bfloat16
+else:
+    amp_enabled = False
+    amp_dtype = torch.float32
+
+num_beams = 1 if args.greedy else 4
+
+# load model
+config = AutoConfig.from_pretrained(args.model_id, torchscript=True)
+if not hasattr(config, "text_max_length") and args.prompt is None:
+    config.text_max_length = int(args.input_tokens) + int(args.max_new_tokens)
+
+user_model = LlamaForCausalLM.from_pretrained(
+    args.model_id, config=config, low_cpu_mem_usage=True, torch_dtype=torch.float
+)
+
+tokenizer = LlamaTokenizer.from_pretrained(args.model_id)
+print("Data type of the model:", user_model.dtype)
+
+# dummy past key value used to build example inputs for calibration
+beam_idx_tmp = torch.zeros(
+    (2048, int(args.batch_size * num_beams)), dtype=torch.long
+).contiguous()
+global_past_key_value = [
+    (
+        torch.zeros(1, 0, 0, 1, dtype=torch.long).contiguous(),
+        torch.zeros(
+            [
+                1,
+                user_model.config.num_attention_heads,
+                1,
+                int(
+                    user_model.config.hidden_size
+                    / user_model.config.num_attention_heads
+                ),
+            ]
+        ).contiguous(),
+        torch.zeros(
+            [
+                1,
+                user_model.config.num_attention_heads,
+                1,
+                int(
+                    user_model.config.hidden_size
+                    / user_model.config.num_attention_heads
+                ),
+            ]
+        ).contiguous(),
+        beam_idx_tmp,
+    )
+    for i in range(user_model.config.num_hidden_layers)
+]
+
+
+class Evaluator:
+
+    def __init__(self, dataset, tokenizer, batch_size=1, pad_val=1, pad_max=512):
+        self.dataset = dataset
+        self.tokenizer = tokenizer
+        self.batch_size = batch_size
+        self.pad_val = pad_val
+        self.pad_max = pad_max
+
+        # tokenize the dataset
+        self.dataset = self.dataset.map(self.tokenize_function, batched=True)
+        self.dataset.set_format(type="torch", columns=["input_ids"])
+
+    @torch.no_grad()
+    def tokenize_function(self, examples):
+        if "prompt" in examples:
+            example = self.tokenizer(examples["prompt"])
+        elif "text" in examples:
+            example = self.tokenizer(examples["text"])
+        elif "code" in examples:
+            example = self.tokenizer(examples["code"])
+        return example
+
+    @torch.no_grad()
+    def collate_batch(self, batch):
+        position_ids_padded = []
+        input_ids_padded = []
+        last_ind = []
+        attention_mask_padded = []
+        for text in batch:
+            input_ids = text["input_ids"]
+            if not args.padding:
+                input_ids = (
+                    input_ids[: int(self.pad_max)]
+                    if len(input_ids) > int(self.pad_max)
+                    else input_ids
+                )  # no padding, only truncate to pad_max
+            else:
+                pad_len = self.pad_max - input_ids.shape[0]
+                input_ids = pad(input_ids, (0, pad_len), value=self.pad_val)
+            last_ind.append(input_ids.shape[0] - 1)
+            attention_mask = torch.ones(len(input_ids))
+            position_ids = torch.arange(len(input_ids))
+            input_ids_padded.append(input_ids)
+            attention_mask_padded.append(attention_mask)
+            position_ids_padded.append(position_ids)
+        return (
+            (
+                torch.vstack(input_ids_padded),
+                torch.vstack(attention_mask_padded),
+                torch.vstack(position_ids_padded),
+                tuple(global_past_key_value),
+            ),
+            torch.tensor(last_ind),
+        )
+
+
+calib_dataset = load_dataset(args.dataset, split="train")
+user_model.eval()
+if args.sq_recipes == "llama2-7b":
+    pad_max = 2048
+elif args.sq_recipes == "llama2-13b":
+    pad_max = 1024
+else:
+    pad_max = 512
+calib_evaluator = Evaluator(calib_dataset, tokenizer, args.batch_size, pad_max=pad_max)
+calib_dataloader = DataLoader(
+    calib_evaluator.dataset,
+    batch_size=1,
+    shuffle=False,
+    collate_fn=calib_evaluator.collate_batch,
+)
+
+
+def calib_func(prepared_model):
+    for i, (
+        (input_ids, attention_mask, position_ids, past_key_values),
+        last_ind,
+    ) in enumerate(calib_dataloader):
+        if i == 512:  # calibrate on the first 512 samples
+            break
+        prepared_model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+        )
+
+
+example_inputs = None
+for i, (
+    (input_ids, attention_mask, position_ids, past_key_values),
+    last_ind,
+) in enumerate(calib_dataloader):
+    example_inputs = (input_ids, attention_mask, position_ids, past_key_values)
+    break
+
+qconfig = ipex.quantization.get_smooth_quant_qconfig_mapping(alpha=args.alpha)
+user_model = ipex.optimize_transformers(
+    user_model.eval(),
+    dtype=amp_dtype,
+    quantization_config=qconfig,
+    inplace=True,
+    deployment_mode=False,
+)
+
+# steps for SmoothQuant with Intel® Neural Compressor
+from neural_compressor import PostTrainingQuantConfig, quantization
+
+# quantization recipes
+excluded_precisions = [] if args.int8_bf16_mixed else ["bf16"]
+op_type_dict = {"add": {"weight": {"dtype": ["fp32"]}, "activation": {"dtype": ["fp32"]}}}
+recipes = {}
+if args.sq_recipes == "llama2-7b":
+    recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8,
+                                                           'auto_alpha_args': {"alpha_min": 0.8, "alpha_max": 0.99,
+                                                                               "alpha_step": 0.01,
+                                                                               "shared_criterion": "mean"}}}
+elif args.sq_recipes == "llama2-13b":
+    recipes = {"smooth_quant": True, "smooth_quant_args": {'alpha': 'auto', 'folding': False, 'default_alpha': 0.8,
+                                                           'auto_alpha_args': {"alpha_min": 0.75, "alpha_max": 0.99,
+                                                                               "alpha_step": 0.01,
+                                                                               "shared_criterion": "max"}}}
+
+
+conf = PostTrainingQuantConfig(
+    backend="ipex",
+    excluded_precisions=excluded_precisions,
+    op_type_dict=op_type_dict,
+    recipes=recipes,
+    example_inputs=example_inputs,
+)
+q_model = quantization.fit(
+    user_model,
+    conf,
+    calib_dataloader=calib_dataloader,
+    calib_func=calib_func,
+)
+q_model.save(args.output_dir)
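
The README notes state that the saved artifacts can be loaded and evaluated by IPEX. Below is a minimal sketch of that loading step, assuming the default `--output-dir` of `./saved_results` and that `best_model.pt` is the TorchScript module written by `q_model.save`:

```python
import torch
# Importing IPEX registers the custom ops referenced by the quantized TorchScript graph.
import intel_extension_for_pytorch as ipex  # noqa: F401

# Path assumes the default --output-dir used by run_llama2_sq.py.
int8_model = torch.jit.load("./saved_results/best_model.pt")
int8_model = torch.jit.freeze(int8_model.eval())

# The frozen graph expects the same inputs that were used for calibration, i.e.
# (input_ids, attention_mask, position_ids, past_key_values) as built by
# Evaluator.collate_batch in run_llama2_sq.py.
```

Evaluation then amounts to feeding batches in that same format to the loaded module.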