From e275f2707cabaa45e034d5148a3dc7dbd5f512c1 Mon Sep 17 00:00:00 2001
From: plusbang
Date: Thu, 28 Nov 2024 17:17:18 +0800
Subject: [PATCH] fix

---
 .../example/NPU/HF-Transformers-AutoModels/LLM/qwen.py  | 9 ++++++++-
 python/llm/src/ipex_llm/transformers/npu_model.py       | 3 ++-
 .../llm/src/ipex_llm/transformers/npu_models/convert.py | 8 +++++---
 .../src/ipex_llm/transformers/npu_models/npu_llm_cpp.py | 2 +-
 .../transformers/npu_pipeline_model/convert_pipeline.py | 3 ++-
 5 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
index 2256be57e5a..57a2aa2b03f 100644
--- a/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
+++ b/python/llm/example/NPU/HF-Transformers-AutoModels/LLM/qwen.py
@@ -54,7 +54,13 @@
     parser.add_argument("--disable-transpose-value-cache", action="store_true", default=False)
     parser.add_argument("--intra-pp", type=int, default=None)
     parser.add_argument("--inter-pp", type=int, default=None)
-    parser.add_argument("--mixed-precision", action='store_true')
+    parser.add_argument("--mixed-precision", action='store_true', default=True)
+    parser.add_argument("--save-directory", type=str,
+                        required=True,
+                        help="The path of folder to save converted model, "
+                             "If path not exists, lowbit model will be saved there. "
+                             "Else, program will raise error.",
+                        )
 
     args = parser.parse_args()
     model_path = args.repo_id_or_model_path
@@ -74,6 +80,7 @@
             transpose_value_cache=not args.disable_transpose_value_cache,
             mixed_precision=args.mixed_precision,
             quantization_group_size=args.quantization_group_size,
+            save_directory=args.save_directory
         )
     else:
         model = AutoModelForCausalLM.load_low_bit(
diff --git a/python/llm/src/ipex_llm/transformers/npu_model.py b/python/llm/src/ipex_llm/transformers/npu_model.py
index 2288b9b96d2..5abfdd89149 100644
--- a/python/llm/src/ipex_llm/transformers/npu_model.py
+++ b/python/llm/src/ipex_llm/transformers/npu_model.py
@@ -263,7 +263,7 @@ def optimize_npu_model(cls, *args, **kwargs):
         model.share_memory()
 
         if not pipeline:
-            if model.config.model_type in ["qwen2", "llama", "minicpm"]:
+            if model.config.model_type in ["qwen2"]:
                 from ipex_llm.transformers.npu_models.convert import optimize_llm_single_process
                 optimize_llm_single_process(
                     llm,
@@ -271,6 +271,7 @@
                     max_prompt_len=max_prompt_len,
                     transpose_value_cache=transpose_value_cache,
                     group_size=quantization_group_size,
+                    qtype=qtype,
                     save_directory=save_directory
                 )
             else:
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/convert.py b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
index 06e2c18f41b..4e7ac8d53e5 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/convert.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/convert.py
@@ -322,7 +322,7 @@ def generate(
     output = torch.stack(output_tokens, dim=1)
     output = torch.cat((inputs, output), dim=1)
     time_t3 = time.perf_counter()
-
+    reset(self.model_ptr)
     self.first_cost = time_t2 - time_t1  # seconds
     self.rest_cost_mean = (time_t3 - time_t2) / (idx - 1)  # seconds
 
@@ -345,9 +345,10 @@ def optimize_llm_single_process(
     max_prompt_len: int,
     transpose_value_cache: bool,
     group_size: int,
+    qtype: str,
     save_directory: str
 ):
-    from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm_for_deploy, convert_llm
+    from ipex_llm.transformers.npu_pipeline_model.convert_pipeline import convert_llm
     from .npu_llm_cpp import load_model_from_file
 
     convert_llm(model,
@@ -355,6 +356,7 @@
                 max_prompt_len=max_prompt_len,
                 transpose_value_cache=transpose_value_cache,
                 group_size=group_size,
+                qtype=qtype,
                 convert_model=True,
                 save_directory=save_directory)
     try:
@@ -364,7 +366,7 @@
         model.vocab_size = model.config.vocab_size
     except:
         invalidInputError(False,
-                          "False to InitLLMPipeline.")
+                          "Failed to InitLLMPipeline.")
     # patch generate function
     import types
     model.generate = types.MethodType(generate, model)
diff --git a/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py b/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py
index c6cbdbaf5be..9507a7538f4 100644
--- a/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py
+++ b/python/llm/src/ipex_llm/transformers/npu_models/npu_llm_cpp.py
@@ -41,7 +41,6 @@ def get_shared_lib_info(lib_base_name: str):
 
 
 _, _lib_path = get_shared_lib_info("npu_llm")
-print(f'_lib_path is {_lib_path}......')
 
 # Load the library
 _lib = ctypes.cdll.LoadLibrary(_lib_path)
@@ -61,6 +60,7 @@ def get_shared_lib_info(lib_base_name: str):
 _lib.reset.argtypes = [ctypes.c_void_p]
 _lib.reset.restype = None
 
+
 def load_model_from_file(model_dir: str):
     return _lib.load_model_from_file(model_dir.encode('utf-8'))
 
diff --git a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
index 3e36726cda0..64e697ba800 100644
--- a/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
+++ b/python/llm/src/ipex_llm/transformers/npu_pipeline_model/convert_pipeline.py
@@ -426,7 +426,8 @@ def convert_llm_for_deploy(model: torch.nn.Module,
     if not os.path.exists(save_directory):
         os.mkdir(save_directory)
     weight_dir = os.path.join(save_directory, "model_weights")
-    os.mkdir(weight_dir)
+    if not os.path.exists(weight_dir):
+        os.mkdir(weight_dir)
     layernorm_const = os.environ.get("IPEX_LLM_NPU_LAYERNORM_CONST", "1") == "1"
 
     if model.config.model_type == "qwen2":
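
Reviewer note (not part of the patch): after this change the qwen.py example always requires --save-directory and forwards it into the NPU conversion path, and optimize_llm_single_process() additionally receives the qtype string so the converted artifacts match the requested quantization. Below is a minimal sketch of the example's updated model-loading call, presumably AutoModelForCausalLM.from_pretrained(); the import path is the one assumed to be used by the ipex-llm NPU examples, the model id, group size, and save path are illustrative placeholders, and only keyword arguments visible in this patch are shown.

    # Sketch only: mirrors the example's updated model load; values are placeholders,
    # and the import path is an assumption about the ipex-llm NPU example setup.
    from ipex_llm.transformers.npu_model import AutoModelForCausalLM

    model = AutoModelForCausalLM.from_pretrained(
        "Qwen/Qwen2-7B-Instruct",                 # placeholder model id
        mixed_precision=True,
        quantization_group_size=0,                # placeholder group size
        transpose_value_cache=True,
        save_directory="./qwen2-npu-converted",   # newly required save path
    )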