From 55c50a2ec877ca8f2af879d0960df632996ed805 Mon Sep 17 00:00:00 2001 From: Jean Yu Date: Tue, 8 Oct 2024 18:59:53 -0500 Subject: [PATCH] Add llm integration with Intel Gaudi in llama-index-llms-gaudi (#16308) --- .../llms/llama-index-llms-gaudi/.gitignore | 153 +++++ .../llms/llama-index-llms-gaudi/BUILD | 3 + .../llms/llama-index-llms-gaudi/Makefile | 17 + .../llms/llama-index-llms-gaudi/README.md | 55 ++ .../llama-index-llms-gaudi/examples/BUILD | 1 + .../llama-index-llms-gaudi/examples/README.md | 29 + .../llama-index-llms-gaudi/examples/basic.py | 373 +++++++++++ .../llama_index/llms/gaudi/BUILD | 1 + .../llama_index/llms/gaudi/__init__.py | 4 + .../llama_index/llms/gaudi/base.py | 451 ++++++++++++++ .../llama_index/llms/gaudi/utils.py | 577 ++++++++++++++++++ .../llama-index-llms-gaudi/pyproject.toml | 71 +++ 12 files changed, 1735 insertions(+) create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/Makefile create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py create mode 100644 llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore b/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore new file mode 100644 index 0000000000000..990c18de22908 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/.gitignore @@ -0,0 +1,153 @@ +llama_index/_static +.DS_Store +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +bin/ +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +etc/ +include/ +lib/ +lib64/ +parts/ +sdist/ +share/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. 
+*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +.ruff_cache + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints +notebooks/ + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ +pyvenv.cfg + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ + +# Jetbrains +.idea +modules/ +*.swp + +# VsCode +.vscode + +# pipenv +Pipfile +Pipfile.lock + +# pyright +pyrightconfig.json diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD new file mode 100644 index 0000000000000..0896ca890d8bf --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/BUILD @@ -0,0 +1,3 @@ +poetry_requirements( + name="poetry", +) diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile b/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile new file mode 100644 index 0000000000000..b9eab05aa3706 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/Makefile @@ -0,0 +1,17 @@ +GIT_ROOT ?= $(shell git rev-parse --show-toplevel) + +help: ## Show all Makefile targets. + @grep -E '^[a-zA-Z_-]+:.*?## .*$$' $(MAKEFILE_LIST) | awk 'BEGIN {FS = ":.*?## "}; {printf "\033[33m%-30s\033[0m %s\n", $$1, $$2}' + +format: ## Run code autoformatters (black). + pre-commit install + git ls-files | xargs pre-commit run black --files + +lint: ## Run linters: pre-commit (black, ruff, codespell) and mypy + pre-commit install && git ls-files | xargs pre-commit run --show-diff-on-failure --files + +test: ## Run tests via pytest. + pytest tests + +watch-docs: ## Build and watch documentation. 
+	sphinx-autobuild docs/ docs/_build/html --open-browser --watch $(GIT_ROOT)/llama_index/
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md
new file mode 100644
index 0000000000000..07ff53ba5d7e6
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/README.md
@@ -0,0 +1,61 @@
+# LlamaIndex Llms Integration with Intel Gaudi
+
+## Installation
+
+```bash
+pip install --upgrade-strategy eager optimum[habana]
+pip install llama-index-llms-gaudi
+pip install llama-index-llms-huggingface
+```
+
+## Usage
+
+```python
+import argparse
+import os, logging
+from llama_index.core.prompts import PromptTemplate
+from llama_index.llms.gaudi import GaudiLLM
+
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+
+
+def setup_parser(parser):
+    parser.add_argument(...)
+    args = parser.parse_args()
+    return args
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(
+        description="GaudiLLM Basic Usage Example"
+    )
+    args = setup_parser(parser)
+    args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha"
+
+    # messages_to_prompt formats chat messages for zephyr-style models;
+    # see examples/basic.py for the full helper.
+    llm = GaudiLLM(
+        args=args,
+        logger=logger,
+        model_name="HuggingFaceH4/zephyr-7b-alpha",
+        tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
+        query_wrapper_prompt=PromptTemplate(
+            "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"
+        ),
+        context_window=3900,
+        max_new_tokens=256,
+        generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+        messages_to_prompt=messages_to_prompt,
+        device_map="auto",
+    )
+
+    query = "Is the ocean blue?"
+    print("\n----------------- Complete ------------------")
+    completion_response = llm.complete(query)
+    print(completion_response.text)
+```
+
+## Examples
+
+- [More Examples](https://github.com/run-llama/llama_index/tree/main/llama-index-integrations/llms/llama-index-llms-gaudi/examples)
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD
new file mode 100644
index 0000000000000..db46e8d6c978c
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/BUILD
@@ -0,0 +1 @@
+python_sources()
diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md
new file mode 100644
index 0000000000000..a9bdec0912010
--- /dev/null
+++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/README.md
@@ -0,0 +1,30 @@
+# GaudiLLM Examples
+
+This folder contains examples showcasing how to use LlamaIndex with the Intel Gaudi LLM integration `llama_index.llms.gaudi.GaudiLLM`.
+
+## Installation
+
+### On Intel Gaudi
+
+Install `llama-index-llms-gaudi` together with `optimum[habana]`, which provides the Intel Gaudi dependencies.
+
+```bash
+pip install --upgrade-strategy eager optimum[habana]
+pip install llama-index-llms-gaudi
+```
+
+## List of Examples
+
+### Basic Example
+
+The example [basic.py](./basic.py) shows how to run `GaudiLLM` on Intel Gaudi and perform tasks such as text completion. Run the example as follows:
+
+```bash
+python basic.py
+```
+
+> Please note that this example uses the [HuggingFaceH4/zephyr-7b-alpha](https://huggingface.co/HuggingFaceH4/zephyr-7b-alpha) model for demonstration. It requires the `transformers` and `tokenizers` packages.
+> +> ```bash +> pip install -U transformers tokenizers +> ``` diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py new file mode 100644 index 0000000000000..c2ec27582cf2f --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/examples/basic.py @@ -0,0 +1,373 @@ +import os, logging +import argparse +from llama_index.llms.gaudi import GaudiLLM +from llama_index.core.prompts import PromptTemplate + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +def setup_parser(parser): + # Arguments management + parser.add_argument( + "--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu" + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + # required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=100, help="Number of tokens to generate." + ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument( + "--warmup", + type=int, + default=3, + help="Number of warmup iterations for benchmarking.", + ) + parser.add_argument( + "--n_iterations", + type=int, + default=5, + help="Number of inference iterations for benchmarking.", + ) + parser.add_argument( + "--local_rank", type=int, default=0, metavar="N", help="Local process rank." + ) + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument( + "--verbose_workers", + action="store_true", + help="Enable output from non-master workers", + ) + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. 
If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. Useful for testing large sequence lengths.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument( + "--temperature", + default=1.0, + type=float, + help="Temperature value for text generation", + ) + parser.add_argument( + "--top_p", + default=1.0, + type=float, + help="Top_p value for generating text via sampling", + ) + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
+ ) + return args + + +# Transform a string into input zephyr-specific input +def completion_to_prompt(completion): + return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n" + + +# Transform a list of chat messages into zephyr-specific input +def messages_to_prompt(messages): + prompt = "" + for message in messages: + if message.role == "system": + prompt += f"<|system|>\n{message.content}\n" + elif message.role == "user": + prompt += f"<|user|>\n{message.content}\n" + elif message.role == "assistant": + prompt += f"<|assistant|>\n{message.content}\n" + + # ensure we start with a system prompt, insert blank if needed + if not prompt.startswith("<|system|>\n"): + prompt = "<|system|>\n\n" + prompt + + # add final assistant prompt + prompt = prompt + "<|assistant|>\n" + + return prompt + + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example") + args = setup_parser(parser) + args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha" + + llm = GaudiLLM( + args=args, + logger=logger, + model_name="HuggingFaceH4/zephyr-7b-alpha", + tokenizer_name="HuggingFaceH4/zephyr-7b-alpha", + query_wrapper_prompt=PromptTemplate( + "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n" + ), + context_window=3900, + max_new_tokens=256, + generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95}, + messages_to_prompt=messages_to_prompt, + device_map="auto", + ) + + query = "Is the ocean blue?" + print("\n----------------- Complete ------------------") + completion_response = llm.complete(query) + print(completion_response.text) + print("\n----------------- Stream Complete ------------------") + response_iter = llm.stream_complete(query) + for response in response_iter: + print(response.delta, end="", flush=True) + print("\n----------------- Chat ------------------") + from llama_index.core.llms import ChatMessage + + message = ChatMessage(role="user", content=query) + resp = llm.chat([message]) + print(resp) + print("\n----------------- Stream Chat ------------------") + message = ChatMessage(role="user", content=query) + resp = llm.stream_chat([message], max_tokens=256) + for r in resp: + print(r.delta, end="") diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD new file mode 100644 index 0000000000000..db46e8d6c978c --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/BUILD @@ -0,0 +1 @@ +python_sources() diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py new file mode 100644 index 0000000000000..5ef1883df2fb4 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/__init__.py @@ -0,0 +1,4 @@ +from llama_index.llms.gaudi.base import GaudiLLM + + +__all__ = ["GaudiLLM"] diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py new file mode 100644 index 0000000000000..25732482fddb5 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/base.py @@ -0,0 +1,451 @@ +import logging +from typing import Any, Callable, List, Optional, Sequence, Union + +from llama_index.core.base.llms.types import ChatMessage +from 
llama_index.core.bridge.pydantic import Field +from llama_index.llms.huggingface.base import HuggingFaceLLM +from llama_index.core.callbacks import CallbackManager +from llama_index.core.constants import ( + DEFAULT_CONTEXT_WINDOW, + DEFAULT_NUM_OUTPUTS, +) +from llama_index.core.types import BaseOutputParser, PydanticProgramMode +from llama_index.core.prompts.base import PromptTemplate + +from llama_index.llms.gaudi.utils import initialize_model + +DEFAULT_HUGGINGFACE_MODEL = "Intel/neural-chat-7b-v3-1" + +logging.basicConfig( + format="%(asctime)s - %(levelname)s - %(name)s - %(message)s", + datefmt="%m/%d/%Y %H:%M:%S", + level=logging.INFO, +) +logger = logging.getLogger(__name__) + + +class GaudiLLM(HuggingFaceLLM): + r"""GaudiLLM LLM. + + Examples: + `pip install llama-index-llms-gaudi` + + ```python + from llama_index.llms.gaudi import GaudiLLM + import argparse + import os, logging + + def setup_parser(parser): + # Arguments management + parser.add_argument( + "--device", "-d", type=str, choices=["hpu"], help="Device to run", default="hpu" + ) + parser.add_argument( + "--model_name_or_path", + default=None, + type=str, + # required=True, + help="Path to pre-trained model (on the HF Hub or locally).", + ) + parser.add_argument( + "--bf16", + default=True, + action="store_true", + help="Whether to perform generation in bf16 precision.", + ) + parser.add_argument( + "--max_new_tokens", type=int, default=100, help="Number of tokens to generate." + ) + parser.add_argument( + "--max_input_tokens", + type=int, + default=0, + help="If > 0 then pad and truncate the input sequences to this specified length of tokens. \ + if == 0, then truncate to 16 (original default) \ + if < 0, then do not truncate, use full input prompt", + ) + parser.add_argument("--batch_size", type=int, default=1, help="Input batch size.") + parser.add_argument( + "--warmup", + type=int, + default=3, + help="Number of warmup iterations for benchmarking.", + ) + parser.add_argument( + "--n_iterations", + type=int, + default=5, + help="Number of inference iterations for benchmarking.", + ) + parser.add_argument( + "--local_rank", type=int, default=0, metavar="N", help="Local process rank." + ) + parser.add_argument( + "--use_kv_cache", + default=True, + action="store_true", + help="Whether to use the key/value cache for decoding. It should speed up generation.", + ) + parser.add_argument( + "--use_hpu_graphs", + default=True, + action="store_true", + help="Whether to use HPU graphs or not. Using HPU graphs should give better latencies.", + ) + parser.add_argument( + "--dataset_name", + default=None, + type=str, + help="Optional argument if you want to assess your model on a given dataset of the HF Hub.", + ) + parser.add_argument( + "--column_name", + default=None, + type=str, + help="If `--dataset_name` was given, this will be the name of the column to use as prompts for generation.", + ) + parser.add_argument( + "--do_sample", + action="store_true", + help="Whether to use sampling for generation.", + ) + parser.add_argument( + "--num_beams", + default=1, + type=int, + help="Number of beams used for beam search generation. 1 means greedy search will be performed.", + ) + parser.add_argument( + "--trim_logits", + action="store_true", + help="Calculate logits only for the last token to save memory in the first step.", + ) + parser.add_argument( + "--seed", + default=27, + type=int, + help="Seed to use for random generation. 
Useful to reproduce your runs with `--do_sample`.", + ) + parser.add_argument( + "--profiling_warmup_steps", + default=0, + type=int, + help="Number of steps to ignore for profiling.", + ) + parser.add_argument( + "--profiling_steps", + default=0, + type=int, + help="Number of steps to capture for profiling.", + ) + parser.add_argument( + "--profiling_record_shapes", + default=False, + type=bool, + help="Record shapes when enabling profiling.", + ) + parser.add_argument( + "--prompt", + default=None, + type=str, + nargs="*", + help='Optional argument to give a prompt of your choice as input. Can be a single string (eg: --prompt "Hello world"), or a list of space-separated strings (eg: --prompt "Hello world" "How are you?")', + ) + parser.add_argument( + "--bad_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that are not allowed to be generated.", + ) + parser.add_argument( + "--force_words", + default=None, + type=str, + nargs="+", + help="Optional argument list of words that must be generated.", + ) + parser.add_argument( + "--assistant_model", + default=None, + type=str, + help="Optional argument to give a path to a draft/assistant model for assisted decoding.", + ) + parser.add_argument( + "--peft_model", + default=None, + type=str, + help="Optional argument to give a path to a PEFT model.", + ) + parser.add_argument("--num_return_sequences", type=int, default=1) + parser.add_argument( + "--token", + default=None, + type=str, + help="The token to use as HTTP bearer authorization for remote files. If not specified, will use the token " + "generated when running `huggingface-cli login` (stored in `~/.huggingface`).", + ) + parser.add_argument( + "--model_revision", + default="main", + type=str, + help="The specific model version to use (can be a branch name, tag name or commit id).", + ) + parser.add_argument( + "--attn_softmax_bf16", + action="store_true", + help="Whether to run attention softmax layer in lower precision provided that the model supports it and " + "is also running in lower precision.", + ) + parser.add_argument( + "--output_dir", + default=None, + type=str, + help="Output directory to store results in.", + ) + parser.add_argument( + "--bucket_size", + default=-1, + type=int, + help="Bucket size to maintain static shapes. If this number is negative (default is -1) \ + then we use `shape = prompt_length + max_new_tokens`. If a positive number is passed \ + we increase the bucket in steps of `bucket_size` instead of allocating to max (`prompt_length + max_new_tokens`).", + ) + parser.add_argument( + "--bucket_internal", + action="store_true", + help="Split kv sequence into buckets in decode phase. It improves throughput when max_new_tokens is large.", + ) + parser.add_argument( + "--dataset_max_samples", + default=-1, + type=int, + help="If a negative number is passed (default = -1) perform inference on the whole dataset, else use only `dataset_max_samples` samples.", + ) + parser.add_argument( + "--limit_hpu_graphs", + action="store_true", + help="Skip HPU Graph usage for first token to save memory", + ) + parser.add_argument( + "--reuse_cache", + action="store_true", + help="Whether to reuse key/value cache for decoding. It should save memory.", + ) + parser.add_argument( + "--verbose_workers", + action="store_true", + help="Enable output from non-master workers", + ) + parser.add_argument( + "--simulate_dyn_prompt", + default=None, + type=int, + nargs="*", + help="If empty, static prompt is used. 
If a comma separated list of integers is passed, we warmup and use those shapes for prompt length.", + ) + parser.add_argument( + "--reduce_recompile", + action="store_true", + help="Preprocess on cpu, and some other optimizations. Useful to prevent recompilations when using dynamic prompts (simulate_dyn_prompt)", + ) + parser.add_argument( + "--use_flash_attention", + action="store_true", + help="Whether to enable Habana Flash Attention, provided that the model supports it.", + ) + parser.add_argument( + "--flash_attention_recompute", + action="store_true", + help="Whether to enable Habana Flash Attention in recompute mode on first token generation. This gives an opportunity of splitting graph internally which helps reduce memory consumption.", + ) + parser.add_argument( + "--flash_attention_causal_mask", + action="store_true", + help="Whether to enable Habana Flash Attention in causal mode on first token generation.", + ) + parser.add_argument( + "--flash_attention_fast_softmax", + action="store_true", + help="Whether to enable Habana Flash Attention in fast softmax mode.", + ) + parser.add_argument( + "--book_source", + action="store_true", + help="Whether to use project Guttenberg books data as input. Useful for testing large sequence lengths.", + ) + parser.add_argument( + "--torch_compile", + action="store_true", + help="Whether to use torch compiled model or not.", + ) + parser.add_argument( + "--ignore_eos", + default=True, + action=argparse.BooleanOptionalAction, + help="Whether to ignore eos, set False to disable it", + ) + parser.add_argument( + "--temperature", + default=1.0, + type=float, + help="Temperature value for text generation", + ) + parser.add_argument( + "--top_p", + default=1.0, + type=float, + help="Top_p value for generating text via sampling", + ) + parser.add_argument( + "--const_serialization_path", + "--csp", + type=str, + help="Path to serialize const params. Const params will be held on disk memory instead of being allocated on host memory.", + ) + parser.add_argument( + "--disk_offload", + action="store_true", + help="Whether to enable device map auto. In case no space left on cpu, weights will be offloaded to disk.", + ) + parser.add_argument( + "--trust_remote_code", + action="store_true", + help="Whether or not to allow for custom models defined on the Hub in their own modeling files.", + ) + args = parser.parse_args() + + if args.torch_compile: + args.use_hpu_graphs = False + + if not args.use_hpu_graphs: + args.limit_hpu_graphs = False + + args.quant_config = os.getenv("QUANT_CONFIG", "") + if args.quant_config == "" and args.disk_offload: + logger.warning( + "`--disk_offload` was tested only with fp8, it may not work with full precision. If error raises try to remove the --disk_offload flag." 
+                )
+            return args
+
+        def messages_to_prompt(messages):
+            prompt = ""
+            for message in messages:
+                if message.role == 'system':
+                    prompt += f"<|system|>\n{message.content}\n"
+                elif message.role == 'user':
+                    prompt += f"<|user|>\n{message.content}\n"
+                elif message.role == 'assistant':
+                    prompt += f"<|assistant|>\n{message.content}\n"
+
+            # ensure we start with a system prompt, insert blank if needed
+            if not prompt.startswith("<|system|>\n"):
+                prompt = "<|system|>\n\n" + prompt
+
+            # add final assistant prompt
+            prompt = prompt + "<|assistant|>\n"
+
+            return prompt
+
+        def completion_to_prompt(completion):
+            return f"<|system|>\n\n<|user|>\n{completion}\n<|assistant|>\n"
+
+        import torch
+        from llama_index.core.prompts import PromptTemplate
+        from llama_index.llms.gaudi import GaudiLLM
+
+        parser = argparse.ArgumentParser(description="GaudiLLM Basic Usage Example")
+        args = setup_parser(parser)
+        args.model_name_or_path = "HuggingFaceH4/zephyr-7b-alpha"
+
+        llm = GaudiLLM(
+            args=args,
+            logger=logger,
+            model_name="HuggingFaceH4/zephyr-7b-alpha",
+            tokenizer_name="HuggingFaceH4/zephyr-7b-alpha",
+            query_wrapper_prompt=PromptTemplate(
+                "<|system|>\n\n<|user|>\n{query_str}\n<|assistant|>\n"
+            ),
+            context_window=3900,
+            max_new_tokens=256,
+            generate_kwargs={"temperature": 0.7, "top_k": 50, "top_p": 0.95},
+            messages_to_prompt=messages_to_prompt,
+            device_map="auto",
+        )
+
+        response = llm.complete("What is the meaning of life?")
+        print(str(response))
+        ```
+    """
+
+    model_name: str = Field(
+        default=DEFAULT_HUGGINGFACE_MODEL,
+        description=(
+            "The model name to use from HuggingFace. "
+            "Unused if `model` is passed in directly."
+        ),
+    )
+    tokenizer_name: str = Field(
+        default=DEFAULT_HUGGINGFACE_MODEL,
+        description=(
+            "The name of the tokenizer to use from HuggingFace. "
+            "Unused if `tokenizer` is passed in directly."
+ ), + ) + + def __init__( + self, + args, + logger, + context_window: int = DEFAULT_CONTEXT_WINDOW, + max_new_tokens: int = DEFAULT_NUM_OUTPUTS, + query_wrapper_prompt: Union[str, PromptTemplate] = "{query_str}", + tokenizer_name: str = DEFAULT_HUGGINGFACE_MODEL, + model_name: str = DEFAULT_HUGGINGFACE_MODEL, + model: Optional[Any] = None, + tokenizer: Optional[Any] = None, + device_map: Optional[str] = "auto", + stopping_ids: Optional[List[int]] = None, + tokenizer_kwargs: Optional[dict] = None, + tokenizer_outputs_to_remove: Optional[list] = None, + model_kwargs: Optional[dict] = None, + generate_kwargs: Optional[dict] = None, + is_chat_model: Optional[bool] = False, + callback_manager: Optional[CallbackManager] = None, + system_prompt: str = "", + messages_to_prompt: Optional[Callable[[Sequence[ChatMessage]], str]] = None, + completion_to_prompt: Optional[Callable[[str], str]] = None, + pydantic_program_mode: PydanticProgramMode = PydanticProgramMode.DEFAULT, + output_parser: Optional[BaseOutputParser] = None, + ) -> None: + """Initialize params.""" + model_kwargs = model_kwargs or {} + + model, _, tokenizer, _ = initialize_model(args, logger) + + super().__init__( + context_window=context_window, + max_new_tokens=max_new_tokens, + query_wrapper_prompt=query_wrapper_prompt, + tokenizer_name=tokenizer_name, + model_name=model_name, + model=model, + tokenizer=tokenizer, + device_map=device_map, + stopping_ids=stopping_ids or [], + tokenizer_kwargs=tokenizer_kwargs or {}, + tokenizer_outputs_to_remove=tokenizer_outputs_to_remove or [], + model_kwargs=model_kwargs or {}, + generate_kwargs=generate_kwargs or {}, + is_chat_model=is_chat_model, + callback_manager=callback_manager, + system_prompt=system_prompt, + messages_to_prompt=messages_to_prompt, + completion_to_prompt=completion_to_prompt, + pydantic_program_mode=pydantic_program_mode, + output_parser=output_parser, + ) + + @classmethod + def class_name(cls) -> str: + return "GaudiLLM" diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py new file mode 100644 index 0000000000000..060d7a649b5cb --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/llama_index/llms/gaudi/utils.py @@ -0,0 +1,577 @@ +# Copyright 2022 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +############################################################################### +# Copyright (C) 2020-2021 Habana Labs, Ltd. 
an Intel Company +############################################################################### + +import copy +import glob +import os +import shutil +import tempfile +import time +from pathlib import Path + +import torch +from transformers import AutoConfig, AutoModelForCausalLM, AutoTokenizer +from transformers.utils import check_min_version + +from optimum.habana.checkpoint_utils import ( + get_ds_injection_policy, + get_repo_root, + model_is_optimized, + model_on_meta, + write_checkpoints_json, +) +from optimum.habana.utils import ( + check_habana_frameworks_version, + check_optimum_habana_min_version, + get_habana_frameworks_version, + set_seed, +) + + +def adjust_batch(batch, size): + curr_size = batch["input_ids"].shape[1] + if curr_size >= size: + adjusted_batch = { + "input_ids": batch["input_ids"][:, :size], + "attention_mask": batch["attention_mask"][:, :size], + } + else: + adjusted_batch = {} + for k in batch: + last_colm = batch[k][:, -1] + expanded = last_colm.tile((size - curr_size, 1)).T + adjusted_batch[k] = torch.concat([batch[k], expanded], 1) + assert adjusted_batch["input_ids"].shape[1] == size + assert adjusted_batch["attention_mask"].shape[1] == size + return adjusted_batch + + +def override_print(enable): + import builtins as __builtin__ + + builtin_print = __builtin__.print + + def print(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + builtin_print(*args, **kwargs) + + __builtin__.print = print + + +def override_logger(logger, enable): + logger_info = logger.info + + def info(*args, **kwargs): + force = kwargs.pop("force", False) + if force or enable: + logger_info(*args, **kwargs) + + logger.info = info + + +def count_hpu_graphs(): + return len(glob.glob(".graph_dumps/*PreGraph*")) + + +def override_prints(enable, logger): + override_print(enable) + override_logger(logger, enable) + + +def setup_distributed(args): + args.local_rank = int(os.getenv("LOCAL_RANK", "0")) + args.world_size = int(os.getenv("WORLD_SIZE", "0")) + args.global_rank = int(os.getenv("RANK", "0")) + + +def setup_inference(args, model): + import habana_frameworks.torch.core as htcore + + habana_version = get_habana_frameworks_version() + + print("Initializing inference mode") + # Keeping the if-else here for back compat. TODO remove later + if habana_version.major >= 1 and habana_version.minor >= 16: + htcore.hpu_initialize(model, mark_only_scales_as_const=True) + else: + const_marking = os.getenv("ENABLE_CONST_MARKING", "True") + if const_marking == "True": + htcore.hpu_initialize(model) + return model + + +def setup_const_serialization(const_serialization_path): + import uuid + + const_serialization_path = os.path.join(const_serialization_path + uuid.uuid4().hex) + os.makedirs(const_serialization_path) + from habana_frameworks.torch.hpu import enable_const_section_serialization + + print(f"Serializing const params to {const_serialization_path}") + enable_const_section_serialization(const_serialization_path, True) + + +def setup_env(args): + # Will error if the minimal version of Transformers is not installed. Remove at your own risks. 
+ check_min_version("4.34.0") + check_optimum_habana_min_version("1.9.0.dev0") + # TODO: SW-167588 - WA for memory issue in hqt prep_model + os.environ.setdefault("EXPERIMENTAL_WEIGHT_SHARING", "FALSE") + + if args.global_rank == 0 and not args.torch_compile: + os.environ.setdefault("GRAPH_VISUALIZATION", "true") + shutil.rmtree(".graph_dumps", ignore_errors=True) + + if args.world_size > 0: + os.environ.setdefault("PT_HPU_LAZY_ACC_PAR_MODE", "0") + os.environ.setdefault("PT_HPU_ENABLE_LAZY_COLLECTIVES", "true") + + if ( + args.use_hpu_graphs + and args.limit_hpu_graphs + and not args.reuse_cache + and args.bucket_internal + ): + # Based upon above conditions and below env variable, + # we can call HPU graphs clear_inputs(). + os.environ.setdefault("PT_HPUGRAPH_DISABLE_TENSOR_CACHE", "1") + + # Tweak generation so that it runs faster on Gaudi + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + adapt_transformers_to_gaudi() + + +def setup_device(args): + if args.device == "hpu": + import habana_frameworks.torch.core as htcore + + if args.quant_config: + htcore.hpu_set_env() + return torch.device(args.device) + + +# patching LinearAllreduce to use ScopedLinearAllReduce +def patch_scoped_linear_all_reduce(model): + from deepspeed.module_inject.layers import LinearAllreduce + + from optimum.habana.transformers.models.modeling_all_models import ( + ScopedLinearAllReduce, + ) + + for name, module in model.named_children(): + if type(module) is LinearAllreduce: + SL = ScopedLinearAllReduce(mod=module) + setattr(model, name, SL) + patch_scoped_linear_all_reduce(module) + + +def get_torch_compiled_model(model): + model.model = torch.compile( + model.model, backend="hpu_backend", options={"keep_input_mutations": True} + ) + return model + + +def setup_model(args, model_dtype, model_kwargs, logger): + logger.info("Single-device run.") + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + if args.disk_offload: + from accelerate import infer_auto_device_map, init_empty_weights + + config = AutoConfig.from_pretrained(args.model_name_or_path) + with init_empty_weights(): + model = AutoModelForCausalLM.from_config(config) + max_memory = {"cpu": "10GiB"} + device_map = infer_auto_device_map( + model, max_memory=max_memory, dtype=model_dtype + ) + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, + device_map=device_map, + offload_folder="/tmp/offload_folder/", + offload_state_dict=True, + torch_dtype=model_dtype, + **model_kwargs, + ) + else: + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ) + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.quantize_model(assistant_model) + + model = model.eval().to(args.device) + if args.assistant_model is not None: + assistant_model = assistant_model.eval().to(args.device) + + if args.use_hpu_graphs: + from habana_frameworks.torch.hpu import wrap_in_hpu_graph + + from optimum.habana.transformers.trainer import _is_peft_model + + if ( + 
check_habana_frameworks_version("1.13.0") + and model.config.model_type == "falcon" + ): + model = wrap_in_hpu_graph(model, hash_with_views=False) + else: + model = wrap_in_hpu_graph(model) + if args.assistant_model is not None: + assistant_model = wrap_in_hpu_graph(assistant_model) + if _is_peft_model(model): + model.base_model = wrap_in_hpu_graph(model.base_model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def setup_distributed_model(args, model_dtype, model_kwargs, logger): + import deepspeed + + logger.info("DeepSpeed is enabled.") + deepspeed.init_distributed(dist_backend="hccl") + config = AutoConfig.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + load_to_meta = model_on_meta(config) + + if args.assistant_model is None: + assistant_model = None + else: + logger.info(f"Using asssitant model {args.assistant_model}.") + + if load_to_meta: + # Construct model with fake meta tensors, later will be replaced on devices during ds-inference ckpt load + with deepspeed.OnDevice(dtype=model_dtype, device="meta"): + model = AutoModelForCausalLM.from_config(config, torch_dtype=model_dtype) + + # Model loaded to meta is managed differently + checkpoints_json = tempfile.NamedTemporaryFile(suffix=".json", mode="+w") + + # For PEFT models, write the merged model on disk to be able to load it on the meta device + if args.peft_model is not None: + merged_model_dir = "/tmp/text_generation_merged_peft_model" + if args.local_rank == 0: + if Path(merged_model_dir).is_dir(): + shutil.rmtree(merged_model_dir) + peft_model(args, model_dtype, logger, **model_kwargs).save_pretrained( + merged_model_dir + ) + torch.distributed.barrier() + + write_checkpoints_json( + merged_model_dir + if args.peft_model is not None + else args.model_name_or_path, + args.local_rank, + checkpoints_json, + token=args.token, + ) + else: + # TODO: revisit placement on CPU when auto-injection is possible + with deepspeed.OnDevice(dtype=model_dtype, device="cpu"): + if args.peft_model is not None: + model = peft_model(args, model_dtype, logger, **model_kwargs) + else: + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model.eval() + + if args.assistant_model is not None: + assistant_model = AutoModelForCausalLM.from_pretrained( + args.assistant_model, torch_dtype=model_dtype, **model_kwargs + ).eval() + + # Initialize the model + ds_inference_kwargs = {"dtype": model_dtype} + ds_inference_kwargs["tensor_parallel"] = {"tp_size": args.world_size} + ds_inference_kwargs["enable_cuda_graph"] = args.use_hpu_graphs + ds_inference_kwargs["injection_policy"] = get_ds_injection_policy(config) + if load_to_meta: + ds_inference_kwargs["checkpoint"] = checkpoints_json.name + + model = deepspeed.init_inference(model, **ds_inference_kwargs) + model = model.module + if model.config.model_type in ["llama", "falcon", "qwen2"]: + patch_scoped_linear_all_reduce(model) + + if args.quant_config: + import habana_quantization_toolkit + + habana_quantization_toolkit.prep_model(model) + if args.assistant_model is not None: + habana_quantization_toolkit.prep_model(assistant_model) + + if args.torch_compile and model.config.model_type == "llama": + model = get_torch_compiled_model(model) + # if args.assistant_model is not None: + # assistant_model = 
get_torch_compiled_model(assistant_model) + return model, assistant_model + + +def peft_model(args, model_dtype, logger, **model_kwargs): + import importlib.util + + if importlib.util.find_spec("peft") is None: + raise ImportError( + "The `peft` package is not installed, please run: `pip install peft`." + ) + from peft import AutoPeftModelForCausalLM + from peft.config import PeftConfigMixin + + base_model_name = PeftConfigMixin.from_pretrained( + args.peft_model, + token=model_kwargs["token"] if "token" in model_kwargs else None, + ).base_model_name_or_path + + base_model_is_local = Path(base_model_name).is_dir() + if not base_model_is_local: + # Check if the base model path to a remote repository on the HF Hub exists + from huggingface_hub import list_repo_files + + try: + list_repo_files(base_model_name) + base_model_is_remote = True + except Exception: + base_model_is_remote = False + + if base_model_is_local or base_model_is_remote: + model = AutoPeftModelForCausalLM.from_pretrained( + args.peft_model, torch_dtype=model_dtype, **model_kwargs + ) + else: + # Since the base model doesn't exist locally nor remotely, use `args.model_name_or_path` as the base model + logger.warning( + f"The base model `{base_model_name}` of the LoRA configuration associated" + f" to `{args.peft_model}` does not exist locally or remotely. Using " + f"`--model_name_or_path {args.model_name_or_path}` as a fall back for the base model." + ) + from peft import PeftModel + + model = AutoModelForCausalLM.from_pretrained( + args.model_name_or_path, torch_dtype=model_dtype, **model_kwargs + ) + model = PeftModel.from_pretrained( + model, args.peft_model, torch_dtype=model_dtype, **model_kwargs + ) + if hasattr(model, "merge_and_unload"): + model = model.merge_and_unload() + if model_dtype == torch.bfloat16: + model = model.to(torch.bfloat16) + return model + else: + from optimum.habana.peft.peft_model import ( + gaudi_generate, + gaudi_prepare_inputs_for_generation, + ) + + model.__class__.generate = gaudi_generate + model.__class__.prepare_inputs_for_generation = ( + gaudi_prepare_inputs_for_generation + ) + return model + + +def setup_tokenizer(args, model, assistant_model): + tokenizer_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.bad_words is not None or args.force_words is not None: + tokenizer_kwargs["add_prefix_space"] = True + tokenizer = AutoTokenizer.from_pretrained( + args.model_name_or_path, **tokenizer_kwargs + ) + if not model.config.is_encoder_decoder: + tokenizer.padding_side = "left" + + if model.config.model_type == "llama": + # unwind broken decapoda-research config + model.generation_config.pad_token_id = 0 + model.generation_config.bos_token_id = 1 + model.generation_config.eos_token_id = 2 + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = 0 + assistant_model.generation_config.bos_token_id = 1 + assistant_model.generation_config.eos_token_id = 2 + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + if model.config.model_type == "persimmon": + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not 
None: + assistant_model.generation_config.pad_token_id = ( + assistant_model.generation_config.eos_token_id + ) + tokenizer.bos_token_id = model.generation_config.bos_token_id + tokenizer.eos_token_id = model.generation_config.eos_token_id + tokenizer.pad_token_id = model.generation_config.pad_token_id + tokenizer.pad_token = tokenizer.decode(tokenizer.pad_token_id) + tokenizer.eos_token = tokenizer.decode(tokenizer.eos_token_id) + tokenizer.bos_token = tokenizer.decode(tokenizer.bos_token_id) + + # Some models like GPT2 do not have a PAD token so we have to set it if necessary + if tokenizer.pad_token is None: + tokenizer.pad_token = tokenizer.eos_token + model.generation_config.pad_token_id = model.generation_config.eos_token_id + if assistant_model is not None: + assistant_model.generation_config.pad_token_id = ( + assistant_model.generation_config.eos_token_id + ) + + return tokenizer, model, assistant_model + + +def setup_generation_config(args, model, assistant_model, tokenizer): + bad_words_ids = None + force_words_ids = None + if args.bad_words is not None: + bad_words_ids = [ + tokenizer.encode(bad_word, add_special_tokens=False) + for bad_word in args.bad_words + ] + if args.force_words is not None: + force_words_ids = [ + tokenizer.encode(force_word, add_special_tokens=False) + for force_word in args.force_words + ] + + is_optimized = model_is_optimized(model.config) + + # Generation configuration + generation_config = copy.deepcopy(model.generation_config) + generation_config.max_new_tokens = args.max_new_tokens + generation_config.use_cache = args.use_kv_cache + generation_config.static_shapes = is_optimized and assistant_model is None + generation_config.bucket_size = args.bucket_size if is_optimized else -1 + generation_config.bucket_internal = args.bucket_internal + generation_config.do_sample = args.do_sample + generation_config.num_beams = args.num_beams + generation_config.bad_words_ids = bad_words_ids + generation_config.force_words_ids = force_words_ids + generation_config.num_return_sequences = args.num_return_sequences + generation_config.trim_logits = args.trim_logits + generation_config.attn_softmax_bf16 = args.attn_softmax_bf16 + generation_config.limit_hpu_graphs = args.limit_hpu_graphs + generation_config.reuse_cache = args.reuse_cache + generation_config.reduce_recompile = args.reduce_recompile + if generation_config.reduce_recompile: + assert generation_config.bucket_size > 0 + generation_config.use_flash_attention = args.use_flash_attention + generation_config.flash_attention_recompute = args.flash_attention_recompute + generation_config.flash_attention_causal_mask = args.flash_attention_causal_mask + generation_config.flash_attention_fast_softmax = args.flash_attention_fast_softmax + generation_config.trust_remote_code = args.trust_remote_code + + return generation_config + + +def exclude_hpu_graph_configs(args): + # Excluded configs for batch size 1 for hpu graph + if args.batch_size == 1 and args.limit_hpu_graphs: + if ( + "falcon-180B" in args.model_name_or_path + or "falcon-180b" in args.model_name_or_path + ): + return False + if args.world_size == 2 or args.world_size == 4 or args.world_size == 8: + if args.quant_config: + if args.max_input_tokens >= 8192 and args.max_new_tokens >= 128: + return False + else: + if args.max_input_tokens >= 4096 and args.max_new_tokens >= 128: + return False + return True + else: + return False + + +def initialize_model(args, logger): + init_start = time.perf_counter() + setup_distributed(args) + if 
exclude_hpu_graph_configs(args): + args.limit_hpu_graphs = False + override_prints(args.global_rank == 0 or args.verbose_workers, logger) + setup_env(args) + setup_device(args) + set_seed(args.seed) + get_repo_root(args.model_name_or_path, local_rank=args.local_rank, token=args.token) + if args.assistant_model is not None: + get_repo_root( + args.assistant_model, local_rank=args.local_rank, token=args.token + ) + use_deepspeed = False + if use_deepspeed or args.bf16: + model_dtype = torch.bfloat16 + else: + model_dtype = torch.float + args.attn_softmax_bf16 = False + + model_kwargs = { + "revision": args.model_revision, + "token": args.token, + "trust_remote_code": args.trust_remote_code, + } + if args.trust_remote_code: + logger.warning( + "`trust_remote_code` is set, there is no guarantee this model works properly and it may fail" + ) + + model, assistant_model = ( + setup_model(args, model_dtype, model_kwargs, logger) + if not use_deepspeed + else setup_distributed_model(args, model_dtype, model_kwargs, logger) + ) + tokenizer, model, assistant_model = setup_tokenizer(args, model, assistant_model) + generation_config = setup_generation_config(args, model, assistant_model, tokenizer) + + if args.const_serialization_path: + setup_const_serialization(args.const_serialization_path) + if args.quant_config: + model = setup_inference(args, model) + init_end = time.perf_counter() + logger.info(f"Args: {args}") + logger.info( + f"device: {args.device}, n_hpu: {args.world_size}, bf16: {model_dtype == torch.bfloat16}" + ) + logger.info(f"Model initialization took {(init_end - init_start):.3f}s") + return model, assistant_model, tokenizer, generation_config diff --git a/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml new file mode 100644 index 0000000000000..c540f328ad284 --- /dev/null +++ b/llama-index-integrations/llms/llama-index-llms-gaudi/pyproject.toml @@ -0,0 +1,71 @@ +[build-system] +build-backend = "poetry.core.masonry.api" +requires = ["poetry-core"] + +[tool.codespell] +check-filenames = true +check-hidden = true +# Feel free to un-skip examples, and experimental, you will just need to +# work through many typos (--write-changes and --interactive will help) +skip = "*.csv,*.html,*.json,*.jsonl,*.pdf,*.txt,*.ipynb" + +[tool.llamahub] +contains_example = false +import_path = "llama_index.llms.gaudi" + +[tool.llamahub.class_authors] +GaudiLLM = "llama-index" + +[tool.mypy] +disallow_untyped_defs = true +# Remove venv skip when integrated with pre-commit +exclude = ["_static", "build", "examples", "notebooks", "venv"] +ignore_missing_imports = true +python_version = "3.10" + +[tool.poetry] +authors = ["Your Name "] +description = "llama-index llms gaudi integration" +license = "MIT" +name = "llama-index-llms-gaudi" +packages = [{include = "llama_index/"}] +readme = "README.md" +version = "0.1.0" + +[tool.poetry.dependencies] +python = ">=3.9,<4.0" +huggingface-hub = "^0.23.0" +torch = "^2.1.2" +text-generation = "^0.7.0" +llama-index-core = "^0.11.0" +llama-index-llms-huggingface = "^0.3.0" +optimum = {extras = ["habana"], version = ">=1.21.2"} + +[tool.poetry.dependencies.transformers] +extras = ["torch"] +version = "^4.37.0" + +[tool.poetry.group.dev.dependencies] +ipython = "8.10.0" +jupyter = "^1.0.0" +mypy = "0.991" +pre-commit = "3.2.0" +pylint = "2.15.10" +pytest = "7.2.1" +pytest-mock = "3.11.1" +ruff = "0.0.292" +tree-sitter-languages = "^1.8.0" +types-Deprecated = ">=0.1.0" +types-PyYAML = 
"^6.0.12.12" +types-protobuf = "^4.24.0.4" +types-redis = "4.5.5.0" +types-requests = "2.28.11.8" +types-setuptools = "67.1.0.0" + +[tool.poetry.group.dev.dependencies.black] +extras = ["jupyter"] +version = "<=23.9.1,>=23.7.0" + +[tool.poetry.group.dev.dependencies.codespell] +extras = ["toml"] +version = ">=v2.2.6"