diff --git a/README.md b/README.md index 517226dc5..92778486f 100644 --- a/README.md +++ b/README.md @@ -103,7 +103,8 @@ At present, we have introduced several key features to showcase our current capa We offer extensive model support, including dozens of large language models (LLMs) from both open-source and API agents, such as LLaMA/LLaMA2, Baichuan, ChatGLM, Wenxin, Tongyi, Zhipu, and many more. - News - - 🔥🔥🔥 [qwen-72b-chat](https://huggingface.co/Qwen/Qwen-72B-Chat) + - 🔥🔥🔥 [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) + - 🔥🔥🔥 [Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat) - 🔥🔥🔥 [Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat) - [More Supported LLMs](http://docs.dbgpt.site/docs/modules/smmf) diff --git a/README.zh.md b/README.zh.md index 0ba4c40af..61cbc7def 100644 --- a/README.zh.md +++ b/README.zh.md @@ -111,7 +111,8 @@ DB-GPT是一个开源的数据库领域大模型框架。目的是构建大模 海量模型支持,包括开源、API代理等几十种大语言模型。如LLaMA/LLaMA2、Baichuan、ChatGLM、文心、通义、智谱等。当前已支持如下模型: - 新增支持模型 - - 🔥🔥🔥 [qwen-72b-chat](https://huggingface.co/Qwen/Qwen-72B-Chat) + - 🔥🔥🔥 [Mixtral-8x7B](https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1) + - 🔥🔥🔥 [Qwen-72B-Chat](https://huggingface.co/Qwen/Qwen-72B-Chat) - 🔥🔥🔥 [Yi-34B-Chat](https://huggingface.co/01-ai/Yi-34B-Chat) - [更多开源模型](https://www.yuque.com/eosphoros/dbgpt-docs/iqaaqwriwhp6zslc#qQktR) diff --git a/dbgpt/app/chat_adapter.py b/dbgpt/app/chat_adapter.py index 474695beb..c1cb192b1 100644 --- a/dbgpt/app/chat_adapter.py +++ b/dbgpt/app/chat_adapter.py @@ -245,7 +245,7 @@ def get_conv_template(self, model_path: str) -> Conversation: class LlamaCppChatAdapter(BaseChatAdpter): def match(self, model_path: str): - from dbgpt.model.adapter import LlamaCppAdapater + from dbgpt.model.adapter.old_adapter import LlamaCppAdapater if "llama-cpp" == model_path: return True diff --git a/dbgpt/configs/model_config.py b/dbgpt/configs/model_config.py index ccbe4e3d4..c9f123677 100644 --- a/dbgpt/configs/model_config.py +++ 
b/dbgpt/configs/model_config.py @@ -113,7 +113,9 @@ def get_device() -> str: # https://huggingface.co/microsoft/Orca-2-13b "orca-2-13b": os.path.join(MODEL_PATH, "Orca-2-13b"), # https://huggingface.co/openchat/openchat_3.5 - "openchat_3.5": os.path.join(MODEL_PATH, "openchat_3.5"), + "openchat-3.5": os.path.join(MODEL_PATH, "openchat_3.5"), + # https://huggingface.co/openchat/openchat-3.5-1210 + "openchat-3.5-1210": os.path.join(MODEL_PATH, "openchat-3.5-1210"), # https://huggingface.co/hfl/chinese-alpaca-2-7b "chinese-alpaca-2-7b": os.path.join(MODEL_PATH, "chinese-alpaca-2-7b"), # https://huggingface.co/hfl/chinese-alpaca-2-13b @@ -124,6 +126,10 @@ def get_device() -> str: "zephyr-7b-alpha": os.path.join(MODEL_PATH, "zephyr-7b-alpha"), # https://huggingface.co/mistralai/Mistral-7B-Instruct-v0.1 "mistral-7b-instruct-v0.1": os.path.join(MODEL_PATH, "Mistral-7B-Instruct-v0.1"), + # https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + "mixtral-8x7b-instruct-v0.1": os.path.join( + MODEL_PATH, "Mixtral-8x7B-Instruct-v0.1" + ), # https://huggingface.co/Open-Orca/Mistral-7B-OpenOrca "mistral-7b-openorca": os.path.join(MODEL_PATH, "Mistral-7B-OpenOrca"), # https://huggingface.co/Xwin-LM/Xwin-LM-7B-V0.1 diff --git a/dbgpt/model/adapter/__init__.py b/dbgpt/model/adapter/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/dbgpt/model/adapter/base.py b/dbgpt/model/adapter/base.py new file mode 100644 index 000000000..df1d9441a --- /dev/null +++ b/dbgpt/model/adapter/base.py @@ -0,0 +1,437 @@ +from abc import ABC, abstractmethod +from typing import Dict, List, Optional, Any, Tuple, Type, Callable +import logging +from dbgpt.core.interface.message import ModelMessage, ModelMessageRoleType +from dbgpt.model.base import ModelType +from dbgpt.model.parameter import ( + BaseModelParameters, + ModelParameters, + LlamaCppModelParameters, + ProxyModelParameters, +) +from dbgpt.model.adapter.template import ( + get_conv_template, + ConversationAdapter, + 
ConversationAdapterFactory, +) + +logger = logging.getLogger(__name__) + + +class LLMModelAdapter(ABC): + """New Adapter for DB-GPT LLM models""" + + model_name: Optional[str] = None + model_path: Optional[str] = None + conv_factory: Optional[ConversationAdapterFactory] = None + # TODO: more flexible quantization config + support_4bit: bool = False + support_8bit: bool = False + support_system_message: bool = True + + def __repr__(self) -> str: + return f"<{self.__class__.__name__} model_name={self.model_name} model_path={self.model_path}>" + + def __str__(self): + return self.__repr__() + + @abstractmethod + def new_adapter(self, **kwargs) -> "LLMModelAdapter": + """Create a new adapter instance + + Args: + **kwargs: The parameters of the new adapter instance + + Returns: + LLMModelAdapter: The new adapter instance + """ + + def use_fast_tokenizer(self) -> bool: + """Whether use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported + for a given model. + """ + return False + + def model_type(self) -> str: + return ModelType.HF + + def model_param_class(self, model_type: str = None) -> Type[BaseModelParameters]: + """Get the startup parameters instance of the model + + Args: + model_type (str, optional): The type of model. Defaults to None. + + Returns: + Type[BaseModelParameters]: The startup parameters instance of the model + """ + # """Get the startup parameters instance of the model""" + model_type = model_type if model_type else self.model_type() + if model_type == ModelType.LLAMA_CPP: + return LlamaCppModelParameters + elif model_type == ModelType.PROXY: + return ProxyModelParameters + return ModelParameters + + def match( + self, + model_type: str, + model_name: Optional[str] = None, + model_path: Optional[str] = None, + ) -> bool: + """Whether the model adapter can load the given model + + Args: + model_type (str): The type of model + model_name (Optional[str], optional): The name of model. Defaults to None. 
+ model_path (Optional[str], optional): The path of model. Defaults to None. + """ + return False + + def support_quantization_4bit(self) -> bool: + """Whether the model adapter can load 4bit model + + If it is True, we will load the 4bit model with :meth:`~LLMModelAdapter.load` + + Returns: + bool: Whether the model adapter can load 4bit model, default is False + """ + return self.support_4bit + + def support_quantization_8bit(self) -> bool: + """Whether the model adapter can load 8bit model + + If it is True, we will load the 8bit model with :meth:`~LLMModelAdapter.load` + + Returns: + bool: Whether the model adapter can load 8bit model, default is False + """ + return self.support_8bit + + def load(self, model_path: str, from_pretrained_kwargs: dict): + """Load model and tokenizer""" + raise NotImplementedError + + def load_from_params(self, params): + """Load the model and tokenizer according to the given parameters""" + raise NotImplementedError + + def support_async(self) -> bool: + """Whether the loaded model supports asynchronous calls""" + return False + + def get_generate_stream_function(self, model, model_path: str): + """Get the generate stream function of the model""" + raise NotImplementedError + + def get_async_generate_stream_function(self, model, model_path: str): + """Get the asynchronous generate stream function of the model""" + raise NotImplementedError + + def get_default_conv_template( + self, model_name: str, model_path: str + ) -> Optional[ConversationAdapter]: + """Get the default conversation template + + Args: + model_name (str): The name of the model. + model_path (str): The path of the model. + + Returns: + Optional[ConversationAdapter]: The conversation template. 
+ """ + raise NotImplementedError + + def get_default_message_separator(self) -> str: + """Get the default message separator""" + try: + conv_template = self.get_default_conv_template( + self.model_name, self.model_path + ) + return conv_template.sep + except Exception: + return "\n" + + def transform_model_messages( + self, messages: List[ModelMessage] + ) -> List[Dict[str, str]]: + """Transform the model messages + + Default is the OpenAI format, example: + .. code-block:: python + return_messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi"} + ] + + But some model may need to transform the messages to other format(e.g. There is no system message), such as: + .. code-block:: python + return_messages = [ + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi"} + ] + Args: + messages (List[ModelMessage]): The model messages + + Returns: + List[Dict[str, str]]: The transformed model messages + """ + logger.info(f"support_system_message: {self.support_system_message}") + if not self.support_system_message: + return self._transform_to_no_system_messages(messages) + else: + return ModelMessage.to_openai_messages(messages) + + def _transform_to_no_system_messages( + self, messages: List[ModelMessage] + ) -> List[Dict[str, str]]: + """Transform the model messages to no system messages + + Some opensource chat model no system messages, so wo should transform the messages to no system messages. + + Merge the system messages to the last user message, example: + .. 
code-block:: python + return_messages = [ + {"role": "system", "content": "You are a helpful assistant"}, + {"role": "user", "content": "Hello"}, + {"role": "assistant", "content": "Hi"} + ] + => + return_messages = [ + {"role": "user", "content": "You are a helpful assistant\nHello"}, + {"role": "assistant", "content": "Hi"} + ] + + Args: + messages (List[ModelMessage]): The model messages + + Returns: + List[Dict[str, str]]: The transformed model messages + """ + openai_messages = ModelMessage.to_openai_messages(messages) + system_messages = [] + return_messages = [] + for message in openai_messages: + if message["role"] == "system": + system_messages.append(message["content"]) + else: + return_messages.append(message) + if len(system_messages) > 1: + # Too much system messages should be a warning + logger.warning("Your system messages have more than one message") + if system_messages: + sep = self.get_default_message_separator() + str_system_messages = ",".join(system_messages) + # Update last user message + return_messages[-1]["content"] = ( + str_system_messages + sep + return_messages[-1]["content"] + ) + return return_messages + + def get_str_prompt( + self, + params: Dict, + messages: List[ModelMessage], + tokenizer: Any, + prompt_template: str = None, + ) -> Optional[str]: + """Get the string prompt from the given parameters and messages + + If the value of return is not None, we will skip :meth:`~LLMModelAdapter.get_prompt_with_template` and use the value of return. + + Args: + params (Dict): The parameters + messages (List[ModelMessage]): The model messages + tokenizer (Any): The tokenizer of model, in huggingface chat model, we can create the prompt by tokenizer + prompt_template (str, optional): The prompt template. Defaults to None. 
+ + Returns: + Optional[str]: The string prompt + """ + return None + + def get_prompt_with_template( + self, + params: Dict, + messages: List[ModelMessage], + model_name: str, + model_path: str, + model_context: Dict, + prompt_template: str = None, + ): + conv: ConversationAdapter = self.get_default_conv_template( + model_name, model_path + ) + + if prompt_template: + logger.info(f"Use prompt template {prompt_template} from config") + conv = get_conv_template(prompt_template) + if not conv or not messages: + # Nothing to do + logger.info( + f"No conv from model_path {model_path} or no messages in params, {self}" + ) + return None, None, None + + conv = conv.copy() + system_messages = [] + user_messages = [] + ai_messages = [] + + for message in messages: + if isinstance(message, ModelMessage): + role = message.role + content = message.content + elif isinstance(message, dict): + role = message["role"] + content = message["content"] + else: + raise ValueError(f"Invalid message type: {message}") + + if role == ModelMessageRoleType.SYSTEM: + # Support for multiple system messages + system_messages.append(content) + elif role == ModelMessageRoleType.HUMAN: + # conv.append_message(conv.roles[0], content) + user_messages.append(content) + elif role == ModelMessageRoleType.AI: + # conv.append_message(conv.roles[1], content) + ai_messages.append(content) + else: + raise ValueError(f"Unknown role: {role}") + + can_use_systems: [] = [] + if system_messages: + if len(system_messages) > 1: + # Compatible with dbgpt complex scenarios, the last system will protect more complete information + # entered by the current user + user_messages[-1] = system_messages[-1] + can_use_systems = system_messages[:-1] + else: + can_use_systems = system_messages + + for i in range(len(user_messages)): + conv.append_message(conv.roles[0], user_messages[i]) + if i < len(ai_messages): + conv.append_message(conv.roles[1], ai_messages[i]) + + # TODO join all system messages may not be a good idea + 
conv.set_system_message("".join(can_use_systems)) + # Add a blank message for the assistant. + conv.append_message(conv.roles[1], None) + new_prompt = conv.get_prompt() + return new_prompt, conv.stop_str, conv.stop_token_ids + + def model_adaptation( + self, + params: Dict, + model_name: str, + model_path: str, + tokenizer: Any, + prompt_template: str = None, + ) -> Tuple[Dict, Dict]: + """Params adaptation""" + messages = params.get("messages") + # Some model context to dbgpt server + model_context = {"prompt_echo_len_char": -1, "has_format_prompt": False} + if messages: + # Dict message to ModelMessage + messages = [ + m if isinstance(m, ModelMessage) else ModelMessage(**m) + for m in messages + ] + params["messages"] = messages + + new_prompt = self.get_str_prompt(params, messages, tokenizer, prompt_template) + conv_stop_str, conv_stop_token_ids = None, None + if not new_prompt: + ( + new_prompt, + conv_stop_str, + conv_stop_token_ids, + ) = self.get_prompt_with_template( + params, messages, model_name, model_path, model_context, prompt_template + ) + if not new_prompt: + return params, model_context + + # Overwrite the original prompt + # TODO remote bos token and eos token from tokenizer_config.json of model + prompt_echo_len_char = len(new_prompt.replace("", "").replace("", "")) + model_context["prompt_echo_len_char"] = prompt_echo_len_char + model_context["echo"] = params.get("echo", True) + model_context["has_format_prompt"] = True + params["prompt"] = new_prompt + + custom_stop = params.get("stop") + custom_stop_token_ids = params.get("stop_token_ids") + + # Prefer the value passed in from the input parameter + params["stop"] = custom_stop or conv_stop_str + params["stop_token_ids"] = custom_stop_token_ids or conv_stop_token_ids + + return params, model_context + + +class AdapterEntry: + """The entry of model adapter""" + + def __init__( + self, + model_adapter: LLMModelAdapter, + match_funcs: List[Callable[[str, str, str], bool]] = None, + ): + 
self.model_adapter = model_adapter + self.match_funcs = match_funcs or [] + + +model_adapters: List[AdapterEntry] = [] + + +def register_model_adapter( + model_adapter_cls: Type[LLMModelAdapter], + match_funcs: List[Callable[[str, str, str], bool]] = None, +) -> None: + """Register a model adapter. + + Args: + model_adapter_cls (Type[LLMModelAdapter]): The model adapter class. + match_funcs (List[Callable[[str, str, str], bool]], optional): The match functions. Defaults to None. + """ + model_adapters.append(AdapterEntry(model_adapter_cls(), match_funcs)) + + +def get_model_adapter( + model_type: str, + model_name: str, + model_path: str, + conv_factory: Optional[ConversationAdapterFactory] = None, +) -> Optional[LLMModelAdapter]: + """Get a model adapter. + + Args: + model_type (str): The type of the model. + model_name (str): The name of the model. + model_path (str): The path of the model. + conv_factory (Optional[ConversationAdapterFactory], optional): The conversation factory. Defaults to None. + Returns: + Optional[LLMModelAdapter]: The model adapter. 
+ """ + adapter = None + # First find adapter by model_name + for adapter_entry in model_adapters: + if adapter_entry.model_adapter.match(model_type, model_name, None): + adapter = adapter_entry.model_adapter + break + for adapter_entry in model_adapters: + if adapter_entry.model_adapter.match(model_type, None, model_path): + adapter = adapter_entry.model_adapter + break + if adapter: + new_adapter = adapter.new_adapter() + new_adapter.model_name = model_name + new_adapter.model_path = model_path + if conv_factory: + new_adapter.conv_factory = conv_factory + return new_adapter + return None diff --git a/dbgpt/model/adapter/fschat_adapter.py b/dbgpt/model/adapter/fschat_adapter.py new file mode 100644 index 000000000..ca5413645 --- /dev/null +++ b/dbgpt/model/adapter/fschat_adapter.py @@ -0,0 +1,262 @@ +"""Adapter for fastchat + +You can import fastchat only in this file, so that the user does not need to install fastchat if he does not use it. +""" +import os +import threading +import logging +from functools import cache +from typing import TYPE_CHECKING, Callable, Tuple, List, Optional + +try: + from fastchat.conversation import ( + Conversation, + register_conv_template, + SeparatorStyle, + ) +except ImportError as exc: + raise ValueError( + "Could not import python package: fschat " + "Please install fastchat by command `pip install fschat` " + ) from exc + +from dbgpt.model.adapter.template import ConversationAdapter, PromptType +from dbgpt.model.adapter.base import LLMModelAdapter + +if TYPE_CHECKING: + from fastchat.model.model_adapter import BaseModelAdapter + from torch.nn import Module as TorchNNModule + +logger = logging.getLogger(__name__) + +thread_local = threading.local() +_IS_BENCHMARK = os.getenv("DB_GPT_MODEL_BENCHMARK", "False").lower() == "true" + +# If some model is not in the blacklist, but it still affects the loading of DB-GPT, you can add it to the blacklist. 
# Names of fastchat conversation templates whose adapters must be removed from
# fastchat's adapter registry before DB-GPT resolves an adapter.
__BLACK_LIST_MODEL_PROMPT = []


class FschatConversationAdapter(ConversationAdapter):
    """The conversation adapter for fschat.

    Thin delegate around a fastchat ``Conversation``: every property and
    method forwards to the wrapped object.
    """

    def __init__(self, conv: Conversation):
        """Wrap ``conv``; the object is stored as-is, not copied."""
        self._conv = conv

    @property
    def prompt_type(self) -> PromptType:
        """Always ``PromptType.FSCHAT``."""
        return PromptType.FSCHAT

    @property
    def roles(self) -> Tuple[str, ...]:
        """The role names of the wrapped conversation."""
        return self._conv.roles

    @property
    def sep(self) -> Optional[str]:
        """The message separator of the wrapped conversation."""
        return self._conv.sep

    @property
    def stop_str(self) -> str:
        """The stop string of the wrapped conversation.

        NOTE(review): templates registered in this file pass a list of strings
        as ``stop_str``, so the value is not always a plain ``str``.
        """
        return self._conv.stop_str

    @property
    def stop_token_ids(self) -> Optional[List[int]]:
        """The stop token ids of the wrapped conversation."""
        return self._conv.stop_token_ids

    def get_prompt(self) -> str:
        """Get the prompt string."""
        return self._conv.get_prompt()

    def set_system_message(self, system_message: str) -> None:
        """Set the system message."""
        self._conv.set_system_message(system_message)

    def append_message(self, role: str, message: str) -> None:
        """Append a new message.

        Args:
            role (str): The role of the message.
            message (str): The message content.
        """
        self._conv.append_message(role, message)

    def update_last_message(self, message: str) -> None:
        """Update the last output.

        The last message is typically set to be None when constructing the prompt,
        so we need to update it in-place after getting the response from a model.

        Args:
            message (str): The message content.
        """
        self._conv.update_last_message(message)

    def copy(self) -> "ConversationAdapter":
        """Return a new adapter wrapping a copy of the conversation."""
        return FschatConversationAdapter(self._conv.copy())


class FastChatLLMModelAdapterWrapper(LLMModelAdapter):
    """Wrapping fastchat adapter"""

    def __init__(self, adapter: "BaseModelAdapter") -> None:
        """Store the fastchat adapter that all calls are delegated to."""
        self._adapter = adapter

    def new_adapter(self, **kwargs) -> "LLMModelAdapter":
        """Create a new wrapper around the same fastchat adapter.

        ``kwargs`` are accepted for interface compatibility and ignored.
        """
        return FastChatLLMModelAdapterWrapper(self._adapter)

    def use_fast_tokenizer(self) -> bool:
        """Return the wrapped adapter's ``use_fast_tokenizer`` attribute."""
        # Read as an attribute (not called) on the wrapped adapter.
        return self._adapter.use_fast_tokenizer

    def load(self, model_path: str, from_pretrained_kwargs: dict):
        """Load the model through the wrapped adapter's ``load_model``."""
        return self._adapter.load_model(model_path, from_pretrained_kwargs)

    def get_generate_stream_function(self, model: "TorchNNModule", model_path: str):
        """Get the generate stream function of the model.

        In benchmark mode (``_IS_BENCHMARK``) the DB-GPT benchmark
        implementation is returned; otherwise fastchat chooses the function.
        """
        if _IS_BENCHMARK:
            from dbgpt.util.benchmarks.llm.fastchat_benchmarks_inference import (
                generate_stream,
            )

            return generate_stream

        from fastchat.model.model_adapter import get_generate_stream_function

        return get_generate_stream_function(model, model_path)

    def get_default_conv_template(
        self, model_name: str, model_path: str
    ) -> Optional[ConversationAdapter]:
        """Get the default conversation template of the wrapped adapter.

        Args:
            model_name (str): The name of the model (unused here).
            model_path (str): The path passed to the fastchat adapter.

        Returns:
            Optional[ConversationAdapter]: The wrapped template, or None when
                the fastchat adapter returns a falsy template.
        """
        conv_template = self._adapter.get_default_conv_template(model_path)
        return FschatConversationAdapter(conv_template) if conv_template else None

    def __str__(self) -> str:
        return "{}({}.{})".format(
            self.__class__.__name__,
            self._adapter.__class__.__module__,
            self._adapter.__class__.__name__,
        )


def _get_fastchat_model_adapter(
    model_name: str,
    model_path: str,
    caller: Optional[Callable[[str], "BaseModelAdapter"]] = None,
    use_fastchat_monkey_patch: bool = False,
):
    """Run ``caller(model_path)`` with ``model_name`` published thread-locally.

    Args:
        model_name (str): The model name, exposed as ``thread_local.model_name``
            for the duration of the call.
        model_path (str): The path handed to ``caller``.
        caller: Callback that resolves the adapter. Defaults to None.
        use_fastchat_monkey_patch (bool): Whether to temporarily replace
            ``fastchat.model.model_adapter.get_model_adapter`` with
            :func:`_fastchat_get_adapter_monkey_patch`. Defaults to False.

    Returns:
        The result of ``caller(model_path)``, or None when no caller is given.
    """
    from fastchat.model import model_adapter

    missing = object()
    original_get_model_adapter = model_adapter.get_model_adapter
    previous_model_name = getattr(thread_local, "model_name", missing)
    try:
        if use_fastchat_monkey_patch:
            model_adapter.get_model_adapter = _fastchat_get_adapter_monkey_patch
        thread_local.model_name = model_name
        _remove_black_list_model_of_fastchat()
        if caller:
            return caller(model_path)
        return None
    finally:
        # Restore fastchat first so a problem with the thread-local state can
        # never leave the library permanently monkey-patched.
        model_adapter.get_model_adapter = original_get_model_adapter
        if previous_model_name is missing:
            # A plain ``del`` raises AttributeError if the attribute is already
            # gone (e.g. after a nested call), masking the real exception.
            if hasattr(thread_local, "model_name"):
                del thread_local.model_name
        else:
            thread_local.model_name = previous_model_name


def _fastchat_get_adapter_monkey_patch(model_path: str, model_name: str = None):
    """Find a fastchat adapter by model name first, then by model path.

    Matching order: ``model_name``, then the base name of ``model_path``,
    then the full ``model_path``. The first adapter that matches wins.

    Args:
        model_path (str): The path of the model.
        model_name (str, optional): The name of the model. When falsy, it is
            read from ``thread_local.model_name``.

    Raises:
        RuntimeError: If no model name is given and none is set thread-locally.
        ValueError: If no registered fastchat adapter matches.
    """
    if not model_name:
        if not hasattr(thread_local, "model_name"):
            raise RuntimeError("fastchat get adapter monkey patch need model_name")
        model_name = thread_local.model_name
    from fastchat.model.model_adapter import model_adapters

    for adapter in model_adapters:
        if adapter.match(model_name):
            logger.info(
                f"Found llm model adapter with model name: {model_name}, {adapter}"
            )
            return adapter

    model_path_basename = (
        None if not model_path else os.path.basename(os.path.normpath(model_path))
    )
    if model_path_basename:
        for adapter in model_adapters:
            if adapter.match(model_path_basename):
                logger.info(
                    f"Found llm model adapter with model path: {model_path} and base name: {model_path_basename}, {adapter}"
                )
                return adapter

    if model_path:
        for adapter in model_adapters:
            if adapter.match(model_path):
                logger.info(
                    f"Found llm model adapter with model path: {model_path}, {adapter}"
                )
                return adapter

    raise ValueError(
        f"Invalid model adapter for model name {model_name} and model path {model_path}"
    )


@cache
def _remove_black_list_model_of_fastchat():
    """Remove blacklisted adapters from fastchat's adapter registry.

    An adapter is blacklisted when the name of its default conversation
    template is listed in ``__BLACK_LIST_MODEL_PROMPT``. The function is
    cached, so the registry is scanned at most once per process.
    """
    if not __BLACK_LIST_MODEL_PROMPT:
        # Nothing can match an empty blacklist; skip probing every adapter.
        return

    from fastchat.model.model_adapter import model_adapters

    black_list_models = []
    for adapter in model_adapters:
        try:
            template_name = adapter.get_default_conv_template(
                "/data/not_exist_model_path"
            ).name
        except Exception as e:
            # Best effort: an adapter that cannot build a template for the
            # dummy path is simply left in the registry.
            logger.debug(f"Skip black list check for adapter {adapter}: {e}")
            continue
        if template_name in __BLACK_LIST_MODEL_PROMPT:
            black_list_models.append(adapter)
    # Remove after the scan so the registry is not mutated while iterating it.
    for adapter in black_list_models:
        model_adapters.remove(adapter)


# Covering the configuration of fastchat, we will regularly feedback the code here to fastchat.
# We also recommend that you modify it directly in the fastchat repository.
+ +# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212 +register_conv_template( + Conversation( + name="aquila-legacy", + system_message="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", + roles=("### Human: ", "### Assistant: ", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.NO_COLON_TWO, + sep="\n", + sep2="", + stop_str=["", "[UNK]"], + ), + override=True, +) +# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227 +register_conv_template( + Conversation( + name="aquila", + system_message="A chat between a curious human and an artificial intelligence assistant. " + "The assistant gives helpful, detailed, and polite answers to the human's questions.", + roles=("Human", "Assistant", "System"), + messages=(), + offset=0, + sep_style=SeparatorStyle.ADD_COLON_TWO, + sep="###", + sep2="", + stop_str=["", "[UNK]"], + ), + override=True, +) +# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242 +register_conv_template( + Conversation( + name="aquila-v1", + roles=("<|startofpiece|>", "<|endofpiece|>", ""), + messages=(), + offset=0, + sep_style=SeparatorStyle.NO_COLON_TWO, + sep="", + sep2="", + stop_str=["", "<|endoftext|>"], + ), + override=True, +) diff --git a/dbgpt/model/adapter/hf_adapter.py b/dbgpt/model/adapter/hf_adapter.py new file mode 100644 index 000000000..673223b49 --- /dev/null +++ b/dbgpt/model/adapter/hf_adapter.py @@ -0,0 +1,136 @@ +from abc import ABC, abstractmethod +from typing import Dict, Optional, List, Any +import logging + +from dbgpt.core import ModelMessage +from dbgpt.model.base import ModelType +from dbgpt.model.adapter.base import LLMModelAdapter, register_model_adapter + +logger = logging.getLogger(__name__) + + +class 
NewHFChatModelAdapter(LLMModelAdapter, ABC): + """Model adapter for new huggingface chat models + + See https://huggingface.co/docs/transformers/main/en/chat_templating + + We can transform the inference chat messages to chat model instead of create a + prompt template for this model + """ + + def new_adapter(self, **kwargs) -> "NewHFChatModelAdapter": + return self.__class__() + + def match( + self, + model_type: str, + model_name: Optional[str] = None, + model_path: Optional[str] = None, + ) -> bool: + if model_type != ModelType.HF: + return False + if model_name is None and model_path is None: + return False + model_name = model_name.lower() if model_name else None + model_path = model_path.lower() if model_path else None + return self.do_match(model_name) or self.do_match(model_path) + + @abstractmethod + def do_match(self, lower_model_name_or_path: Optional[str] = None): + raise NotImplementedError() + + def load(self, model_path: str, from_pretrained_kwargs: dict): + try: + import transformers + from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel + except ImportError as exc: + raise ValueError( + "Could not import depend python package " + "Please install it with `pip install transformers`." 
+ ) from exc + if not transformers.__version__ >= "4.34.0": + raise ValueError( + "Current model (Load by NewHFChatModelAdapter) require transformers.__version__>=4.34.0" + ) + revision = from_pretrained_kwargs.get("revision", "main") + try: + tokenizer = AutoTokenizer.from_pretrained( + model_path, + use_fast=self.use_fast_tokenizer(), + revision=revision, + trust_remote_code=True, + ) + except TypeError: + tokenizer = AutoTokenizer.from_pretrained( + model_path, use_fast=False, revision=revision, trust_remote_code=True + ) + try: + model = AutoModelForCausalLM.from_pretrained( + model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs + ) + except NameError: + model = AutoModel.from_pretrained( + model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs + ) + # tokenizer.use_default_system_prompt = False + return model, tokenizer + + def get_generate_stream_function(self, model, model_path: str): + """Get the generate stream function of the model""" + from dbgpt.model.llm_out.hf_chat_llm import huggingface_chat_generate_stream + + return huggingface_chat_generate_stream + + def get_str_prompt( + self, + params: Dict, + messages: List[ModelMessage], + tokenizer: Any, + prompt_template: str = None, + ) -> Optional[str]: + from transformers import AutoTokenizer + + if not tokenizer: + raise ValueError("tokenizer is is None") + tokenizer: AutoTokenizer = tokenizer + + messages = self.transform_model_messages(messages) + logger.debug(f"The messages after transform: \n{messages}") + str_prompt = tokenizer.apply_chat_template( + messages, tokenize=False, add_generation_prompt=True + ) + return str_prompt + + +class YiAdapter(NewHFChatModelAdapter): + support_4bit: bool = True + support_8bit: bool = True + support_system_message: bool = True + + def do_match(self, lower_model_name_or_path: Optional[str] = None): + return ( + lower_model_name_or_path + and "yi-" in lower_model_name_or_path + and "chat" in lower_model_name_or_path + ) + + +class 
Mixtral8x7BAdapter(NewHFChatModelAdapter): + """ + https://huggingface.co/mistralai/Mixtral-8x7B-Instruct-v0.1 + """ + + support_4bit: bool = True + support_8bit: bool = True + support_system_message: bool = False + + def do_match(self, lower_model_name_or_path: Optional[str] = None): + return ( + lower_model_name_or_path + and "mixtral" in lower_model_name_or_path + and "8x7b" in lower_model_name_or_path + ) + + +register_model_adapter(YiAdapter) +register_model_adapter(Mixtral8x7BAdapter) diff --git a/dbgpt/model/adapter/model_adapter.py b/dbgpt/model/adapter/model_adapter.py new file mode 100644 index 000000000..efaf9f1b9 --- /dev/null +++ b/dbgpt/model/adapter/model_adapter.py @@ -0,0 +1,166 @@ +from __future__ import annotations + +from typing import ( + List, + Type, + Optional, +) +import logging +import threading +import os +from functools import cache +from dbgpt.model.base import ModelType +from dbgpt.model.parameter import BaseModelParameters +from dbgpt.model.adapter.base import LLMModelAdapter, get_model_adapter +from dbgpt.model.adapter.template import ( + ConversationAdapter, + ConversationAdapterFactory, +) + +logger = logging.getLogger(__name__) + +thread_local = threading.local() +_IS_BENCHMARK = os.getenv("DB_GPT_MODEL_BENCHMARK", "False").lower() == "true" + + +_OLD_MODELS = [ + "llama-cpp", + "proxyllm", + "gptj-6b", + "codellama-13b-sql-sft", + "codellama-7b", + "codellama-7b-sql-sft", + "codellama-13b", +] + + +@cache +def get_llm_model_adapter( + model_name: str, + model_path: str, + use_fastchat: bool = True, + use_fastchat_monkey_patch: bool = False, + model_type: str = None, +) -> LLMModelAdapter: + conv_factory = DefaultConversationAdapterFactory() + if model_type == ModelType.VLLM: + logger.info("Current model type is vllm, return VLLMModelAdapterWrapper") + from dbgpt.model.adapter.vllm_adapter import VLLMModelAdapterWrapper + + return VLLMModelAdapterWrapper(conv_factory) + + # Import NewHFChatModelAdapter for it can be registered + 
from dbgpt.model.adapter.hf_adapter import NewHFChatModelAdapter + + new_model_adapter = get_model_adapter( + model_type, model_name, model_path, conv_factory + ) + if new_model_adapter: + logger.info(f"Current model {model_name} use new adapter {new_model_adapter}") + return new_model_adapter + + must_use_old = any(m in model_name for m in _OLD_MODELS) + result_adapter: Optional[LLMModelAdapter] = None + if use_fastchat and not must_use_old: + logger.info("Use fastcat adapter") + from dbgpt.model.adapter.fschat_adapter import ( + _get_fastchat_model_adapter, + _fastchat_get_adapter_monkey_patch, + FastChatLLMModelAdapterWrapper, + ) + + adapter = _get_fastchat_model_adapter( + model_name, + model_path, + _fastchat_get_adapter_monkey_patch, + use_fastchat_monkey_patch=use_fastchat_monkey_patch, + ) + if adapter: + result_adapter = FastChatLLMModelAdapterWrapper(adapter) + + else: + from dbgpt.model.adapter.old_adapter import ( + get_llm_model_adapter as _old_get_llm_model_adapter, + OldLLMModelAdapterWrapper, + ) + from dbgpt.app.chat_adapter import get_llm_chat_adapter + + logger.info("Use DB-GPT old adapter") + result_adapter = OldLLMModelAdapterWrapper( + _old_get_llm_model_adapter(model_name, model_path), + get_llm_chat_adapter(model_name, model_path), + ) + if result_adapter: + result_adapter.model_name = model_name + result_adapter.model_path = model_path + result_adapter.conv_factory = conv_factory + return result_adapter + else: + raise ValueError(f"Can not find adapter for model {model_name}") + + +@cache +def _auto_get_conv_template( + model_name: str, model_path: str +) -> Optional[ConversationAdapter]: + """Auto get the conversation template. + + Args: + model_name (str): The name of the model. + model_path (str): The path of the model. + + Returns: + Optional[ConversationAdapter]: The conversation template. 
+ """ + try: + adapter = get_llm_model_adapter(model_name, model_path, use_fastchat=True) + return adapter.get_default_conv_template(model_name, model_path) + except Exception as e: + logger.debug(f"Failed to get conv template for {model_name} {model_path}: {e}") + return None + + +class DefaultConversationAdapterFactory(ConversationAdapterFactory): + def get_by_model(self, model_name: str, model_path: str) -> ConversationAdapter: + """Get a conversation adapter by model. + + Args: + model_name (str): The name of the model. + model_path (str): The path of the model. + Returns: + ConversationAdapter: The conversation adapter. + """ + return _auto_get_conv_template(model_name, model_path) + + +def _dynamic_model_parser() -> Optional[List[Type[BaseModelParameters]]]: + """Dynamic model parser, parse the model parameters from the command line arguments. + + Returns: + Optional[List[Type[BaseModelParameters]]]: The model parameters class list. + """ + from dbgpt.util.parameter_utils import _SimpleArgParser + from dbgpt.model.parameter import ( + EmbeddingModelParameters, + WorkerType, + EMBEDDING_NAME_TO_PARAMETER_CLASS_CONFIG, + ) + + pre_args = _SimpleArgParser("model_name", "model_path", "worker_type", "model_type") + pre_args.parse() + model_name = pre_args.get("model_name") + model_path = pre_args.get("model_path") + worker_type = pre_args.get("worker_type") + model_type = pre_args.get("model_type") + if model_name is None and model_type != ModelType.VLLM: + return None + if worker_type == WorkerType.TEXT2VEC: + return [ + EMBEDDING_NAME_TO_PARAMETER_CLASS_CONFIG.get( + model_name, EmbeddingModelParameters + ) + ] + + llm_adapter = get_llm_model_adapter(model_name, model_path, model_type=model_type) + param_class = llm_adapter.model_param_class() + return [param_class] diff --git a/dbgpt/model/adapter.py b/dbgpt/model/adapter/old_adapter.py similarity index 82% rename from dbgpt/model/adapter.py rename to dbgpt/model/adapter/old_adapter.py index 
05be0e464..a63054695 100644 --- a/dbgpt/model/adapter.py +++ b/dbgpt/model/adapter/old_adapter.py @@ -9,7 +9,7 @@ import re import logging from pathlib import Path -from typing import List, Tuple +from typing import List, Tuple, TYPE_CHECKING, Optional from functools import cache from transformers import ( AutoModel, @@ -17,6 +17,9 @@ AutoTokenizer, LlamaTokenizer, ) + +from dbgpt.model.adapter.base import LLMModelAdapter +from dbgpt.model.adapter.template import ConversationAdapter, PromptType from dbgpt.model.base import ModelType from dbgpt.model.parameter import ( @@ -24,9 +27,13 @@ LlamaCppModelParameters, ProxyModelParameters, ) +from dbgpt.model.conversation import Conversation from dbgpt.configs.model_config import get_device from dbgpt._private.config import Config +if TYPE_CHECKING: + from dbgpt.app.chat_adapter import BaseChatAdpter + logger = logging.getLogger(__name__) CFG = Config() @@ -92,17 +99,6 @@ def get_llm_model_adapter(model_name: str, model_path: str) -> BaseLLMAdaper: ) -def _parse_model_param_class(model_name: str, model_path: str) -> ModelParameters: - try: - llm_adapter = get_llm_model_adapter(model_name, model_path) - return llm_adapter.model_param_class() - except Exception as e: - logger.warn( - f"Parse model parameters with model name {model_name} and model {model_path} failed {str(e)}, return `ModelParameters`" - ) - return ModelParameters - - # TODO support cpu? for practise we support gpt4all or chatglm-6b-int4? 
@@ -426,6 +422,87 @@ def loader(self, model_path: str, from_pretrained_kwargs: dict): return model, tokenizer +class OldLLMModelAdapterWrapper(LLMModelAdapter): + """Wrapping old adapter, which may be removed later""" + + def __init__(self, adapter: BaseLLMAdaper, chat_adapter: "BaseChatAdpter") -> None: + self._adapter = adapter + self._chat_adapter = chat_adapter + + def new_adapter(self, **kwargs) -> "LLMModelAdapter": + return OldLLMModelAdapterWrapper(self._adapter, self._chat_adapter) + + def use_fast_tokenizer(self) -> bool: + return self._adapter.use_fast_tokenizer() + + def model_type(self) -> str: + return self._adapter.model_type() + + def model_param_class(self, model_type: str = None) -> ModelParameters: + return self._adapter.model_param_class(model_type) + + def get_default_conv_template( + self, model_name: str, model_path: str + ) -> Optional[ConversationAdapter]: + conv_template = self._chat_adapter.get_conv_template(model_path) + return OldConversationAdapter(conv_template) if conv_template else None + + def load(self, model_path: str, from_pretrained_kwargs: dict): + return self._adapter.loader(model_path, from_pretrained_kwargs) + + def get_generate_stream_function(self, model, model_path: str): + return self._chat_adapter.get_generate_stream_func(model_path) + + def __str__(self) -> str: + return "{}({}.{})".format( + self.__class__.__name__, + self._adapter.__class__.__module__, + self._adapter.__class__.__name__, + ) + + +class OldConversationAdapter(ConversationAdapter): + """Wrapping old Conversation, which may be removed later""" + + def __init__(self, conv: Conversation) -> None: + self._conv = conv + + @property + def prompt_type(self) -> PromptType: + return PromptType.DBGPT + + @property + def roles(self) -> Tuple[str]: + return self._conv.roles + + @property + def sep(self) -> Optional[str]: + return self._conv.sep + + @property + def stop_str(self) -> str: + return self._conv.stop_str + + @property + def stop_token_ids(self) -> 
Optional[List[int]]: + return self._conv.stop_token_ids + + def get_prompt(self) -> str: + return self._conv.get_prompt() + + def set_system_message(self, system_message: str) -> None: + self._conv.update_system_message(system_message) + + def append_message(self, role: str, message: str) -> None: + self._conv.append_message(role, message) + + def update_last_message(self, message: str) -> None: + self._conv.update_last_message(message) + + def copy(self) -> "ConversationAdapter": + return OldConversationAdapter(self._conv.copy()) + + register_llm_model_adapters(VicunaLLMAdapater) register_llm_model_adapters(ChatGLMAdapater) register_llm_model_adapters(GuanacoAdapter) diff --git a/dbgpt/model/adapter/template.py b/dbgpt/model/adapter/template.py new file mode 100644 index 000000000..3fb9a6ec1 --- /dev/null +++ b/dbgpt/model/adapter/template.py @@ -0,0 +1,130 @@ +from abc import ABC, abstractmethod +from enum import Enum +from typing import TYPE_CHECKING, Optional, Tuple, Union, List + +if TYPE_CHECKING: + from fastchat.conversation import Conversation + + +class PromptType(str, Enum): + """Prompt type.""" + + FSCHAT: str = "fschat" + DBGPT: str = "dbgpt" + + +class ConversationAdapter(ABC): + """The conversation adapter.""" + + @property + def prompt_type(self) -> PromptType: + return PromptType.FSCHAT + + @property + @abstractmethod + def roles(self) -> Tuple[str]: + """Get the roles of the conversation. + + Returns: + Tuple[str]: The roles of the conversation. + """ + + @property + def sep(self) -> Optional[str]: + """Get the separator between messages.""" + return "\n" + + @property + def stop_str(self) -> Optional[Union[str, List[str]]]: + """Get the stop criteria.""" + return None + + @property + def stop_token_ids(self) -> Optional[List[int]]: + """Stops generation if meeting any token in this list""" + return None + + @abstractmethod + def get_prompt(self) -> str: + """Get the prompt string. + + Returns: + str: The prompt string. 
+ """ + + @abstractmethod + def set_system_message(self, system_message: str) -> None: + """Set the system message.""" + + @abstractmethod + def append_message(self, role: str, message: str) -> None: + """Append a new message. + Args: + role (str): The role of the message. + message (str): The message content. + """ + + @abstractmethod + def update_last_message(self, message: str) -> None: + """Update the last output. + + The last message is typically set to be None when constructing the prompt, + so we need to update it in-place after getting the response from a model. + + Args: + message (str): The message content. + """ + + @abstractmethod + def copy(self) -> "ConversationAdapter": + """Copy the conversation.""" + + +class ConversationAdapterFactory(ABC): + """The conversation adapter factory.""" + + def get_by_name( + self, + template_name: str, + prompt_template_type: Optional[PromptType] = PromptType.FSCHAT, + ) -> ConversationAdapter: + """Get a conversation adapter by name. + + Args: + template_name (str): The name of the template. + prompt_template_type (Optional[PromptType]): The type of the prompt template, default to be FSCHAT. + + Returns: + ConversationAdapter: The conversation adapter. + """ + raise NotImplementedError() + + def get_by_model(self, model_name: str, model_path: str) -> ConversationAdapter: + """Get a conversation adapter by model. + + Args: + model_name (str): The name of the model. + model_path (str): The path of the model. + + Returns: + ConversationAdapter: The conversation adapter. + """ + raise NotImplementedError() + + +def get_conv_template(name: str) -> ConversationAdapter: + """Get a conversation template. + + Args: + name (str): The name of the template. + + Just return the fastchat conversation template for now. + # TODO: More templates should be supported. + Returns: + Conversation: The conversation template. 
+ """ + from fastchat.conversation import get_conv_template + from dbgpt.model.adapter.fschat_adapter import FschatConversationAdapter + + conv_template = get_conv_template(name) + return FschatConversationAdapter(conv_template) diff --git a/dbgpt/model/adapter/vllm_adapter.py b/dbgpt/model/adapter/vllm_adapter.py new file mode 100644 index 000000000..2ffe0c764 --- /dev/null +++ b/dbgpt/model/adapter/vllm_adapter.py @@ -0,0 +1,93 @@ +import dataclasses +import logging +from dbgpt.model.base import ModelType +from dbgpt.model.adapter.base import LLMModelAdapter +from dbgpt.model.adapter.template import ConversationAdapter, ConversationAdapterFactory +from dbgpt.model.parameter import BaseModelParameters +from dbgpt.util.parameter_utils import ( + _extract_parameter_details, + _build_parameter_class, + _get_dataclass_print_str, +) + +logger = logging.getLogger(__name__) + + +class VLLMModelAdapterWrapper(LLMModelAdapter): + """Wrapping vllm engine""" + + def __init__(self, conv_factory: ConversationAdapterFactory): + self.conv_factory = conv_factory + + def new_adapter(self, **kwargs) -> "VLLMModelAdapterWrapper": + return VLLMModelAdapterWrapper(self.conv_factory) + + def model_type(self) -> str: + return ModelType.VLLM + + def model_param_class(self, model_type: str = None) -> BaseModelParameters: + import argparse + from vllm.engine.arg_utils import AsyncEngineArgs + + parser = argparse.ArgumentParser() + parser = AsyncEngineArgs.add_cli_args(parser) + parser.add_argument("--model_name", type=str, help="model name") + parser.add_argument( + "--model_path", + type=str, + help="local model path of the huggingface model to use", + ) + parser.add_argument("--model_type", type=str, help="model type") + parser.add_argument("--device", type=str, default=None, help="device") + # TODO parse prompt templete from `model_name` and `model_path` + parser.add_argument( + "--prompt_template", + type=str, + default=None, + help="Prompt template. 
If None, the prompt template is automatically determined from model path", + ) + + descs = _extract_parameter_details( + parser, + "dbgpt.model.parameter.VLLMModelParameters", + skip_names=["model"], + overwrite_default_values={"trust_remote_code": True}, + ) + return _build_parameter_class(descs) + + def load_from_params(self, params): + from vllm import AsyncLLMEngine + from vllm.engine.arg_utils import AsyncEngineArgs + import torch + + num_gpus = torch.cuda.device_count() + if num_gpus > 1 and hasattr(params, "tensor_parallel_size"): + setattr(params, "tensor_parallel_size", num_gpus) + logger.info( + f"Start vllm AsyncLLMEngine with args: {_get_dataclass_print_str(params)}" + ) + + params = dataclasses.asdict(params) + params["model"] = params["model_path"] + attrs = [attr.name for attr in dataclasses.fields(AsyncEngineArgs)] + vllm_engine_args_dict = {attr: params.get(attr) for attr in attrs} + # Set the attributes from the parsed arguments. + engine_args = AsyncEngineArgs(**vllm_engine_args_dict) + engine = AsyncLLMEngine.from_engine_args(engine_args) + return engine, engine.engine.tokenizer + + def support_async(self) -> bool: + return True + + def get_async_generate_stream_function(self, model, model_path: str): + from dbgpt.model.llm_out.vllm_llm import generate_stream + + return generate_stream + + def get_default_conv_template( + self, model_name: str, model_path: str + ) -> ConversationAdapter: + return self.conv_factory.get_by_model(model_name, model_path) + + def __str__(self) -> str: + return "{}.{}".format(self.__class__.__module__, self.__class__.__name__) diff --git a/dbgpt/model/cli.py b/dbgpt/model/cli.py index 67d5eec3c..67e12d7eb 100644 --- a/dbgpt/model/cli.py +++ b/dbgpt/model/cli.py @@ -405,7 +405,7 @@ def stop_model_controller(port: int): def _model_dynamic_factory() -> Callable[[None], List[Type]]: - from dbgpt.model.model_adapter import _dynamic_model_parser + from dbgpt.model.adapter.model_adapter import _dynamic_model_parser 
param_class = _dynamic_model_parser() fix_class = [ModelWorkerParameters] diff --git a/dbgpt/model/cluster/worker/default_worker.py b/dbgpt/model/cluster/worker/default_worker.py index 7345bb5d4..c4967a076 100644 --- a/dbgpt/model/cluster/worker/default_worker.py +++ b/dbgpt/model/cluster/worker/default_worker.py @@ -6,7 +6,8 @@ import traceback from dbgpt.configs.model_config import get_device -from dbgpt.model.model_adapter import get_llm_model_adapter, LLMModelAdaper +from dbgpt.model.adapter.base import LLMModelAdapter +from dbgpt.model.adapter.model_adapter import get_llm_model_adapter from dbgpt.core import ModelOutput, ModelInferenceMetrics from dbgpt.model.loader import ModelLoader, _get_model_real_path from dbgpt.model.parameter import ModelParameters @@ -27,7 +28,7 @@ def __init__(self) -> None: self.model = None self.tokenizer = None self._model_params = None - self.llm_adapter: LLMModelAdaper = None + self.llm_adapter: LLMModelAdapter = None self._support_async = False def load_worker(self, model_name: str, model_path: str, **kwargs) -> None: diff --git a/dbgpt/model/llm_utils.py b/dbgpt/model/llm_utils.py index e877778ed..031896e86 100644 --- a/dbgpt/model/llm_utils.py +++ b/dbgpt/model/llm_utils.py @@ -37,7 +37,7 @@ def list_supported_models(): def _list_supported_models( worker_type: str, model_config: Dict[str, str] ) -> List[SupportedModel]: - from dbgpt.model.model_adapter import get_llm_model_adapter + from dbgpt.model.adapter.model_adapter import get_llm_model_adapter from dbgpt.model.loader import _get_model_real_path ret = [] diff --git a/dbgpt/model/loader.py b/dbgpt/model/loader.py index 2030eb402..4ed42d630 100644 --- a/dbgpt/model/loader.py +++ b/dbgpt/model/loader.py @@ -1,13 +1,13 @@ #!/usr/bin/env python3 # -*- coding: utf-8 -*- -from typing import Optional, Dict +from typing import Optional, Dict, Any -from dataclasses import asdict import logging from dbgpt.configs.model_config import get_device from dbgpt.model.base import ModelType 
-from dbgpt.model.model_adapter import get_llm_model_adapter, LLMModelAdaper +from dbgpt.model.adapter.base import LLMModelAdapter +from dbgpt.model.adapter.model_adapter import get_llm_model_adapter from dbgpt.model.parameter import ( ModelParameters, LlamaCppModelParameters, @@ -117,7 +117,7 @@ def loader( raise Exception(f"Unkown model type {model_type}") def loader_with_params( - self, model_params: ModelParameters, llm_adapter: LLMModelAdaper + self, model_params: ModelParameters, llm_adapter: LLMModelAdapter ): model_type = llm_adapter.model_type() self.prompt_template = model_params.prompt_template @@ -133,7 +133,7 @@ def loader_with_params( raise Exception(f"Unkown model type {model_type}") -def huggingface_loader(llm_adapter: LLMModelAdaper, model_params: ModelParameters): +def huggingface_loader(llm_adapter: LLMModelAdapter, model_params: ModelParameters): import torch from dbgpt.model.compression import compress_module @@ -174,6 +174,12 @@ def huggingface_loader(llm_adapter: LLMModelAdaper, model_params: ModelParameter else: raise ValueError(f"Invalid device: {device}") + model, tokenizer = _try_load_default_quantization_model( + llm_adapter, device, num_gpus, model_params, kwargs + ) + if model: + return model, tokenizer + can_quantization = _check_quantization(model_params) if can_quantization and (num_gpus > 1 or model_params.load_4bit): @@ -192,6 +198,46 @@ def huggingface_loader(llm_adapter: LLMModelAdaper, model_params: ModelParameter # TODO merge current code into `load_huggingface_quantization_model` compress_module(model, model_params.device) + return _handle_model_and_tokenizer(model, tokenizer, device, num_gpus, model_params) + + +def _try_load_default_quantization_model( + llm_adapter: LLMModelAdapter, + device: str, + num_gpus: int, + model_params: ModelParameters, + kwargs: Dict[str, Any], +): + """Try load default quantization model(Support by huggingface default)""" + cloned_kwargs = {k: v for k, v in kwargs.items()} + try: + model, 
tokenizer = None, None + if device != "cuda": + return None, None + elif model_params.load_8bit and llm_adapter.support_8bit: + cloned_kwargs["load_in_8bit"] = True + model, tokenizer = llm_adapter.load(model_params.model_path, cloned_kwargs) + elif model_params.load_4bit and llm_adapter.support_4bit: + cloned_kwargs["load_in_4bit"] = True + model, tokenizer = llm_adapter.load(model_params.model_path, cloned_kwargs) + if model: + logger.info( + f"Load default quantization model {model_params.model_name} success" + ) + return _handle_model_and_tokenizer( + model, tokenizer, device, num_gpus, model_params + ) + return None, None + except Exception as e: + logger.warning( + f"Load default quantization model {model_params.model_name} failed, error: {str(e)}" + ) + return None, None + + +def _handle_model_and_tokenizer( + model, tokenizer, device: str, num_gpus: int, model_params: ModelParameters +): if ( (device == "cuda" and num_gpus == 1 and not model_params.cpu_offloading) or device == "mps" @@ -209,7 +255,7 @@ def huggingface_loader(llm_adapter: LLMModelAdaper, model_params: ModelParameter def load_huggingface_quantization_model( - llm_adapter: LLMModelAdaper, + llm_adapter: LLMModelAdapter, model_params: ModelParameters, kwargs: Dict, max_memory: Dict[int, str], @@ -344,7 +390,9 @@ def load_huggingface_quantization_model( return model, tokenizer -def llamacpp_loader(llm_adapter: LLMModelAdaper, model_params: LlamaCppModelParameters): +def llamacpp_loader( + llm_adapter: LLMModelAdapter, model_params: LlamaCppModelParameters +): try: from dbgpt.model.llm.llama_cpp.llama_cpp import LlamaCppModel except ImportError as exc: @@ -358,7 +406,7 @@ def llamacpp_loader(llm_adapter: LLMModelAdaper, model_params: LlamaCppModelPara return model, tokenizer -def proxyllm_loader(llm_adapter: LLMModelAdaper, model_params: ProxyModelParameters): +def proxyllm_loader(llm_adapter: LLMModelAdapter, model_params: ProxyModelParameters): from dbgpt.model.proxy.llms.proxy_model import 
ProxyModel logger.info("Load proxyllm") diff --git a/dbgpt/model/model_adapter.py b/dbgpt/model/model_adapter.py deleted file mode 100644 index 85243a12b..000000000 --- a/dbgpt/model/model_adapter.py +++ /dev/null @@ -1,660 +0,0 @@ -from __future__ import annotations - -from typing import Callable, List, Dict, Type, Tuple, TYPE_CHECKING, Any, Optional -import dataclasses -import logging -import threading -import os -from functools import cache -from dbgpt.model.base import ModelType -from dbgpt.model.parameter import ( - ModelParameters, - LlamaCppModelParameters, - ProxyModelParameters, -) -from dbgpt.core.interface.message import ModelMessage, ModelMessageRoleType -from dbgpt.util.parameter_utils import ( - _extract_parameter_details, - _build_parameter_class, - _get_dataclass_print_str, -) - -try: - from fastchat.conversation import ( - Conversation, - register_conv_template, - SeparatorStyle, - ) -except ImportError as exc: - raise ValueError( - "Could not import python package: fschat " - "Please install fastchat by command `pip install fschat` " - ) from exc - -if TYPE_CHECKING: - from fastchat.model.model_adapter import BaseModelAdapter - from dbgpt.model.adapter import BaseLLMAdaper as OldBaseLLMAdaper - from torch.nn import Module as TorchNNModule - -logger = logging.getLogger(__name__) - -thread_local = threading.local() -_IS_BENCHMARK = os.getenv("DB_GPT_MODEL_BENCHMARK", "False").lower() == "true" - - -_OLD_MODELS = [ - "llama-cpp", - "proxyllm", - "gptj-6b", - "codellama-13b-sql-sft", - "codellama-7b", - "codellama-7b-sql-sft", - "codellama-13b", -] - -_NEW_HF_CHAT_MODELS = [ - "yi-34b", - "yi-6b", -] - -# The implementation of some models in fastchat will affect the DB-GPT loading model and will be temporarily added to the blacklist. 
-_BLACK_LIST_MODLE_PROMPT = ["OpenHermes-2.5-Mistral-7B"] - - -class LLMModelAdaper: - """New Adapter for DB-GPT LLM models""" - - def use_fast_tokenizer(self) -> bool: - """Whether use a [fast Rust-based tokenizer](https://huggingface.co/docs/tokenizers/index) if it is supported - for a given model. - """ - return False - - def model_type(self) -> str: - return ModelType.HF - - def model_param_class(self, model_type: str = None) -> ModelParameters: - """Get the startup parameters instance of the model""" - model_type = model_type if model_type else self.model_type() - if model_type == ModelType.LLAMA_CPP: - return LlamaCppModelParameters - elif model_type == ModelType.PROXY: - return ProxyModelParameters - return ModelParameters - - def load(self, model_path: str, from_pretrained_kwargs: dict): - """Load model and tokenizer""" - raise NotImplementedError - - def load_from_params(self, params): - """Load the model and tokenizer according to the given parameters""" - raise NotImplementedError - - def support_async(self) -> bool: - """Whether the loaded model supports asynchronous calls""" - return False - - def get_generate_stream_function(self, model, model_path: str): - """Get the generate stream function of the model""" - raise NotImplementedError - - def get_async_generate_stream_function(self, model, model_path: str): - """Get the asynchronous generate stream function of the model""" - raise NotImplementedError - - def get_default_conv_template( - self, model_name: str, model_path: str - ) -> "Conversation": - """Get the default conv template""" - raise NotImplementedError - - def get_str_prompt( - self, - params: Dict, - messages: List[ModelMessage], - tokenizer: Any, - prompt_template: str = None, - ) -> Optional[str]: - return None - - def get_prompt_with_template( - self, - params: Dict, - messages: List[ModelMessage], - model_name: str, - model_path: str, - model_context: Dict, - prompt_template: str = None, - ): - conv = 
self.get_default_conv_template(model_name, model_path) - - if prompt_template: - logger.info(f"Use prompt template {prompt_template} from config") - conv = get_conv_template(prompt_template) - if not conv or not messages: - # Nothing to do - logger.info( - f"No conv from model_path {model_path} or no messages in params, {self}" - ) - return None, None, None - - conv = conv.copy() - system_messages = [] - user_messages = [] - ai_messages = [] - - for message in messages: - role, content = None, None - if isinstance(message, ModelMessage): - role = message.role - content = message.content - elif isinstance(message, dict): - role = message["role"] - content = message["content"] - else: - raise ValueError(f"Invalid message type: {message}") - - if role == ModelMessageRoleType.SYSTEM: - # Support for multiple system messages - system_messages.append(content) - elif role == ModelMessageRoleType.HUMAN: - # conv.append_message(conv.roles[0], content) - user_messages.append(content) - elif role == ModelMessageRoleType.AI: - # conv.append_message(conv.roles[1], content) - ai_messages.append(content) - else: - raise ValueError(f"Unknown role: {role}") - - can_use_systems: [] = [] - if system_messages: - if len(system_messages) > 1: - ## Compatible with dbgpt complex scenarios, the last system will protect more complete information entered by the current user - user_messages[-1] = system_messages[-1] - can_use_systems = system_messages[:-1] - else: - can_use_systems = system_messages - - for i in range(len(user_messages)): - conv.append_message(conv.roles[0], user_messages[i]) - if i < len(ai_messages): - conv.append_message(conv.roles[1], ai_messages[i]) - - if isinstance(conv, Conversation): - conv.set_system_message("".join(can_use_systems)) - else: - conv.update_system_message("".join(can_use_systems)) - - # Add a blank message for the assistant. 
- conv.append_message(conv.roles[1], None) - new_prompt = conv.get_prompt() - return new_prompt, conv.stop_str, conv.stop_token_ids - - def model_adaptation( - self, - params: Dict, - model_name: str, - model_path: str, - tokenizer: Any, - prompt_template: str = None, - ) -> Tuple[Dict, Dict]: - """Params adaptation""" - messages = params.get("messages") - # Some model scontext to dbgpt server - model_context = {"prompt_echo_len_char": -1, "has_format_prompt": False} - if messages: - # Dict message to ModelMessage - messages = [ - m if isinstance(m, ModelMessage) else ModelMessage(**m) - for m in messages - ] - params["messages"] = messages - - new_prompt = self.get_str_prompt(params, messages, tokenizer, prompt_template) - conv_stop_str, conv_stop_token_ids = None, None - if not new_prompt: - ( - new_prompt, - conv_stop_str, - conv_stop_token_ids, - ) = self.get_prompt_with_template( - params, messages, model_name, model_path, model_context, prompt_template - ) - if not new_prompt: - return params, model_context - - # Overwrite the original prompt - # TODO remote bos token and eos token from tokenizer_config.json of model - prompt_echo_len_char = len(new_prompt.replace("", "").replace("", "")) - model_context["prompt_echo_len_char"] = prompt_echo_len_char - model_context["echo"] = params.get("echo", True) - model_context["has_format_prompt"] = True - params["prompt"] = new_prompt - - custom_stop = params.get("stop") - custom_stop_token_ids = params.get("stop_token_ids") - - # Prefer the value passed in from the input parameter - params["stop"] = custom_stop or conv_stop_str - params["stop_token_ids"] = custom_stop_token_ids or conv_stop_token_ids - - return params, model_context - - -class OldLLMModelAdaperWrapper(LLMModelAdaper): - """Wrapping old adapter, which may be removed later""" - - def __init__(self, adapter: "OldBaseLLMAdaper", chat_adapter) -> None: - self._adapter = adapter - self._chat_adapter = chat_adapter - - def use_fast_tokenizer(self) -> bool: - 
return self._adapter.use_fast_tokenizer() - - def model_type(self) -> str: - return self._adapter.model_type() - - def model_param_class(self, model_type: str = None) -> ModelParameters: - return self._adapter.model_param_class(model_type) - - def get_default_conv_template( - self, model_name: str, model_path: str - ) -> "Conversation": - return self._chat_adapter.get_conv_template(model_path) - - def load(self, model_path: str, from_pretrained_kwargs: dict): - return self._adapter.loader(model_path, from_pretrained_kwargs) - - def get_generate_stream_function(self, model, model_path: str): - return self._chat_adapter.get_generate_stream_func(model_path) - - def __str__(self) -> str: - return "{}({}.{})".format( - self.__class__.__name__, - self._adapter.__class__.__module__, - self._adapter.__class__.__name__, - ) - - -class FastChatLLMModelAdaperWrapper(LLMModelAdaper): - """Wrapping fastchat adapter""" - - def __init__(self, adapter: "BaseModelAdapter") -> None: - self._adapter = adapter - - def use_fast_tokenizer(self) -> bool: - return self._adapter.use_fast_tokenizer - - def load(self, model_path: str, from_pretrained_kwargs: dict): - return self._adapter.load_model(model_path, from_pretrained_kwargs) - - def get_generate_stream_function(self, model: "TorchNNModule", model_path: str): - if _IS_BENCHMARK: - from dbgpt.util.benchmarks.llm.fastchat_benchmarks_inference import ( - generate_stream, - ) - - return generate_stream - else: - from fastchat.model.model_adapter import get_generate_stream_function - - return get_generate_stream_function(model, model_path) - - def get_default_conv_template( - self, model_name: str, model_path: str - ) -> "Conversation": - return self._adapter.get_default_conv_template(model_path) - - def __str__(self) -> str: - return "{}({}.{})".format( - self.__class__.__name__, - self._adapter.__class__.__module__, - self._adapter.__class__.__name__, - ) - - -class NewHFChatModelAdapter(LLMModelAdaper): - def load(self, model_path: 
str, from_pretrained_kwargs: dict): - try: - import transformers - from transformers import AutoModelForCausalLM, AutoTokenizer, AutoModel - except ImportError as exc: - raise ValueError( - "Could not import depend python package " - "Please install it with `pip install transformers`." - ) from exc - if not transformers.__version__ >= "4.34.0": - raise ValueError( - "Current model (Load by HFNewChatAdapter) require transformers.__version__>=4.34.0" - ) - revision = from_pretrained_kwargs.get("revision", "main") - try: - tokenizer = AutoTokenizer.from_pretrained( - model_path, - use_fast=self.use_fast_tokenizer, - revision=revision, - trust_remote_code=True, - ) - except TypeError: - tokenizer = AutoTokenizer.from_pretrained( - model_path, use_fast=False, revision=revision, trust_remote_code=True - ) - try: - model = AutoModelForCausalLM.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - except NameError: - model = AutoModel.from_pretrained( - model_path, low_cpu_mem_usage=True, **from_pretrained_kwargs - ) - # tokenizer.use_default_system_prompt = False - return model, tokenizer - - def get_generate_stream_function(self, model, model_path: str): - """Get the generate stream function of the model""" - from dbgpt.model.llm_out.hf_chat_llm import huggingface_chat_generate_stream - - return huggingface_chat_generate_stream - - def get_str_prompt( - self, - params: Dict, - messages: List[ModelMessage], - tokenizer: Any, - prompt_template: str = None, - ) -> Optional[str]: - from transformers import AutoTokenizer - - if not tokenizer: - raise ValueError("tokenizer is is None") - tokenizer: AutoTokenizer = tokenizer - - messages = ModelMessage.to_openai_messages(messages) - str_prompt = tokenizer.apply_chat_template( - messages, tokenize=False, add_generation_prompt=True - ) - return str_prompt - - -def get_conv_template(name: str) -> "Conversation": - """Get a conversation template.""" - from fastchat.conversation import 
get_conv_template - - return get_conv_template(name) - - -@cache -def _auto_get_conv_template(model_name: str, model_path: str) -> "Conversation": - try: - adapter = get_llm_model_adapter(model_name, model_path, use_fastchat=True) - return adapter.get_default_conv_template(model_name, model_path) - except Exception: - return None - - -@cache -def get_llm_model_adapter( - model_name: str, - model_path: str, - use_fastchat: bool = True, - use_fastchat_monkey_patch: bool = False, - model_type: str = None, -) -> LLMModelAdaper: - if model_type == ModelType.VLLM: - logger.info("Current model type is vllm, return VLLMModelAdaperWrapper") - return VLLMModelAdaperWrapper() - - use_new_hf_chat_models = any(m in model_name.lower() for m in _NEW_HF_CHAT_MODELS) - if use_new_hf_chat_models: - logger.info(f"Current model {model_name} use NewHFChatModelAdapter") - return NewHFChatModelAdapter() - - must_use_old = any(m in model_name for m in _OLD_MODELS) - if use_fastchat and not must_use_old: - logger.info("Use fastcat adapter") - adapter = _get_fastchat_model_adapter( - model_name, - model_path, - _fastchat_get_adapter_monkey_patch, - use_fastchat_monkey_patch=use_fastchat_monkey_patch, - ) - return FastChatLLMModelAdaperWrapper(adapter) - else: - from dbgpt.model.adapter import ( - get_llm_model_adapter as _old_get_llm_model_adapter, - ) - from dbgpt.app.chat_adapter import get_llm_chat_adapter - - logger.info("Use DB-GPT old adapter") - return OldLLMModelAdaperWrapper( - _old_get_llm_model_adapter(model_name, model_path), - get_llm_chat_adapter(model_name, model_path), - ) - - -def _get_fastchat_model_adapter( - model_name: str, - model_path: str, - caller: Callable[[str], None] = None, - use_fastchat_monkey_patch: bool = False, -): - from fastchat.model import model_adapter - - _bak_get_model_adapter = model_adapter.get_model_adapter - try: - if use_fastchat_monkey_patch: - model_adapter.get_model_adapter = _fastchat_get_adapter_monkey_patch - thread_local.model_name = 
model_name - _remove_black_list_model_of_fastchat() - if caller: - return caller(model_path) - finally: - del thread_local.model_name - model_adapter.get_model_adapter = _bak_get_model_adapter - - -def _fastchat_get_adapter_monkey_patch(model_path: str, model_name: str = None): - if not model_name: - if not hasattr(thread_local, "model_name"): - raise RuntimeError("fastchat get adapter monkey path need model_name") - model_name = thread_local.model_name - from fastchat.model.model_adapter import model_adapters - - for adapter in model_adapters: - if adapter.match(model_name): - logger.info( - f"Found llm model adapter with model name: {model_name}, {adapter}" - ) - return adapter - - model_path_basename = ( - None if not model_path else os.path.basename(os.path.normpath(model_path)) - ) - for adapter in model_adapters: - if model_path_basename and adapter.match(model_path_basename): - logger.info( - f"Found llm model adapter with model path: {model_path} and base name: {model_path_basename}, {adapter}" - ) - return adapter - - for adapter in model_adapters: - if model_path and adapter.match(model_path): - logger.info( - f"Found llm model adapter with model path: {model_path}, {adapter}" - ) - return adapter - - raise ValueError( - f"Invalid model adapter for model name {model_name} and model path {model_path}" - ) - - -@cache -def _remove_black_list_model_of_fastchat(): - from fastchat.model.model_adapter import model_adapters - - black_list_models = [] - for adapter in model_adapters: - try: - if ( - adapter.get_default_conv_template("/data/not_exist_model_path").name - in _BLACK_LIST_MODLE_PROMPT - ): - black_list_models.append(adapter) - except Exception: - pass - for adapter in black_list_models: - model_adapters.remove(adapter) - - -def _dynamic_model_parser() -> Callable[[None], List[Type]]: - from dbgpt.util.parameter_utils import _SimpleArgParser - from dbgpt.model.parameter import ( - EmbeddingModelParameters, - WorkerType, - 
EMBEDDING_NAME_TO_PARAMETER_CLASS_CONFIG, - ) - - pre_args = _SimpleArgParser("model_name", "model_path", "worker_type", "model_type") - pre_args.parse() - model_name = pre_args.get("model_name") - model_path = pre_args.get("model_path") - worker_type = pre_args.get("worker_type") - model_type = pre_args.get("model_type") - if model_name is None and model_type != ModelType.VLLM: - return None - if worker_type == WorkerType.TEXT2VEC: - return [ - EMBEDDING_NAME_TO_PARAMETER_CLASS_CONFIG.get( - model_name, EmbeddingModelParameters - ) - ] - - llm_adapter = get_llm_model_adapter(model_name, model_path, model_type=model_type) - param_class = llm_adapter.model_param_class() - return [param_class] - - -class VLLMModelAdaperWrapper(LLMModelAdaper): - """Wrapping vllm engine""" - - def model_type(self) -> str: - return ModelType.VLLM - - def model_param_class(self, model_type: str = None) -> ModelParameters: - import argparse - from vllm.engine.arg_utils import AsyncEngineArgs - - parser = argparse.ArgumentParser() - parser = AsyncEngineArgs.add_cli_args(parser) - parser.add_argument("--model_name", type=str, help="model name") - parser.add_argument( - "--model_path", - type=str, - help="local model path of the huggingface model to use", - ) - parser.add_argument("--model_type", type=str, help="model type") - parser.add_argument("--device", type=str, default=None, help="device") - # TODO parse prompt templete from `model_name` and `model_path` - parser.add_argument( - "--prompt_template", - type=str, - default=None, - help="Prompt template. 
If None, the prompt template is automatically determined from model path", - ) - - descs = _extract_parameter_details( - parser, - "dbgpt.model.parameter.VLLMModelParameters", - skip_names=["model"], - overwrite_default_values={"trust_remote_code": True}, - ) - return _build_parameter_class(descs) - - def load_from_params(self, params): - from vllm import AsyncLLMEngine - from vllm.engine.arg_utils import AsyncEngineArgs - import torch - - num_gpus = torch.cuda.device_count() - if num_gpus > 1 and hasattr(params, "tensor_parallel_size"): - setattr(params, "tensor_parallel_size", num_gpus) - logger.info( - f"Start vllm AsyncLLMEngine with args: {_get_dataclass_print_str(params)}" - ) - - params = dataclasses.asdict(params) - params["model"] = params["model_path"] - attrs = [attr.name for attr in dataclasses.fields(AsyncEngineArgs)] - vllm_engine_args_dict = {attr: params.get(attr) for attr in attrs} - # Set the attributes from the parsed arguments. - engine_args = AsyncEngineArgs(**vllm_engine_args_dict) - engine = AsyncLLMEngine.from_engine_args(engine_args) - return engine, engine.engine.tokenizer - - def support_async(self) -> bool: - return True - - def get_async_generate_stream_function(self, model, model_path: str): - from dbgpt.model.llm_out.vllm_llm import generate_stream - - return generate_stream - - def get_default_conv_template( - self, model_name: str, model_path: str - ) -> "Conversation": - return _auto_get_conv_template(model_name, model_path) - - def __str__(self) -> str: - return "{}.{}".format(self.__class__.__module__, self.__class__.__name__) - - -# Covering the configuration of fastcaht, we will regularly feedback the code here to fastchat. -# We also recommend that you modify it directly in the fastchat repository. 
- -# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L212 -register_conv_template( - Conversation( - name="aquila-legacy", - system_message="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.\n\n", - roles=("### Human: ", "### Assistant: ", "System"), - messages=(), - offset=0, - sep_style=SeparatorStyle.NO_COLON_TWO, - sep="\n", - sep2="", - stop_str=["", "[UNK]"], - ), - override=True, -) -# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L227 -register_conv_template( - Conversation( - name="aquila", - system_message="A chat between a curious human and an artificial intelligence assistant. " - "The assistant gives helpful, detailed, and polite answers to the human's questions.", - roles=("Human", "Assistant", "System"), - messages=(), - offset=0, - sep_style=SeparatorStyle.ADD_COLON_TWO, - sep="###", - sep2="", - stop_str=["", "[UNK]"], - ), - override=True, -) -# source: https://huggingface.co/BAAI/AquilaChat2-34B/blob/4608b75855334b93329a771aee03869dbf7d88cc/predict.py#L242 -register_conv_template( - Conversation( - name="aquila-v1", - roles=("<|startofpiece|>", "<|endofpiece|>", ""), - messages=(), - offset=0, - sep_style=SeparatorStyle.NO_COLON_TWO, - sep="", - sep2="", - stop_str=["", "<|endoftext|>"], - ), - override=True, -)