From 438e17d2c716b0cf3ba708e337699fb0499fc039 Mon Sep 17 00:00:00 2001 From: Davidqian123 Date: Tue, 22 Oct 2024 17:45:04 +0000 Subject: [PATCH 01/20] release 0.0.8.9 --- nexa/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexa/__init__.py b/nexa/__init__.py index 09b3af88..4d36a89e 100644 --- a/nexa/__init__.py +++ b/nexa/__init__.py @@ -1 +1 @@ -__version__ = "0.0.8.8" +__version__ = "0.0.8.9" From f21c2b99eda99834194e8f907817fe024740198d Mon Sep 17 00:00:00 2001 From: Davidqian123 Date: Wed, 6 Nov 2024 00:54:03 +0000 Subject: [PATCH 02/20] update dependency --- dependency/bark.cpp | 2 +- dependency/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependency/bark.cpp b/dependency/bark.cpp index 451a7290..1c228860 160000 --- a/dependency/bark.cpp +++ b/dependency/bark.cpp @@ -1 +1 @@ -Subproject commit 451a7290c50fb41ab7113667f3c7854b4a271da2 +Subproject commit 1c22886058af2ff72b92624bc86f88cc11a3dfc6 diff --git a/dependency/llama.cpp b/dependency/llama.cpp index 4a29bca8..38c6fa3b 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit 4a29bca867e2601a2e69e007640ac1abb9f3a381 +Subproject commit 38c6fa3b8fb6c88075102fd859d04eaea27aa87c From 85cdcffe07fbe571733eadc29d5c5b240dfd5fd8 Mon Sep 17 00:00:00 2001 From: qiqiWav Date: Fri, 6 Dec 2024 00:02:11 +0000 Subject: [PATCH 03/20] merge main --- dependency/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependency/llama.cpp b/dependency/llama.cpp index ed459776..bb33473f 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit ed459776811d0928ce55a001e9e5a6bc3bf22ca4 +Subproject commit bb33473f08db604e1f30334366032f0904e2a722 From 608c928d60fbafed1ddf27910d328a56027252e2 Mon Sep 17 00:00:00 2001 From: qiqiWav Date: Fri, 6 Dec 2024 00:26:01 +0000 Subject: [PATCH 04/20] omnivision -> omniVLM --- README.md | 4 ++-- docs/README.md | 10 ++++++++-- nexa/constants.py | 10 +++++----- nexa/gguf/nexa_inference_vlm_omni.py | 2 +- 4 files changed, 16 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index a61f4633..17887873 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ ## Latest News 🔥 -- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B parameters): `nexa run omniaudio` +- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omniVLM` and audio language model (2.9B parameters): `nexa run omniaudio` - Support audio language model: `nexa run qwen2audio`, **we are the first open-source toolkit to support audio language model with GGML tensor library.** - Support iOS Swift binding for local inference on **iOS mobile** devices. 
- Support embedding model: `nexa embed ` @@ -228,7 +228,7 @@ Supported model examples (full list at [Model Hub](https://nexa.ai/models)): | [qwen2audio](https://nexa.ai/Qwen/Qwen2-Audio-7.8B-Instruct/gguf-q4_K_M/readme) | AudioLM | GGUF | `nexa run qwen2audio` | | [octopus-v2](https://www.nexaai.com/NexaAI/Octopus-v2/gguf-q4_0/readme) | Function Call | GGUF | `nexa run octopus-v2` | | [octo-net](https://www.nexaai.com/NexaAI/Octo-net/gguf-q4_0/readme) | Text | GGUF | `nexa run octo-net` | -| [omnivision](https://nexa.ai/NexaAI/omnivision/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omnivision` | +| [omniVLM](https://nexa.ai/NexaAI/omniVLM/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omniVLM` | | [nanollava](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | Multimodal | GGUF | `nexa run nanollava` | | [llava-phi3](https://www.nexaai.com/xtuner/llava-phi-3-mini/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-phi3` | | [llava-llama3](https://www.nexaai.com/xtuner/llava-llama-3-8b-v1.1/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-llama3` | diff --git a/docs/README.md b/docs/README.md index 252116f7..d4081d2e 100644 --- a/docs/README.md +++ b/docs/README.md @@ -28,12 +28,16 @@ pip install nexaai[onnx] # if you need ONNX support ``` ### build from source + To build C++ only + ``` cmake -B build -S . cmake --build build --config Release -j32 ``` + To build C++ and install python package from source, run the following commands: + ```bash git clone --recursive https://github.com/NexaAI/nexa-sdk.git cd nexa-sdk @@ -75,7 +79,7 @@ python -m nexa.gguf.nexa_inference_text gemma python -m nexa.gguf.nexa_inference_text octopusv2 --stop_words "" wget https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png -O test.png python -m nexa.gguf.nexa_inference_vlm nanollava -python -m nexa.gguf.nexa_inference_vlm_omni omnivision +python -m nexa.gguf.nexa_inference_vlm_omni omniVLM python -m nexa.gguf.nexa_inference_image sd1-4 python -m nexa.gguf.nexa_inference_image sd1-4 --img2img wget -O control_normal-fp16.safetensors https://huggingface.co/webui/ControlNet-modules-safetensors/resolve/main/control_normal-fp16.safetensors @@ -235,7 +239,9 @@ dumpbin /dependents your_executable_or_dll.dll # in Developer PowerShell for Vi ``` ### Debug dynamic lib + According to [isse](https://github.com/abetlen/llama-cpp-python/issues/1346), below can check the exported symbols on linux. 
+ ``` readelf -Ws --dyn-syms libllama.so -``` \ No newline at end of file +``` diff --git a/nexa/constants.py b/nexa/constants.py index 24acd195..51d6e051 100644 --- a/nexa/constants.py +++ b/nexa/constants.py @@ -188,8 +188,8 @@ class ModelType(Enum): "omnivision-preview": "omnivision-preview:projector-fp16", "omnivision-preview:fp16": "omnivision-preview:projector-fp16", "omnivision-preview:q4_0": "omnivision-preview:projector-q4_0", - "omnivision": "omnivision:projector-fp16", - "omnivision:fp16": "omnivision:projector-fp16", + "omniVLM": "omniVLM:projector-fp16", + "omniVLM:fp16": "omniVLM:projector-fp16", "omnivision-ocr": "omnivision-ocr:projector-fp16", "omnivision-ocr:fp16": "omnivision-ocr:projector-fp16", } @@ -198,8 +198,8 @@ class ModelType(Enum): "omnivision-preview": "omnivision-preview:model-fp16", "omnivision-preview:fp16": "omnivision-preview:model-fp16", "omnivision-preview:q4_0": "omnivision-preview:model-q4_0", - "omnivision": "omnivision:model-fp16", - "omnivision:fp16": "omnivision:model-fp16", + "omniVLM": "omniVLM:model-fp16", + "omniVLM:fp16": "omniVLM:model-fp16", "omnivision-ocr": "omnivision-ocr:model-fp16", "omnivision-ocr:fp16": "omnivision-ocr:model-fp16", } @@ -461,7 +461,7 @@ class ModelType(Enum): "FLUX.1-schnell": ModelType.COMPUTER_VISION, "Phi-3-vision-128k-instruct": ModelType.MULTIMODAL, "omnivision-preview": ModelType.MULTIMODAL, - "omnivision": ModelType.MULTIMODAL, + "omniVLM": ModelType.MULTIMODAL, "omnivision-ocr": ModelType.MULTIMODAL, "nanoLLaVA": ModelType.MULTIMODAL, "llava-v1.6-mistral-7b": ModelType.MULTIMODAL, diff --git a/nexa/gguf/nexa_inference_vlm_omni.py b/nexa/gguf/nexa_inference_vlm_omni.py index bd5b6b29..4a76a4eb 100644 --- a/nexa/gguf/nexa_inference_vlm_omni.py +++ b/nexa/gguf/nexa_inference_vlm_omni.py @@ -40,7 +40,7 @@ def __init__( else: self.n_gpu_layers = 0 - # Handle direct model file paths (e.g., omnivision:model-fp16) + # Handle direct model file paths (e.g., omniVLM:model-fp16) if model_path and ':model-' in model_path: base_name = model_path.split(':')[0] model_type = model_path.split('model-')[1] From adf21f62025e16bd1205755bfcda06d3bb4411a3 Mon Sep 17 00:00:00 2001 From: Davidqian123 Date: Fri, 6 Dec 2024 21:21:42 +0000 Subject: [PATCH 05/20] update dependencies --- dependency/bark.cpp | 2 +- dependency/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependency/bark.cpp b/dependency/bark.cpp index 1c228860..f4ab4420 160000 --- a/dependency/bark.cpp +++ b/dependency/bark.cpp @@ -1 +1 @@ -Subproject commit 1c22886058af2ff72b92624bc86f88cc11a3dfc6 +Subproject commit f4ab4420973d04055225c85be2ca7c0273e65074 diff --git a/dependency/llama.cpp b/dependency/llama.cpp index 38c6fa3b..bb33473f 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit 38c6fa3b8fb6c88075102fd859d04eaea27aa87c +Subproject commit bb33473f08db604e1f30334366032f0904e2a722 From 024782bccde3f4dd7dce9adc425568c4f590c9e3 Mon Sep 17 00:00:00 2001 From: Davidqian123 Date: Fri, 6 Dec 2024 21:24:57 +0000 Subject: [PATCH 06/20] release v0.0.9.6 --- nexa/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexa/__init__.py b/nexa/__init__.py index 26e7b666..af51d3c5 100644 --- a/nexa/__init__.py +++ b/nexa/__init__.py @@ -1 +1 @@ -__version__ = "0.0.9.5" +__version__ = "0.0.9.6" From 929af92554f9dfe0d8db707784ab9513a0cb35e5 Mon Sep 17 00:00:00 2001 From: zhycheng614 Date: Fri, 6 Dec 2024 23:39:37 +0000 Subject: [PATCH 07/20] for metal, update macos wheel version to 13 14 
15 --- .github/workflows/build-wheels-metal.yaml | 2 +- dependency/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index e56b16c7..f0011a98 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -11,7 +11,7 @@ jobs: runs-on: macos-${{ matrix.os }} strategy: matrix: - os: [12, 13, 14] + os: [13, 14, 15] steps: - uses: actions/checkout@v4 diff --git a/dependency/llama.cpp b/dependency/llama.cpp index bb33473f..ed459776 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit bb33473f08db604e1f30334366032f0904e2a722 +Subproject commit ed459776811d0928ce55a001e9e5a6bc3bf22ca4 From 255be44b5760773841e45792c192a772a4697cef Mon Sep 17 00:00:00 2001 From: Davidqian123 Date: Fri, 6 Dec 2024 23:41:18 +0000 Subject: [PATCH 08/20] update --- .github/workflows/build-wheels-metal.yaml | 2 +- dependency/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml index e56b16c7..f0011a98 100644 --- a/.github/workflows/build-wheels-metal.yaml +++ b/.github/workflows/build-wheels-metal.yaml @@ -11,7 +11,7 @@ jobs: runs-on: macos-${{ matrix.os }} strategy: matrix: - os: [12, 13, 14] + os: [13, 14, 15] steps: - uses: actions/checkout@v4 diff --git a/dependency/llama.cpp b/dependency/llama.cpp index bb33473f..ed459776 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit bb33473f08db604e1f30334366032f0904e2a722 +Subproject commit ed459776811d0928ce55a001e9e5a6bc3bf22ca4 From f6d438aac5c59a34095a0b934f0b2985a503db59 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Mon, 9 Dec 2024 16:50:12 +0800 Subject: [PATCH 09/20] upgrade llama cpp python --- nexa/gguf/llama/_ctypes_extensions.py | 75 ++ nexa/gguf/llama/_internals_transformers.py | 415 +++--- nexa/gguf/llama/_utils_transformers.py | 4 +- nexa/gguf/llama/llama.py | 498 ++++---- nexa/gguf/llama/llama_cache.py | 10 +- nexa/gguf/llama/llama_chat_format.py | 125 +- nexa/gguf/llama/llama_cpp.py | 1343 ++++++++++---------- nexa/gguf/llama/llama_grammar.py | 884 +------------ nexa/gguf/llama/llama_speculative.py | 2 +- nexa/gguf/llama/llama_tokenizer.py | 37 +- nexa/gguf/llama/llama_types.py | 2 +- nexa/gguf/llama/llava_cpp.py | 81 +- 12 files changed, 1356 insertions(+), 2120 deletions(-) create mode 100644 nexa/gguf/llama/_ctypes_extensions.py diff --git a/nexa/gguf/llama/_ctypes_extensions.py b/nexa/gguf/llama/_ctypes_extensions.py new file mode 100644 index 00000000..2ff7e38e --- /dev/null +++ b/nexa/gguf/llama/_ctypes_extensions.py @@ -0,0 +1,75 @@ +from __future__ import annotations + +import sys +import os +import ctypes +import functools +import pathlib + +from typing import ( + Any, + Callable, + List, + Union, + Optional, + TYPE_CHECKING, + TypeVar, + Generic, +) +from typing_extensions import TypeAlias + +# ctypes sane type hint helpers +# +# - Generic Pointer and Array types +# - PointerOrRef type with a type hinted byref function +# +# NOTE: Only use these for static type checking not for runtime checks +# no good will come of that + +if TYPE_CHECKING: + CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore + + CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore + + CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore + + CtypesVoidPointer: TypeAlias 
= ctypes.c_void_p + + class CtypesRef(Generic[CtypesCData]): + pass + + CtypesPointerOrRef: TypeAlias = Union[ + CtypesPointer[CtypesCData], CtypesRef[CtypesCData] + ] + + CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore + +F = TypeVar("F", bound=Callable[..., Any]) + +def ctypes_function_for_shared_library(lib: ctypes.CDLL): + """Decorator for defining ctypes functions with type hints""" + + def ctypes_function( + name: str, argtypes: List[Any], restype: Any, enabled: bool = True + ): + def decorator(f: F) -> F: + if enabled: + func = getattr(lib, name) + func.argtypes = argtypes + func.restype = restype + functools.wraps(f)(func) + return func + else: + return f + + return decorator + + return ctypes_function + + +def _byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: + """Type-annotated version of ctypes.byref""" + ... + + +byref = _byref if TYPE_CHECKING else ctypes.byref diff --git a/nexa/gguf/llama/_internals_transformers.py b/nexa/gguf/llama/_internals_transformers.py index 7646563f..bbd215d1 100644 --- a/nexa/gguf/llama/_internals_transformers.py +++ b/nexa/gguf/llama/_internals_transformers.py @@ -6,6 +6,7 @@ from typing import ( Dict, List, + Tuple, Optional, Sequence, ) @@ -25,7 +26,7 @@ # Python wrappers over llama.h structs -class _LlamaModel: +class LlamaModel: """Intermediate Python wrapper for a llama.cpp llama_model. NOTE: For stability it's recommended you use the Llama class instead.""" @@ -41,19 +42,21 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - self.model = None + model = None if not os.path.exists(path_model): raise ValueError(f"Model path does not exist: {path_model}") with suppress_stdout_stderr(disable=verbose): - self.model = llama_cpp.llama_load_model_from_file( + model = llama_cpp.llama_load_model_from_file( self.path_model.encode("utf-8"), self.params ) - if self.model is None: + if model is None: raise ValueError(f"Failed to load model from file: {path_model}") + self.model = model + def free_model(): if self.model is None: return @@ -69,128 +72,83 @@ def __del__(self): self.close() def vocab_type(self) -> int: - assert self.model is not None return llama_cpp.llama_vocab_type(self.model) def n_vocab(self) -> int: - assert self.model is not None return llama_cpp.llama_n_vocab(self.model) def n_ctx_train(self) -> int: - assert self.model is not None return llama_cpp.llama_n_ctx_train(self.model) def n_embd(self) -> int: - assert self.model is not None return llama_cpp.llama_n_embd(self.model) def rope_freq_scale_train(self) -> float: - assert self.model is not None return llama_cpp.llama_rope_freq_scale_train(self.model) def desc(self) -> str: - assert self.model is not None buf = ctypes.create_string_buffer(1024) llama_cpp.llama_model_desc(self.model, buf, 1024) return buf.value.decode("utf-8") def size(self) -> int: - assert self.model is not None return llama_cpp.llama_model_size(self.model) def n_params(self) -> int: - assert self.model is not None return llama_cpp.llama_model_n_params(self.model) def get_tensor(self, name: str) -> ctypes.c_void_p: - assert self.model is not None return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8")) - def apply_lora_from_file( - self, - lora_path: str, - scale: float, - path_base_model: Optional[str], - n_threads: int, - ): - assert self.model is not None - return llama_cpp.llama_model_apply_lora_from_file( - self.model, - lora_path.encode("utf-8"), - scale, - ( - path_base_model.encode("utf-8") - if path_base_model is not None - 
else ctypes.c_char_p(0) - ), - n_threads, - ) - # Vocab def token_get_text(self, token: int) -> str: - # TODO: Fix - assert self.model is not None return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8") def token_get_score(self, token: int) -> float: - assert self.model is not None return llama_cpp.llama_token_get_score(self.model, token) def token_get_attr(self, token: int) -> int: - assert self.model is not None return llama_cpp.llama_token_get_attr(self.model, token) # Special tokens def token_bos(self) -> int: - assert self.model is not None return llama_cpp.llama_token_bos(self.model) def token_eos(self) -> int: - assert self.model is not None return llama_cpp.llama_token_eos(self.model) def token_cls(self) -> int: - assert self.model is not None return llama_cpp.llama_token_cls(self.model) def token_sep(self) -> int: - assert self.model is not None return llama_cpp.llama_token_sep(self.model) def token_nl(self) -> int: - assert self.model is not None return llama_cpp.llama_token_nl(self.model) def token_prefix(self) -> int: - assert self.model is not None return llama_cpp.llama_token_prefix(self.model) def token_middle(self) -> int: - assert self.model is not None return llama_cpp.llama_token_middle(self.model) def token_suffix(self) -> int: - assert self.model is not None return llama_cpp.llama_token_suffix(self.model) def token_eot(self) -> int: - assert self.model is not None return llama_cpp.llama_token_eot(self.model) def add_bos_token(self) -> bool: - assert self.model is not None return llama_cpp.llama_add_bos_token(self.model) def add_eos_token(self) -> bool: - assert self.model is not None return llama_cpp.llama_add_eos_token(self.model) # Tokenization def tokenize(self, text: bytes, add_bos: bool, special: bool): - assert self.model is not None n_ctx = self.n_ctx_train() tokens = (llama_cpp.llama_token * n_ctx)() n_tokens = llama_cpp.llama_tokenize( @@ -209,13 +167,11 @@ def tokenize(self, text: bytes, add_bos: bool, special: bool): return list(tokens[:n_tokens]) def token_to_piece(self, token: int, special: bool = False) -> bytes: - assert self.model is not None buf = ctypes.create_string_buffer(32) llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special) return bytes(buf) def detokenize(self, tokens: List[int], special: bool = False) -> bytes: - assert self.model is not None output = b"" size = 32 buffer = (ctypes.c_char * size)() @@ -235,7 +191,6 @@ def detokenize(self, tokens: List[int], special: bool = False) -> bytes: # Extra def metadata(self) -> Dict[str, str]: - assert self.model is not None metadata: Dict[str, str] = {} buffer_size = 1024 buffer = ctypes.create_string_buffer(buffer_size) @@ -272,14 +227,14 @@ def default_params(): return llama_cpp.llama_model_default_params() -class _LlamaContext: +class LlamaContext: """Intermediate Python wrapper for a llama.cpp llama_context. 
NOTE: For stability it's recommended you use the Llama class instead.""" def __init__( self, *, - model: _LlamaModel, + model: LlamaModel, params: llama_cpp.llama_context_params, verbose: bool = True, ): @@ -288,15 +243,13 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - self.ctx = None - - assert self.model.model is not None + ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) - self.ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) - - if self.ctx is None: + if ctx is None: raise ValueError("Failed to create llama_context") + self.ctx = ctx + def free_ctx(): if self.ctx is None: return @@ -312,40 +265,38 @@ def __del__(self): self.close() def n_ctx(self) -> int: - assert self.ctx is not None return llama_cpp.llama_n_ctx(self.ctx) def pooling_type(self) -> int: - assert self.ctx is not None return llama_cpp.llama_pooling_type(self.ctx) def kv_cache_clear(self): - assert self.ctx is not None llama_cpp.llama_kv_cache_clear(self.ctx) def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - assert self.ctx is not None llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - assert self.ctx is not None llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) def kv_cache_seq_keep(self, seq_id: int): - assert self.ctx is not None llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): - assert self.ctx is not None llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) def get_state_size(self) -> int: - assert self.ctx is not None return llama_cpp.llama_get_state_size(self.ctx) - def decode(self, batch: "_LlamaBatch"): - assert self.ctx is not None - assert batch.batch is not None + # TODO: copy_state_data + + # TODO: set_state_data + + # TODO: llama_load_session_file + + # TODO: llama_save_session_file + + def decode(self, batch: LlamaBatch): return_code = llama_cpp.llama_decode( self.ctx, batch.batch, @@ -354,25 +305,21 @@ def decode(self, batch: "_LlamaBatch"): raise RuntimeError(f"llama_decode returned {return_code}") def set_n_threads(self, n_threads: int, n_threads_batch: int): - assert self.ctx is not None llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) def get_logits(self): - assert self.ctx is not None return llama_cpp.llama_get_logits(self.ctx) def get_logits_ith(self, i: int): - assert self.ctx is not None return llama_cpp.llama_get_logits_ith(self.ctx, i) def get_embeddings(self): - assert self.ctx is not None return llama_cpp.llama_get_embeddings(self.ctx) # Sampling functions def set_rng_seed(self, seed: int): - assert self.ctx is not None + # TODO: Fix llama_cpp.llama_set_rng_seed(self.ctx, seed) def sample_repetition_penalties( @@ -384,7 +331,6 @@ def sample_repetition_penalties( penalty_freq: float, penalty_present: float, ): - assert self.ctx is not None llama_cpp.llama_sample_repetition_penalties( self.ctx, llama_cpp.byref(candidates.candidates), @@ -396,55 +342,39 @@ def sample_repetition_penalties( ) def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - assert self.ctx is not None llama_cpp.llama_sample_softmax( self.ctx, llama_cpp.byref(candidates.candidates), ) def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - assert self.ctx is not None llama_cpp.llama_sample_top_k( self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep ) def sample_top_p(self, 
candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - assert self.ctx is not None llama_cpp.llama_sample_top_p( self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - assert self.ctx is not None llama_cpp.llama_sample_min_p( self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) - def sample_tail_free( - self, candidates: "_LlamaTokenDataArray", z: float, min_keep: int - ): - assert self.ctx is not None - llama_cpp.llama_sample_tail_free( - self.ctx, llama_cpp.byref(candidates.candidates), z, min_keep - ) - def sample_typical( self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int ): - assert self.ctx is not None llama_cpp.llama_sample_typical( self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep ) def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - assert self.ctx is not None llama_cpp.llama_sample_temp( self.ctx, llama_cpp.byref(candidates.candidates), temp ) def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - assert self.ctx is not None - assert grammar.grammar is not None llama_cpp.llama_sample_grammar( self.ctx, llama_cpp.byref(candidates.candidates), @@ -459,7 +389,6 @@ def sample_token_mirostat( m: int, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - assert self.ctx is not None return llama_cpp.llama_sample_token_mirostat( self.ctx, llama_cpp.byref(candidates.candidates), @@ -476,7 +405,6 @@ def sample_token_mirostat_v2( eta: float, mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], ) -> int: - assert self.ctx is not None return llama_cpp.llama_sample_token_mirostat_v2( self.ctx, llama_cpp.byref(candidates.candidates), @@ -486,14 +414,12 @@ def sample_token_mirostat_v2( ) def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - assert self.ctx is not None return llama_cpp.llama_sample_token_greedy( self.ctx, llama_cpp.byref(candidates.candidates), ) def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - assert self.ctx is not None return llama_cpp.llama_sample_token( self.ctx, llama_cpp.byref(candidates.candidates), @@ -501,17 +427,13 @@ def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: # Grammar def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - assert self.ctx is not None - assert grammar.grammar is not None llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) def reset_timings(self): - assert self.ctx is not None - llama_cpp.llama_reset_timings(self.ctx) + llama_cpp.llama_perf_context_reset(self.ctx) def print_timings(self): - assert self.ctx is not None - llama_cpp.llama_print_timings(self.ctx) + llama_cpp.llama_perf_context_print(self.ctx) # Utility functions @staticmethod @@ -520,7 +442,7 @@ def default_params(): return llama_cpp.llama_context_default_params() -class _LlamaBatch: +class LlamaBatch: def __init__( self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True ): @@ -530,10 +452,12 @@ def __init__( self.verbose = verbose self._exit_stack = ExitStack() - self.batch = None - self.batch = llama_cpp.llama_batch_init( - self._n_tokens, self.embd, self.n_seq_max - ) + batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max) + + if batch is None: + raise ValueError("Failed to create llama_batch") + + self.batch = batch def free_batch(): if self.batch is None: @@ -550,15 +474,12 @@ def __del__(self): self.close() def n_tokens(self) -> int: - assert 
self.batch is not None return self.batch.n_tokens def reset(self): - assert self.batch is not None self.batch.n_tokens = 0 def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): - assert self.batch is not None n_tokens = len(batch) self.batch.n_tokens = n_tokens for i in range(n_tokens): @@ -570,7 +491,6 @@ def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): self.batch.logits[n_tokens - 1] = True def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): - assert self.batch is not None n_tokens = len(batch) n_tokens0 = self.batch.n_tokens self.batch.n_tokens += n_tokens @@ -584,7 +504,7 @@ def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): self.batch.logits[n_tokens - 1] = True -class _LlamaTokenDataArray: +class LlamaTokenDataArray: def __init__(self, *, n_vocab: int): self.n_vocab = n_vocab self.candidates_data = np.recarray( @@ -609,90 +529,10 @@ def copy_logits(self, logits: npt.NDArray[np.single]): self.candidates.size = self.n_vocab -# Python wrappers over common/common -def _tokenize(model: _LlamaModel, text: str, add_bos: bool, special: bool) -> list[int]: - assert model.model is not None - n_tokens = len(text) + 1 if add_bos else len(text) - result = (llama_cpp.llama_token * n_tokens)() - n_tokens = llama_cpp.llama_tokenize( - model.model, - text.encode("utf-8"), - len(text), - result, - n_tokens, - add_bos, - special, - ) - if n_tokens < 0: - result = (llama_cpp.llama_token * -n_tokens)() - check = llama_cpp.llama_tokenize( - model.model, - text.encode("utf-8"), - len(text), - result, - len(result), - add_bos, - special, - ) - if check != -n_tokens: - raise RuntimeError(f'Failed to tokenize: text="{text}" n_tokens={n_tokens}') - else: - result = result[:n_tokens] - return list(result) - - -def _token_to_piece(model: _LlamaModel, token: int, special: bool = False) -> str: - assert model.model is not None - result = (ctypes.c_char * 8)(0) - n_tokens = llama_cpp.llama_token_to_piece( - model.model, token, result, 0, len(result), special - ) - if n_tokens < 0: - result = (ctypes.c_char * -n_tokens)(0) - check = llama_cpp.llama_token_to_piece( - model.model, token, result, 0, len(result), special - ) - if check != -n_tokens: - raise RuntimeError(f"Failed to get piece: token={token}") - else: - result = result[:n_tokens] - return bytes(result).decode("utf-8") - - -def _detokenize_spm(model: _LlamaModel, tokens: List[int]) -> str: - bos_id = model.token_bos() - result = "" - for i, token in enumerate(tokens): - piece = _token_to_piece(model, token) - if ( - (tokens[0] == bos_id and i == 1) or (tokens[0] != bos_id and i == 0) - ) and piece[0] == " ": - piece = piece[1:] - result += piece - return result - - -def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str: - result = "" - for token in tokens: - piece = _token_to_piece(model, token) - result += piece - return result - - -def _should_add_bos(model: _LlamaModel) -> bool: - assert model.model is not None - add_bos = llama_cpp.llama_add_bos_token(model.model) - if add_bos: - return add_bos - else: - return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM - - # Embedding functions -def _normalize_embedding(embedding): +def normalize_embedding(embedding): norm = float(np.linalg.norm(embedding)) if norm == 0.0: return embedding @@ -703,7 +543,7 @@ def _normalize_embedding(embedding): @dataclass -class _LlamaSamplingParams: +class LlamaSamplingParams: n_prev: int = 64 n_probs: int = 0 top_k: int = 40 @@ -730,8 +570,8 @@ 
class _LlamaSamplingParams: @dataclass -class _LlamaSamplingContext: - params: _LlamaSamplingParams = field(default_factory=_LlamaSamplingParams) +class LlamaSamplingContext: + params: LlamaSamplingParams = field(default_factory=LlamaSamplingParams) mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float) grammar: Optional[LlamaGrammar] = None # NOTE: Missing parsed_grammar @@ -745,7 +585,7 @@ def reset(self): self.grammar.reset() def cp(self): - return _LlamaSamplingContext( + return LlamaSamplingContext( params=self.params, mirostat_mu=self.mirostat_mu, grammar=self.grammar, @@ -759,12 +599,12 @@ def last(self) -> Optional[int]: else: return None - def prev_str(self, ctx_main: _LlamaContext, n: int) -> str: + def prev_str(self, ctx_main: LlamaContext, n: int) -> str: return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8") def sample( self, - ctx_main: _LlamaContext, + ctx_main: LlamaContext, idx: int = 0, logits_array: Optional[npt.NDArray[np.single]] = None, ): @@ -782,7 +622,7 @@ def sample( for token, logit_bias in self.params.logit_bias.items(): logits_array[token] += logit_bias - token_data_array = _LlamaTokenDataArray( + token_data_array = LlamaTokenDataArray( n_vocab=n_vocab ) # TODO: Only create this once token_data_array.copy_logits(logits_array) @@ -838,9 +678,6 @@ def sample( ctx_main.sample_top_k( token_data_array, self.params.top_k, min_keep=min_keep ) - ctx_main.sample_tail_free( - token_data_array, self.params.tfs_z, min_keep=min_keep - ) ctx_main.sample_typical( token_data_array, self.params.typical_p, min_keep=min_keep ) @@ -854,7 +691,173 @@ def sample( id = ctx_main.sample_token(token_data_array) return id - def accept(self, ctx_main: _LlamaContext, id: int, apply_grammar: bool): + def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): if apply_grammar and self.grammar is not None: ctx_main.grammar_accept_token(self.grammar, id) - self.prev.append(id) \ No newline at end of file + self.prev.append(id) + + +from typing import List, Callable, Optional, Union +import ctypes +import llama_cpp + + +class CustomSampler: + def __init__( + self, apply_func: typing.Callable[[llama_cpp.llama_token_data_array], None] + ): + self.apply_func = apply_func + + def apply_wrapper( + sampler: llama_cpp.llama_sampler_p, + cur_p: llama_cpp.llama_token_data_array_p, + ): + self.apply_func(cur_p) + + def free_wrapper(sampler: llama_cpp.llama_sampler_p): + pass + + sampler_i = llama_cpp.llama_sampler_i() + sampler_i.apply = llama_cpp.llama_sampler_i_apply(apply_wrapper) + self._apply_wrapper_ref = apply_wrapper + + sampler_i.name = llama_cpp.llama_sampler_i_name(0) + sampler_i.accept = llama_cpp.llama_sampler_i_accept(0) + sampler_i.reset = llama_cpp.llama_sampler_i_reset(0) + sampler_i.clone = llama_cpp.llama_sampler_i_clone(0) + sampler_i.free = llama_cpp.llama_sampler_i_free(0) + + self.sampler = llama_cpp.llama_sampler() + self.sampler.iface = ctypes.pointer(sampler_i) + self.sampler.ctx = None + + def get_sampler(self) -> llama_cpp.llama_sampler_p: + return ctypes.pointer(self.sampler) + + +class LlamaSampler: + def __init__(self): + params = llama_cpp.llama_sampler_chain_params() + self.sampler = llama_cpp.llama_sampler_chain_init(params) + self.samplers: List[llama_cpp.llama_sampler_p] = [] + self.custom_samplers: List[Tuple[int, CustomSampler]] = [] + + def add_greedy(self): + sampler = llama_cpp.llama_sampler_init_greedy() + self._add_sampler(sampler) + + def add_dist(self, seed: int): + sampler = llama_cpp.llama_sampler_init_dist(seed) + 
self._add_sampler(sampler) + + def add_softmax(self): + sampler = llama_cpp.llama_sampler_init_softmax() + self._add_sampler(sampler) + + def add_top_k(self, k: int): + sampler = llama_cpp.llama_sampler_init_top_k(k) + self._add_sampler(sampler) + + def add_top_p(self, p: float, min_keep: int): + sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep) + self._add_sampler(sampler) + + def add_min_p(self, p: float, min_keep: int): + sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) + self._add_sampler(sampler) + + def add_typical(self, p: float, min_keep: int): + sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) + self._add_sampler(sampler) + + def add_temp(self, temp: float): + sampler = llama_cpp.llama_sampler_init_temp(temp) + self._add_sampler(sampler) + + def add_temp_ext(self, t: float, delta: float, exponent: float): + sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent) + self._add_sampler(sampler) + + def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): + sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) + self._add_sampler(sampler) + + def add_mirostat_v2(self, seed: int, tau: float, eta: float): + sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta) + self._add_sampler(sampler) + + def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): + sampler = llama_cpp.llama_sampler_init_grammar( + model.model, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") + ) + self._add_sampler(sampler) + + def add_penalties( + self, + n_vocab: int, + special_eos_id: int, + linefeed_id: int, + penalty_last_n: int, + penalty_repeat: float, + penalty_freq: float, + penalty_present: float, + penalize_nl: bool, + ignore_eos: bool, + ): + sampler = llama_cpp.llama_sampler_init_penalties( + n_vocab, + special_eos_id, + linefeed_id, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl, + ignore_eos, + ) + self._add_sampler(sampler) + + def init_logit_bias( + self, n_vocab: int, n_logit_bias, logit_bias: llama_cpp.llama_logit_bias_p + ): + sampler = llama_cpp.llama_sampler_init_logit_bias( + n_vocab, n_logit_bias, logit_bias + ) + self._add_sampler(sampler) + + def add_custom( + self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] + ): + custom_sampler = CustomSampler(apply_func) + sampler = custom_sampler.get_sampler() + self._add_sampler(sampler) + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + self.custom_samplers.append( + (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler) + ) + + def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): + assert self.sampler is not None + llama_cpp.llama_sampler_chain_add(self.sampler, sampler) + self.samplers.append(sampler) + + def get_seed(self) -> int: + assert self.sampler is not None + return llama_cpp.llama_sampler_get_seed(self.sampler) + + def sample(self, ctx: LlamaContext, idx: int) -> int: + assert self.sampler is not None + return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) + + def close(self): + if self.sampler: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_cpp.llama_sampler_chain_remove(self.sampler, i) + llama_cpp.llama_sampler_free(self.sampler) + self.sampler = None + self.samplers.clear() + self.custom_samplers.clear() + + def __del__(self): + self.close() diff --git a/nexa/gguf/llama/_utils_transformers.py 
b/nexa/gguf/llama/_utils_transformers.py index 0049e9cc..29628193 100644 --- a/nexa/gguf/llama/_utils_transformers.py +++ b/nexa/gguf/llama/_utils_transformers.py @@ -17,7 +17,7 @@ class suppress_stdout_stderr(object): sys = sys os = os - def __init__(self, disable: bool = False): + def __init__(self, disable: bool = True): self.disable = disable # Oddly enough this works better than the contextlib version @@ -75,4 +75,4 @@ class Singleton(object, metaclass=MetaSingleton): """ def __init__(self): - super(Singleton, self).__init__() \ No newline at end of file + super(Singleton, self).__init__() diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index 0007b515..4ceb378f 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -7,6 +7,7 @@ import json import ctypes import typing +import random import fnmatch import warnings import contextlib @@ -31,7 +32,12 @@ from nexa.gguf.llama.llama_types import * from nexa.gguf.llama.llama_grammar import LlamaGrammar -from nexa.gguf.llama.llama_cache import BaseLlamaCache +from nexa.gguf.llama.llama_cache import ( + BaseLlamaCache, + LlamaCache, # type: ignore + LlamaDiskCache, # type: ignore + LlamaRAMCache, # type: ignore +) from nexa.gguf.llama.llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import nexa.gguf.llama.llama_cpp as llama_cpp import nexa.gguf.llama.llama_chat_format as llama_chat_format @@ -41,15 +47,7 @@ import numpy as np import numpy.typing as npt -from nexa.gguf.llama._internals_transformers import ( - _LlamaModel, # type: ignore - _LlamaContext, # type: ignore - _LlamaBatch, # type: ignore - _LlamaTokenDataArray, # type: ignore - _LlamaSamplingParams, # type: ignore - _LlamaSamplingContext, # type: ignore - _normalize_embedding, # type: ignore -) +import nexa.gguf.llama._internals_transformers as internals from nexa.gguf.llama._logger_transformers import set_verbose from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr @@ -77,6 +75,7 @@ def __init__( seed: int = llama_cpp.LLAMA_DEFAULT_SEED, n_ctx: int = 512, n_batch: int = 512, + n_ubatch: int = 512, n_threads: Optional[int] = None, n_threads_batch: Optional[int] = None, rope_scaling_type: Optional[ @@ -90,7 +89,7 @@ def __init__( yarn_beta_fast: float = 32.0, yarn_beta_slow: float = 1.0, yarn_orig_ctx: int = 0, - logits_all: bool = True, # switch + logits_all: bool = False, embedding: bool = False, offload_kqv: bool = True, flash_attn: bool = False, @@ -158,6 +157,7 @@ def __init__( seed: RNG seed, -1 for random n_ctx: Text context, 0 = from model n_batch: Prompt processing maximum batch size + n_ubatch: Physical batch size n_threads: Number of threads to use for generation n_threads_batch: Number of threads to use for batch processing rope_scaling_type: RoPE scaling type, from `enum llama_rope_scaling_type`. 
ref: https://github.com/ggerganov/llama.cpp/pull/2054 @@ -258,28 +258,28 @@ def __init__( for i, (k, v) in enumerate(kv_overrides.items()): self._kv_overrides_array[i].key = k.encode("utf-8") if isinstance(v, bool): - self._kv_overrides_array[i].tag = ( - llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL - ) + self._kv_overrides_array[ + i + ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL self._kv_overrides_array[i].value.val_bool = v elif isinstance(v, int): - self._kv_overrides_array[i].tag = ( - llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT - ) + self._kv_overrides_array[ + i + ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT self._kv_overrides_array[i].value.val_i64 = v elif isinstance(v, float): - self._kv_overrides_array[i].tag = ( - llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT - ) + self._kv_overrides_array[ + i + ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT self._kv_overrides_array[i].value.val_f64 = v elif isinstance(v, str): # type: ignore v_bytes = v.encode("utf-8") if len(v_bytes) > 128: # TODO: Make this a constant raise ValueError(f"Value for {k} is too long: {v}") v_bytes = v_bytes.ljust(128, b"\0") - self._kv_overrides_array[i].tag = ( - llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR - ) + self._kv_overrides_array[ + i + ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR # copy min(v_bytes, 128) to str_value address = typing.cast( int, @@ -295,20 +295,23 @@ def __init__( else: raise ValueError(f"Unknown value type for {k}: {v}") - self._kv_overrides_array[-1].key = ( - b"\0" # ensure sentinel element is zeroed - ) + self._kv_overrides_array[ + -1 + ].key = b"\0" # ensure sentinel element is zeroed self.model_params.kv_overrides = self._kv_overrides_array self.n_batch = min(n_ctx, n_batch) # ??? self.n_threads = n_threads or max(multiprocessing.cpu_count() // 2, 1) self.n_threads_batch = n_threads_batch or multiprocessing.cpu_count() + # Used by the sampler + self._seed = seed or llama_cpp.LLAMA_DEFAULT_SEED + # Context Params self.context_params = llama_cpp.llama_context_default_params() - self.context_params.seed = seed self.context_params.n_ctx = n_ctx self.context_params.n_batch = self.n_batch + self.context_params.n_ubatch = min(self.n_batch, n_ubatch) self.context_params.n_threads = self.n_threads self.context_params.n_threads_batch = self.n_threads_batch self.context_params.rope_scaling_type = ( @@ -336,10 +339,9 @@ def __init__( yarn_beta_slow if yarn_beta_slow != 0.0 else 0 ) self.context_params.yarn_orig_ctx = yarn_orig_ctx if yarn_orig_ctx != 0 else 0 - # self.context_params.logits_all = ( - # logits_all if draft_model is None else True - # ) # Must be set to True for speculative decoding - self.context_params.logits_all = True + self.context_params.logits_all = ( + logits_all if draft_model is None else True + ) # Must be set to True for speculative decoding self.context_params.embeddings = embedding # TODO: Rename to embeddings self.context_params.offload_kqv = offload_kqv self.context_params.flash_attn = flash_attn @@ -364,7 +366,7 @@ def __init__( self._model = self._stack.enter_context( contextlib.closing( - _LlamaModel( + internals.LlamaModel( path_model=self.model_path, params=self.model_params, verbose=self.verbose, @@ -381,10 +383,11 @@ def __init__( self.n_batch = min(n_ctx, n_batch) self.context_params.n_ctx = self._model.n_ctx_train() self.context_params.n_batch = self.n_batch + self.context_params.n_ubatch = min(self.n_batch, n_ubatch) self._ctx = self._stack.enter_context( contextlib.closing( - _LlamaContext( + internals.LlamaContext( model=self._model, params=self.context_params, 
verbose=self.verbose, @@ -394,7 +397,7 @@ def __init__( self._batch = self._stack.enter_context( contextlib.closing( - _LlamaBatch( + internals.LlamaBatch( n_tokens=self.n_batch, embd=0, n_seq_max=self.context_params.n_ctx, @@ -406,7 +409,6 @@ def __init__( self._lora_adapter: Optional[llama_cpp.llama_lora_adapter_p] = None if self.lora_path: - assert self._model.model is not None self._lora_adapter = llama_cpp.llama_lora_adapter_init( self._model.model, self.lora_path.encode("utf-8"), @@ -424,7 +426,6 @@ def free_lora_adapter(): self._stack.callback(free_lora_adapter) - assert self._ctx.ctx is not None if llama_cpp.llama_lora_adapter_set( self._ctx.ctx, self._lora_adapter, self.lora_scale ): @@ -437,9 +438,9 @@ def free_lora_adapter(): self.chat_format = chat_format self.chat_handler = chat_handler - self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = ( - {} - ) + self._chat_handlers: Dict[ + str, llama_chat_format.LlamaChatCompletionHandler + ] = {} self.draft_model = draft_model @@ -449,12 +450,12 @@ def free_lora_adapter(): self._token_nl = self.token_nl() self._token_eos = self.token_eos() - self._candidates = _LlamaTokenDataArray(n_vocab=self._n_vocab) + self._candidates = internals.LlamaTokenDataArray(n_vocab=self._n_vocab) self.n_tokens = 0 self.input_ids: npt.NDArray[np.intc] = np.ndarray((n_ctx,), dtype=np.intc) self.scores: npt.NDArray[np.single] = np.ndarray( - (n_ctx, self._n_vocab), dtype=np.single + (n_ctx if logits_all == True else n_batch, self._n_vocab), dtype=np.single ) self._mirostat_mu = ctypes.c_float( @@ -538,14 +539,14 @@ def free_lora_adapter(): f"Using fallback chat format: {self.chat_format}", file=sys.stderr ) + self._sampler = None + @property def ctx(self) -> llama_cpp.llama_context_p: - assert self._ctx.ctx is not None return self._ctx.ctx @property def model(self) -> llama_cpp.llama_model_p: - assert self._model.model is not None return self._model.model @property @@ -586,7 +587,10 @@ def tokenize( return self.tokenizer_.tokenize(text, add_bos, special) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False + self, + tokens: List[int], + prev_tokens: Optional[List[int]] = None, + special: bool = False, ) -> bytes: """Detokenize a list of tokens. @@ -598,8 +602,10 @@ def detokenize( Returns: The detokenized string. """ - return self.tokenizer_.detokenize(tokens, prev_tokens=prev_tokens, special=special) - + return self.tokenizer_.detokenize( + tokens, prev_tokens=prev_tokens, special=special + ) + def set_cache(self, cache: Optional[BaseLlamaCache]): """Set the cache. @@ -614,8 +620,7 @@ def set_seed(self, seed: int): Args: seed: The random seed. """ - assert self._ctx.ctx is not None - llama_cpp.llama_set_rng_seed(self._ctx.ctx, seed) + self._seed = seed def reset(self): """Reset the model state.""" @@ -627,8 +632,6 @@ def eval(self, tokens: Sequence[int]): Args: tokens: The list of tokens to evaluate. 
""" - assert self._ctx.ctx is not None - assert self._batch.batch is not None self._ctx.kv_cache_seq_rm(-1, self.n_tokens, -1) for i in range(0, len(tokens), self.n_batch): batch = tokens[i : min(len(tokens), i + self.n_batch)] @@ -649,15 +652,106 @@ def eval(self, tokens: Sequence[int]): ) self.scores[n_past : n_past + n_tokens, :].reshape(-1)[::] = logits else: - rows = 1 - cols = self._n_vocab - logits = np.ctypeslib.as_array( - self._ctx.get_logits(), shape=(rows * cols,) - ) - self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits + # rows = 1 + # cols = self._n_vocab + # logits = np.ctypeslib.as_array( + # self._ctx.get_logits(), shape=(rows * cols,) + # ) + # self.scores[n_past + n_tokens - 1, :].reshape(-1)[::] = logits + # NOTE: Now that sampling is done inside the sampler, logits are only needed for logprobs which requires logits_all + pass # Update n_tokens self.n_tokens += n_tokens + def _init_sampler( + self, + top_k: int = 40, + top_p: float = 0.95, + min_p: float = 0.05, + typical_p: float = 1.0, + temp: float = 0.80, + repeat_penalty: float = 1.0, + frequency_penalty: float = 0.0, + presence_penalty: float = 0.0, + tfs_z: float = 1.0, + mirostat_mode: int = 0, + mirostat_eta: float = 0.1, + mirostat_tau: float = 5.0, + penalize_nl: bool = True, + logits_processor: Optional[LogitsProcessorList] = None, + grammar: Optional[LlamaGrammar] = None, + ): + sampler = internals.LlamaSampler() + + if logits_processor is not None: + # Create and add a custom sampler + def apply_func(token_data_array: llama_cpp.llama_token_data_array_p): + size = token_data_array.contents.size + data_soa = token_data_array.contents.data + data_soa_address = ctypes.addressof(data_soa.contents) + # NOTE: This is probably broken + recarray = np.recarray( + shape=(size,), + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], + align=True, + ), + buf=(llama_cpp.llama_token_data * size).from_address( + data_soa_address + ), + ) + for logit_processor in logits_processor: + recarray.logit[:] = logit_processor(self._input_ids, recarray.logit) + + sampler.add_custom(apply_func) + + sampler.add_penalties( + n_vocab=self._n_vocab, + special_eos_id=self._token_eos, + linefeed_id=self._token_nl, + penalty_last_n=self.last_n_tokens_size, + penalty_repeat=repeat_penalty, + penalty_freq=frequency_penalty, + penalty_present=presence_penalty, + penalize_nl=penalize_nl, + ignore_eos=False, + ) + + if grammar is not None: + sampler.add_grammar(self._model, grammar) + + if temp < 0.0: + sampler.add_softmax() + sampler.add_dist(self._seed) + elif temp == 0.0: + sampler.add_greedy() + else: + if mirostat_mode == 1: + mirostat_m = 100 + sampler.add_mirostat( + self._n_vocab, + self._seed, + mirostat_tau, + mirostat_eta, + mirostat_m, + ) + elif mirostat_mode == 2: + sampler.add_mirostat_v2( + self._seed, + mirostat_tau, + mirostat_eta, + ) + else: + n_probs = 0 + min_keep = max(1, n_probs) + sampler.add_top_k(top_k) + sampler.add_typical(typical_p, min_keep) + sampler.add_top_p(top_p, min_keep) + sampler.add_min_p(min_p, min_keep) + sampler.add_temp(temp) + sampler.add_dist(self._seed) + return sampler + def sample( self, top_k: int = 40, @@ -674,8 +768,6 @@ def sample( mirostat_tau: float = 5.0, penalize_nl: bool = True, logits_processor: Optional[LogitsProcessorList] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, grammar: Optional[LlamaGrammar] = None, idx: Optional[int] = None, ): @@ -690,69 +782,37 @@ def sample( Returns: The sampled token. 
""" - assert self._ctx is not None assert self.n_tokens > 0 - if idx is None: - logits: npt.NDArray[np.single] = self._scores[-1, :] - else: - logits = self._scores[idx, :] - - if logits_processor is not None: - logits[:] = ( - logits_processor(self._input_ids, logits) - if idx is None - else logits_processor(self._input_ids[: idx + 1], logits) + tmp_sampler = False + + if self._sampler is None: + tmp_sampler = True + self._sampler = self._init_sampler( + top_k=top_k, + top_p=top_p, + min_p=min_p, + typical_p=typical_p, + temp=temp, + repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + penalize_nl=penalize_nl, + logits_processor=logits_processor, + grammar=grammar, ) - sampling_params = _LlamaSamplingParams( - top_k=top_k, - top_p=top_p, - min_p=min_p, - tfs_z=tfs_z, - typical_p=typical_p, - temp=temp, - penalty_last_n=self.last_n_tokens_size, - penalty_repeat=repeat_penalty, - penalty_freq=frequency_penalty, - penalty_present=presence_penalty, - mirostat=mirostat_mode, - mirostat_tau=mirostat_tau, - mirostat_eta=mirostat_eta, - penalize_nl=penalize_nl, - ) - sampling_context = _LlamaSamplingContext( - params=sampling_params, - grammar=grammar, - ) - sampling_context.prev = list(self.eval_tokens) - id = sampling_context.sample(ctx_main=self._ctx, logits_array=logits) - sampling_context.accept( - ctx_main=self._ctx, - id=id, - apply_grammar=grammar is not None, - ) + ridx = idx - self.n_tokens if idx is not None else -1 - if logprobs is not None and (top_logprobs is not None and top_logprobs > 0): - sampled_logprobs = self.logits_to_logprobs(logits) - token_logprob = float(sampled_logprobs[id]) - - top_logprobs_dict = None - if top_logprobs is not None: - sorted_indices = sampled_logprobs.argsort()[::-1] - top_indices = sorted_indices[:top_logprobs] - top_logprobs_dict = { - self.detokenize([i]).decode("utf-8", errors="ignore"): float(sampled_logprobs[i]) - for i in top_indices - } - - return { - "token": id, - "token_logprob": token_logprob, - "top_logprobs": top_logprobs_dict - } - else: - return id + assert self.ctx is not None + token = self._sampler.sample(self._ctx, ridx) + if tmp_sampler: + self._sampler = None + return token def generate( self, @@ -772,8 +832,6 @@ def generate( mirostat_eta: float = 0.1, penalize_nl: bool = True, logits_processor: Optional[LogitsProcessorList] = None, - logprobs: Optional[bool] = None, - top_logprobs: Optional[int] = None, stopping_criteria: Optional[StoppingCriteriaList] = None, grammar: Optional[LlamaGrammar] = None, ) -> Generator[int, Optional[Sequence[int]], None]: @@ -798,6 +856,23 @@ def generate( """ # Reset mirostat sampling self._mirostat_mu = ctypes.c_float(2.0 * mirostat_tau) + self._sampler = self._init_sampler( + top_k=top_k, + top_p=top_p, + min_p=min_p, + typical_p=typical_p, + temp=temp, + repeat_penalty=repeat_penalty, + frequency_penalty=frequency_penalty, + presence_penalty=presence_penalty, + tfs_z=tfs_z, + mirostat_mode=mirostat_mode, + mirostat_tau=mirostat_tau, + mirostat_eta=mirostat_eta, + penalize_nl=penalize_nl, + logits_processor=logits_processor, + grammar=grammar, + ) # Check for kv cache prefix match if reset and self.n_tokens > 0: @@ -812,16 +887,19 @@ def generate( tokens = tokens[longest_prefix:] self.n_tokens = longest_prefix if self.verbose: - print(f"Llama.generate: {longest_prefix} prefix-match hit, " - f"remaining {len(tokens)} prompt tokens to eval", 
file=sys.stderr) + print( + f"Llama.generate: {longest_prefix} prefix-match hit, " + f"remaining {len(tokens)} prompt tokens to eval", + file=sys.stderr, + ) # Reset the model state if reset: self.reset() - # Reset the grammar - if grammar is not None: - grammar.reset() + # # Reset the grammar + # if grammar is not None: + # grammar.reset() sample_idx = self.n_tokens + len(tokens) - 1 tokens = list(tokens) @@ -830,7 +908,7 @@ def generate( while True: self.eval(tokens) while sample_idx < self.n_tokens: - result = self.sample( + token = self.sample( top_k=top_k, top_p=top_p, min_p=min_p, @@ -844,26 +922,17 @@ def generate( mirostat_tau=mirostat_tau, mirostat_eta=mirostat_eta, logits_processor=logits_processor, - logprobs=logprobs, - top_logprobs=top_logprobs, grammar=grammar, penalize_nl=penalize_nl, idx=sample_idx, ) - if isinstance(result, dict): - token = result["token"] - logprobs_info = result - else: - token = result - logprobs_info = None - sample_idx += 1 if stopping_criteria is not None and stopping_criteria( - self._input_ids, self._scores[-1, :] + self._input_ids[: sample_idx], self._scores[sample_idx - self.n_tokens, :] ): return - tokens_or_none = yield token, logprobs_info + tokens_or_none = yield token tokens.clear() tokens.append(token) if tokens_or_none is not None: @@ -896,7 +965,6 @@ def create_embedding( Returns: An embedding object. """ - assert self._model.model is not None model_name: str = model if model is not None else self.model_path input = input if isinstance(input, list) else [input] @@ -941,7 +1009,6 @@ def embed( Returns: A list of embeddings """ - assert self._ctx.ctx is not None n_embd = self.n_embd() n_batch = self.n_batch @@ -955,7 +1022,7 @@ def embed( ) if self.verbose: - llama_cpp.llama_reset_timings(self._ctx.ctx) + llama_cpp.llama_perf_context_reset(self._ctx.ctx) if isinstance(input, str): inputs = [input] @@ -969,7 +1036,6 @@ def embed( data: Union[List[List[float]], List[List[List[float]]]] = [] def decode_batch(seq_sizes: List[int]): - assert self._ctx.ctx is not None llama_cpp.llama_kv_cache_clear(self._ctx.ctx) self._ctx.decode(self._batch) self._batch.reset() @@ -984,7 +1050,9 @@ def decode_batch(seq_sizes: List[int]): for j in range(size) ] if normalize: - embedding = [_normalize_embedding(e) for e in embedding] + embedding = [ + internals.normalize_embedding(e) for e in embedding + ] data.append(embedding) pos += size else: @@ -992,7 +1060,7 @@ def decode_batch(seq_sizes: List[int]): ptr = llama_cpp.llama_get_embeddings_seq(self._ctx.ctx, i) embedding: List[float] = ptr[:n_embd] if normalize: - embedding = _normalize_embedding(embedding) + embedding = internals.normalize_embedding(embedding) data.append(embedding) # init state @@ -1035,7 +1103,7 @@ def decode_batch(seq_sizes: List[int]): decode_batch(s_batch) if self.verbose: - llama_cpp.llama_print_timings(self._ctx.ctx) + llama_cpp.llama_perf_context_print(self._ctx.ctx) output = data[0] if isinstance(input, str) else data @@ -1077,7 +1145,6 @@ def _create_completion( ) -> Union[ Iterator[CreateCompletionResponse], Iterator[CreateCompletionStreamResponse] ]: - assert self._ctx is not None assert suffix is None or suffix.__class__ is str completion_id: str = f"cmpl-{str(uuid.uuid4())}" @@ -1222,7 +1289,7 @@ def logit_bias_processor( raise ValueError( "logprobs is not supported for models created with logits_all=False" ) - + if self.cache: try: cache_item = self.cache[prompt_tokens] @@ -1241,13 +1308,13 @@ def logit_bias_processor( print("Llama._create_completion: cache miss", 
file=sys.stderr) if seed is not None: - self._ctx.set_rng_seed(seed) + self.set_seed(seed) + else: + self.set_seed(random.Random(self._seed).randint(0, 2 ** 32)) finish_reason = "length" multibyte_fix = 0 - logprobs_or_none = None - - for token, logprobs_info in self.generate( + for token in self.generate( prompt_tokens, top_k=top_k, top_p=top_p, @@ -1263,11 +1330,8 @@ def logit_bias_processor( repeat_penalty=repeat_penalty, stopping_criteria=stopping_criteria, logits_processor=logits_processor, - logprobs=logprobs, - top_logprobs=logprobs, grammar=grammar, ): - assert self._model.model is not None if llama_cpp.llama_token_is_eog(self._model.model, token): text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) finish_reason = "stop" @@ -1275,20 +1339,6 @@ def logit_bias_processor( completion_tokens.append(token) - if logprobs_info and logprobs_or_none is None: - logprobs_or_none = { - "tokens": [], - "text_offset": [], - "token_logprobs": [], - "top_logprobs": [] - } - - if logprobs_info: - logprobs_or_none["tokens"].append(self.detokenize([token]).decode("utf-8", errors="ignore")) - logprobs_or_none["text_offset"].append(len(self.detokenize(completion_tokens[:-1]))) - logprobs_or_none["token_logprobs"].append(logprobs_info["token_logprob"]) - logprobs_or_none["top_logprobs"].append(logprobs_info["top_logprobs"]) - all_text = self.detokenize(completion_tokens, prev_tokens=prompt_tokens) # Contains multi-byte UTF8 @@ -1468,15 +1518,15 @@ def logit_bias_processor( if stream: remaining_tokens = completion_tokens[returned_tokens:] - all_text = self.detokenize( + remaining_text = self.detokenize( remaining_tokens, prev_tokens=prompt_tokens + completion_tokens[:returned_tokens], ) - any_stop = [s for s in stop_sequences if s in all_text] + any_stop = [s for s in stop_sequences if s in remaining_text] if len(any_stop) > 0: - end = min(all_text.index(stop) for stop in any_stop) + end = min(remaining_text.index(stop) for stop in any_stop) else: - end = len(all_text) + end = len(remaining_text) token_end_position = 0 for token in remaining_tokens: @@ -1487,7 +1537,7 @@ def logit_bias_processor( ) ) - # logprobs_or_none: Optional[CompletionLogprobs] = None + logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: if token == bos_token_id: continue @@ -1572,10 +1622,7 @@ def logit_bias_processor( { "text": "", "index": 0, - "delta": { - "content": "", - }, - "logprobs": logprobs_or_none, + "logprobs": None, "finish_reason": finish_reason, } ], @@ -1601,7 +1648,7 @@ def logit_bias_processor( if suffix_token_id < 0 and suffix is not None: text_str = text_str + suffix - # logprobs_or_none: Optional[CompletionLogprobs] = None + logprobs_or_none: Optional[CompletionLogprobs] = None if logprobs is not None: text_offset = 0 if echo else len(prompt) token_offset = 0 if echo else len(prompt_tokens[1:]) @@ -1985,7 +2032,7 @@ def create_chat_completion_openai_v1( *args: Any, **kwargs: Any, ): - """Generate a chat completion with return type based on the OpenAI v1 API. + """Generate a chat completion with return type based on the the OpenAI v1 API. OpenAI python package is required to use this method. 
@@ -2029,6 +2076,7 @@ def __getstate__(self): seed=self.context_params.seed, n_ctx=self.context_params.n_ctx, n_batch=self.n_batch, + n_ubatch=self.context_params.n_ubatch, n_threads=self.context_params.n_threads, n_threads_batch=self.context_params.n_threads_batch, rope_scaling_type=self.context_params.rope_scaling_type, @@ -2069,7 +2117,6 @@ def __setstate__(self, state): self.__init__(**state) def save_state(self) -> LlamaState: - assert self._ctx.ctx is not None if self.verbose: print("Llama.save_state: saving llama state", file=sys.stderr) state_size = llama_cpp.llama_get_state_size(self._ctx.ctx) @@ -2096,15 +2143,17 @@ def save_state(self) -> LlamaState: n_tokens=self.n_tokens, llama_state=bytes(llama_state_compact), llama_state_size=n_bytes, + seed=self._seed, ) def load_state(self, state: LlamaState) -> None: - assert self._ctx.ctx is not None # Only filling in up to `n_tokens` and then zero-ing out the rest self.scores[: state.n_tokens, :] = state.scores.copy() - self.scores[state.n_tokens :, :] = 0.0 + rest = self.scores[state.n_tokens :, :] + rest[rest > 0] = 0.0 self.input_ids = state.input_ids.copy() self.n_tokens = state.n_tokens + self._seed = state.seed state_size = state.llama_state_size LLamaStateArrayType = ctypes.c_uint8 * state_size llama_state = LLamaStateArrayType.from_buffer_copy(state.llama_state) @@ -2147,62 +2196,6 @@ def pooling_type(self) -> str: def close(self) -> None: """Explicitly free the model from memory.""" self._stack.close() - - def unload_lora(self): - """Unload the LoRA adapter while keeping the base model in memory.""" - if self._lora_adapter is not None: - llama_cpp.llama_lora_adapter_clear(self._ctx.ctx) - llama_cpp.llama_lora_adapter_free(self._lora_adapter) - self._lora_adapter = None - self.lora_path = None - self.lora_scale = 1.0 - - def reload_lora(self, lora_path: str, lora_scale: float = 1.0): - """Reload a LoRA adapter from the given path. - - Args: - lora_path: Path to the LoRA adapter file - lora_scale: Scale to apply to the LoRA adapter (default: 1.0) - - Raises: - RuntimeError: If initialization or setting of the LoRA adapter fails - """ - # First unload any existing LoRA adapter - if self._lora_adapter is not None: - self.unload_lora() - - # Initialize new LoRA adapter - assert self._model.model is not None - self._lora_adapter = llama_cpp.llama_lora_adapter_init( - self._model.model, - lora_path.encode("utf-8"), - ) - if self._lora_adapter is None: - raise RuntimeError( - f"Failed to initialize LoRA adapter from lora path: {lora_path}" - ) - - def free_lora_adapter(): - if self._lora_adapter is None: - return - llama_cpp.llama_lora_adapter_free(self._lora_adapter) - self._lora_adapter = None - - self._stack.callback(free_lora_adapter) - - # Apply the LoRA adapter - assert self._ctx.ctx is not None - if llama_cpp.llama_lora_adapter_set( - self._ctx.ctx, self._lora_adapter, lora_scale - ): - # Clean up on failure - self.unload_lora() - raise RuntimeError( - f"Failed to set LoRA adapter from lora path: {lora_path}" - ) - - self.lora_path = lora_path - self.lora_scale = lora_scale def __del__(self) -> None: self.close() @@ -2240,6 +2233,7 @@ def from_pretrained( cls, repo_id: str, filename: Optional[str], + additional_files: Optional[List] = None, local_dir: Optional[Union[str, os.PathLike[str]]] = None, local_dir_use_symlinks: Union[bool, Literal["auto"]] = "auto", cache_dir: Optional[Union[str, os.PathLike[str]]] = None, @@ -2252,6 +2246,7 @@ def from_pretrained( Args: repo_id: The model repo id. 
filename: A filename or glob pattern to match the model file in the repo. + additional_files: A list of filenames or glob patterns to match additional model files in the repo. local_dir: The local directory to save the model to. local_dir_use_symlinks: Whether to use symlinks when downloading the model. **kwargs: Additional keyword arguments to pass to the Llama constructor. @@ -2282,6 +2277,7 @@ def from_pretrained( rel_path = Path(file).relative_to(repo_id) file_list.append(str(rel_path)) + # find the only/first shard file: matching_files = [file for file in file_list if fnmatch.fnmatch(file, filename)] # type: ignore if len(matching_files) == 0: @@ -2311,6 +2307,35 @@ def from_pretrained( cache_dir=cache_dir, ) + if additional_files: + for additonal_file_name in additional_files: + # find the additional shard file: + matching_additional_files = [file for file in file_list if fnmatch.fnmatch(file, additonal_file_name)] + + if len(matching_additional_files) == 0: + raise ValueError( + f"No file found in {repo_id} that match {additonal_file_name}\n\n" + f"Available Files:\n{json.dumps(file_list)}" + ) + + if len(matching_additional_files) > 1: + raise ValueError( + f"Multiple files found in {repo_id} matching {additonal_file_name}\n\n" + f"Available Files:\n{json.dumps(files)}" + ) + + (matching_additional_file,) = matching_additional_files + + # download the additional file + hf_hub_download( + repo_id=repo_id, + filename=matching_additional_file, + subfolder=subfolder, + local_dir=local_dir, + local_dir_use_symlinks=local_dir_use_symlinks, + cache_dir=cache_dir, + ) + if local_dir is None: model_path = hf_hub_download( repo_id=repo_id, @@ -2324,6 +2349,7 @@ def from_pretrained( else: model_path = os.path.join(local_dir, filename) + # loading the first file of a sharded GGUF loads all remaining shard files in the subfolder return cls( model_path=model_path, **kwargs, @@ -2338,12 +2364,14 @@ def __init__( n_tokens: int, llama_state: bytes, llama_state_size: int, + seed: int, ): self.input_ids = input_ids self.scores = scores self.n_tokens = n_tokens self.llama_state = llama_state self.llama_state_size = llama_state_size + self.seed = seed LogitsProcessor = Callable[ diff --git a/nexa/gguf/llama/llama_cache.py b/nexa/gguf/llama/llama_cache.py index 54f22eb7..05c5a0fa 100644 --- a/nexa/gguf/llama/llama_cache.py +++ b/nexa/gguf/llama/llama_cache.py @@ -9,7 +9,7 @@ import diskcache -import nexa.gguf.llama as llama_cpp +import nexa.gguf.llama.llama from nexa.gguf.llama.llama_types import * @@ -52,9 +52,9 @@ class LlamaRAMCache(BaseLlamaCache): def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes - self.cache_state: OrderedDict[Tuple[int, ...], "llama_cpp.llama.LlamaState"] = ( - OrderedDict() - ) + self.cache_state: OrderedDict[ + Tuple[int, ...], "llama_cpp.llama.LlamaState" + ] = OrderedDict() @property def cache_size(self): @@ -152,4 +152,4 @@ def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): while self.cache_size > self.capacity_bytes and len(self.cache) > 0: key_to_remove = next(iter(self.cache)) del self.cache[key_to_remove] - print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) \ No newline at end of file + print("LlamaDiskCache.__setitem__: trim", file=sys.stderr) diff --git a/nexa/gguf/llama/llama_chat_format.py b/nexa/gguf/llama/llama_chat_format.py index ff5cd06d..aeee3399 100644 --- a/nexa/gguf/llama/llama_chat_format.py +++ b/nexa/gguf/llama/llama_chat_format.py @@ -304,7 
+304,6 @@ def _convert_text_completion_chunks_to_chat( } ], } - yield { "id": "chat" + chunk["id"], "model": chunk["model"], @@ -1010,7 +1009,7 @@ def format_qwen( **kwargs: Any, ) -> ChatFormatterResponse: _roles = dict(user="<|im_start|>user", assistant="<|im_start|>assistant") - system_message = "You are a helpful assistant." + system_message = _get_system_message(messages) or "You are a helpful assistant." system_template = "<|im_start|>system\n{system_message}" system_message = system_template.format(system_message=system_message) _messages = _map_roles(messages, _roles) @@ -1364,34 +1363,6 @@ def format_gemma( return ChatFormatterResponse(prompt=_prompt, stop=_sep) -@register_chat_format("octopusv2") -def format_octopus_v2( - messages: List[llama_types.ChatCompletionRequestMessage], - **kwargs: Any, -) -> ChatFormatterResponse: - system_message = "Below is the query from the users, please call the correct function and generate the parameters to call the function.\n\n" - _roles = dict(user="Query:", assistant="Response:") - _sep = "\n\n" - _messages = _map_roles(messages, _roles) - - # Assuming the last message should be the assistant's response - _messages.append((_roles["assistant"], None)) - - # Concatenating the prompt - _prompt = system_message - for role, content in _messages: - if content: - _prompt += f"{role} {content.strip()}{_sep}" - else: - _prompt += f"{role} " - - # The final prompt - _prompt = _prompt.strip() - - # Returning the formatted response - return ChatFormatterResponse(prompt=_prompt, stop=_sep) - - # Tricky chat formats that require custom chat handlers @@ -2736,6 +2707,31 @@ def last_image_embed_free(): def load_image(self, image_url: str) -> bytes: return self._load_image(image_url) + def _embed_image_bytes(self, image_bytes: bytes, n_threads_batch: int = 1): + if ( + self._last_image_embed is not None + and self._last_image_hash is not None + and hash(image_bytes) == self._last_image_hash + ): + return self._last_image_embed + with suppress_stdout_stderr(disable=self.verbose): + # Free the previous image embed + if self._last_image_embed is not None: + self._llava_cpp.llava_image_embed_free(self._last_image_embed) + self._last_image_embed = None + self._last_image_hash = None + embed = self._llava_cpp.llava_image_embed_make_with_bytes( + self.clip_ctx, + n_threads_batch, + (ctypes.c_uint8 * len(image_bytes)).from_buffer( + bytearray(image_bytes) + ), + len(image_bytes), + ) + self._last_image_embed = embed + self._last_image_hash = hash(image_bytes) + return embed + def __call__( self, *, @@ -2798,30 +2794,9 @@ def __call__( ) split_text = self.split_text_on_image_urls(text, image_urls) - def embed_image_bytes(image_bytes: bytes): - if ( - self._last_image_embed is not None - and self._last_image_hash is not None - and hash(image_bytes) == self._last_image_hash - ): - return self._last_image_embed - with suppress_stdout_stderr(disable=self.verbose): - # Free the previous image embed - if self._last_image_embed is not None: - self._llava_cpp.llava_image_embed_free(self._last_image_embed) - self._last_image_embed = None - self._last_image_hash = None - embed = self._llava_cpp.llava_image_embed_make_with_bytes( - self.clip_ctx, - llama.context_params.n_threads_batch, - (ctypes.c_uint8 * len(image_bytes)).from_buffer( - bytearray(image_bytes) - ), - len(image_bytes), - ) - self._last_image_embed = embed - self._last_image_hash = hash(image_bytes) - return embed + if self.verbose: + print(text, file=sys.stderr) + # Evaluate prompt llama.reset() @@ -2838,7 
+2813,7 @@ def embed_image_bytes(image_bytes: bytes): llama.eval(tokens) else: image_bytes = self.load_image(value) - embed = embed_image_bytes(image_bytes) + embed = self._embed_image_bytes(image_bytes, llama.context_params.n_threads_batch) if llama.n_tokens + embed.contents.n_image_pos > llama.n_ctx(): raise ValueError( f"Prompt exceeds n_ctx: {llama.n_tokens + embed.contents.n_image_pos} > {llama.n_ctx()}" @@ -3337,6 +3312,44 @@ class Llama3VisionAlphaChatHandler(Llava15ChatHandler): Llama3VisionAlpha = Llama3VisionAlphaChatHandler +class MiniCPMv26ChatHandler(Llava15ChatHandler): + DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant." + + CHAT_FORMAT = ( + "{% for message in messages %}" + "{% if loop.first and messages[0]['role'] != 'system' %}" + "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n" + "{% endif %}" + "<|im_start|>{{ message['role'] }}\n" + "{% if message['content'] is iterable %}" + "{% for content in message['content'] %}" + "{% if content.type == 'image_url' %}" + "{% if content.image_url is string %}" + "{{ content.image_url }}" + "{% endif %}" + "{% if content.image_url is mapping %}" + "{{ content.image_url.url }}" + "{% endif %}" + "{% endif %}" + "{% endfor %}" + + "{% for content in message['content'] %}" + "{% if content.type == 'text' %}" + "{{ content.text }}" + "{% endif %}" + "{% endfor %}" + "{% endif %}" + "{% if message['content'] is string %}" + "{{ message['content'] }}" + "{% endif %}" + "<|im_end|>\n" + "{% endfor %}" + "{% if add_generation_prompt %}" + "<|im_start|>assistant\n" + "{% endif %}" + ) + + @register_chat_completion_handler("chatml-function-calling") def chatml_function_calling( llama: llama.Llama, @@ -3777,4 +3790,4 @@ def chatml_function_calling( }, } - raise ValueError("Automatic streaming tool choice is not supported") \ No newline at end of file + raise ValueError("Automatic streaming tool choice is not supported") diff --git a/nexa/gguf/llama/llama_cpp.py b/nexa/gguf/llama/llama_cpp.py index 442d2e86..3f4b9baa 100644 --- a/nexa/gguf/llama/llama_cpp.py +++ b/nexa/gguf/llama/llama_cpp.py @@ -1,90 +1,44 @@ from __future__ import annotations -import sys import os import ctypes -import functools import pathlib from typing import ( - Any, Callable, - List, Union, NewType, Optional, TYPE_CHECKING, - TypeVar, - Generic, ) -from typing_extensions import TypeAlias +from typing_extensions import ( + byref, + ctypes_function_for_shared_library, +) + +if TYPE_CHECKING: + from typing_extensions import ( + CtypesCData, + CtypesArray, + CtypesPointer, + CtypesVoidPointer, + CtypesRef, + CtypesPointerOrRef, + CtypesFuncPointer, + ) + from nexa.gguf.lib_utils import load_library +from nexa.gguf.llama._ctypes_extensions import ctypes_function_for_shared_library # Specify the base name of the shared library to load _lib_base_name = "llama" - # Load the library _lib = load_library(_lib_base_name) -# ctypes sane type hint helpers -# -# - Generic Pointer and Array types -# - PointerOrRef type with a type hinted byref function -# -# NOTE: Only use these for static type checking not for runtime checks -# no good will come of that - -if TYPE_CHECKING: - CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore - - CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore - - CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore - - CtypesVoidPointer: TypeAlias = ctypes.c_void_p - - class CtypesRef(Generic[CtypesCData]): - pass - - CtypesPointerOrRef: TypeAlias = Union[ - CtypesPointer[CtypesCData], 
CtypesRef[CtypesCData] - ] - - CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore - -F = TypeVar("F", bound=Callable[..., Any]) - - -def ctypes_function_for_shared_library(lib: ctypes.CDLL): - def ctypes_function( - name: str, argtypes: List[Any], restype: Any, enabled: bool = True - ): - def decorator(f: F) -> F: - if enabled: - func = getattr(lib, name) - func.argtypes = argtypes - func.restype = restype - functools.wraps(f)(func) - return func - else: - return f - - return decorator - - return ctypes_function - - ctypes_function = ctypes_function_for_shared_library(_lib) -def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCData]: - """Type-annotated version of ctypes.byref""" - ... - - -byref = ctypes.byref # type: ignore - # from ggml.h # // NOTE: always add types at the end of the enum to keep backward compatibility # enum ggml_type { @@ -148,11 +102,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa GGML_TYPE_I64 = 27 GGML_TYPE_F64 = 28 GGML_TYPE_IQ1_M = 29 -GGML_TYPE_BF16 = 30, -GGML_TYPE_Q4_0_4_4 = 31 -GGML_TYPE_Q4_0_4_8 = 32 -GGML_TYPE_Q4_0_8_8 = 33 -GGML_TYPE_COUNT = 34 +GGML_TYPE_COUNT = 30 # from ggml-backend.h # typedef bool (*ggml_backend_sched_eval_callback)(struct ggml_tensor * t, bool ask, void * user_data); @@ -176,6 +126,9 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # define LLAMA_DEFAULT_SEED 0xFFFFFFFF LLAMA_DEFAULT_SEED = 0xFFFFFFFF +# define LLAMA_TOKEN_NULL -1 +LLAMA_TOKEN_NULL = -1 + # define LLAMA_FILE_MAGIC_GGLA 0x67676c61u // 'ggla' LLAMA_FILE_MAGIC_GGLA = 0x67676C61 @@ -187,8 +140,8 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # define LLAMA_SESSION_MAGIC LLAMA_FILE_MAGIC_GGSN LLAMA_SESSION_MAGIC = LLAMA_FILE_MAGIC_GGSN -# define LLAMA_SESSION_VERSION 8 -LLAMA_SESSION_VERSION = 8 +# define LLAMA_SESSION_VERSION 9 +LLAMA_SESSION_VERSION = 9 # define LLAMA_STATE_SEQ_MAGIC LLAMA_FILE_MAGIC_GGSQ LLAMA_STATE_SEQ_MAGIC = LLAMA_FILE_MAGIC_GGSQ @@ -203,6 +156,9 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa llama_context_p = NewType("llama_context_p", int) llama_context_p_ctypes = ctypes.c_void_p +# # struct llama_sampler; +# llama_sampler_p = NewType("llama_sampler_p", int) +# llama_sampler_p_ctypes = ctypes.c_void_p # typedef int32_t llama_pos; llama_pos = ctypes.c_int32 @@ -263,6 +219,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_VOCAB_PRE_TYPE_BLOOM = 23, # LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24, # LLAMA_VOCAB_PRE_TYPE_EXAONE = 25, +# LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26, # }; LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0 LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1 @@ -290,6 +247,7 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_VOCAB_PRE_TYPE_BLOOM = 23 LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24 LLAMA_VOCAB_PRE_TYPE_EXAONE = 25 +LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26 # // note: these values should be synchronized with ggml_rope @@ -447,12 +405,14 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa # LLAMA_POOLING_TYPE_MEAN = 1, # LLAMA_POOLING_TYPE_CLS = 2, # LLAMA_POOLING_TYPE_LAST = 3, +# LLAMA_POOLING_TYPE_RANK = 4, // used by reranking models to attach the classification head to the graph # }; LLAMA_POOLING_TYPE_UNSPECIFIED = -1 LLAMA_POOLING_TYPE_NONE = 0 LLAMA_POOLING_TYPE_MEAN = 1 LLAMA_POOLING_TYPE_CLS = 2 LLAMA_POOLING_TYPE_LAST = 3 +LLAMA_POOLING_TYPE_RANK = 4 # enum 
llama_attention_type { # LLAMA_ATTENTION_TYPE_UNSPECIFIED = -1, @@ -463,10 +423,11 @@ def byref(obj: CtypesCData, offset: Optional[int] = None) -> CtypesRef[CtypesCDa LLAMA_ATTENTION_TYPE_CAUSAL = 0 LLAMA_ATTENTION_TYPE_NON_CAUSAL = 1 + # enum llama_split_mode { -# LLAMA_SPLIT_MODE_NONE = 0, // single GPU -# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs -# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs +# LLAMA_SPLIT_MODE_NONE = 0, // single GPU +# LLAMA_SPLIT_MODE_LAYER = 1, // split layers and KV across GPUs +# LLAMA_SPLIT_MODE_ROW = 2, // split rows across GPUs # }; LLAMA_SPLIT_MODE_NONE = 0 LLAMA_SPLIT_MODE_LAYER = 1 @@ -502,8 +463,11 @@ class llama_token_data(ctypes.Structure): # typedef struct llama_token_data_array { +# // TODO: consider SoA +# // NOTE: this pointer can be modified by the samplers # llama_token_data * data; # size_t size; +# int64_t selected; // this is the index in the data array (i.e. not the token id) # bool sorted; # } llama_token_data_array; class llama_token_data_array(ctypes.Structure): @@ -512,16 +476,19 @@ class llama_token_data_array(ctypes.Structure): Attributes: data (ctypes.Array[llama_token_data]): token data size (int): size of the array + selected (int): index in the data array (i.e. not the token id) sorted (bool): whether the array is sorted""" if TYPE_CHECKING: data: CtypesArray[llama_token_data] size: int + selected: int sorted: bool _fields_ = [ ("data", llama_token_data_p), ("size", ctypes.c_size_t), + ("selected", ctypes.c_int64), ("sorted", ctypes.c_bool), ] @@ -541,8 +508,11 @@ class llama_token_data_array(ctypes.Structure): # // - token : the token ids of the input (used when embd is NULL) # // - embd : token embeddings (i.e. float vector of size n_embd) (used when token is NULL) # // - pos : the positions of the respective token in the sequence +# // (if set to NULL, the token position will be tracked automatically by llama_decode) # // - seq_id : the sequence to which the respective token belongs +# // (if set to NULL, the sequence ID will be assumed to be 0) # // - logits : if zero, the logits (and/or the embeddings) for the respective token will not be output +# // (if set to NULL, only the logits for last token will be returned) # // # typedef struct llama_batch { # int32_t n_tokens; @@ -553,16 +523,6 @@ class llama_token_data_array(ctypes.Structure): # int32_t * n_seq_id; # llama_seq_id ** seq_id; # int8_t * logits; // TODO: rename this to "output" - - -# // NOTE: helpers for smooth API transition - can be deprecated in the future -# // for future-proof code, use the above fields instead and ignore everything below -# // -# // pos[i] = all_pos_0 + i*all_pos_1 -# // -# llama_pos all_pos_0; // used if pos == NULL -# llama_pos all_pos_1; // used if pos == NULL -# llama_seq_id all_seq_id; // used if seq_id == NULL # } llama_batch; class llama_batch(ctypes.Structure): """Input data for llama_decode @@ -597,9 +557,6 @@ class llama_batch(ctypes.Structure): ("n_seq_id", ctypes.POINTER(ctypes.c_int32)), ("seq_id", ctypes.POINTER(ctypes.POINTER(llama_seq_id))), ("logits", ctypes.POINTER(ctypes.c_int8)), - ("all_pos_0", llama_pos), - ("all_pos_1", llama_pos), - ("all_seq_id", llama_seq_id), ] @@ -740,7 +697,6 @@ class llama_model_params(ctypes.Structure): # // NOTE: changing the default values of parameters marked as [EXPERIMENTAL] may cause crashes or incorrect results in certain configurations # // https://github.com/ggerganov/llama.cpp/pull/7544 # struct llama_context_params { -# uint32_t seed; // RNG seed, -1 for random 
# uint32_t n_ctx; // text context, 0 = from model # uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode # uint32_t n_ubatch; // physical maximum batch size @@ -773,6 +729,7 @@ class llama_model_params(ctypes.Structure): # bool embeddings; // if true, extract embeddings (together with logits) # bool offload_kqv; // whether to offload the KQV ops (including the KV cache) to GPU # bool flash_attn; // whether to use flash attention [EXPERIMENTAL] +# bool no_perf; // whether to measure performance timings # // Abort callback @@ -785,7 +742,6 @@ class llama_context_params(ctypes.Structure): """Parameters for llama_context Attributes: - seed (int): RNG seed, -1 for random n_ctx (int): text context, 0 = from model n_batch (int): logical maximum batch size that can be submitted to llama_decode n_ubatch (int): physical maximum batch size @@ -816,7 +772,6 @@ class llama_context_params(ctypes.Structure): """ if TYPE_CHECKING: - seed: int n_ctx: int n_batch: int n_ubatch: int @@ -846,7 +801,6 @@ class llama_context_params(ctypes.Structure): abort_callback_data: ctypes.c_void_p _fields_ = [ - ("seed", ctypes.c_uint32), ("n_ctx", ctypes.c_uint32), ("n_batch", ctypes.c_uint32), ("n_ubatch", ctypes.c_uint32), @@ -952,101 +906,44 @@ class llama_model_quantize_params(ctypes.Structure): ] -# // grammar types -# struct llama_grammar; -llama_grammar_p = ctypes.c_void_p - -# // grammar element type -# enum llama_gretype { -# // end of rule definition -# LLAMA_GRETYPE_END = 0, - -# // start of alternate definition for rule -# LLAMA_GRETYPE_ALT = 1, - -# // non-terminal element: reference to rule -# LLAMA_GRETYPE_RULE_REF = 2, - -# // terminal element: character (code point) -# LLAMA_GRETYPE_CHAR = 3, - -# // inverse char(s) ([^a], [^a-b] [^abc]) -# LLAMA_GRETYPE_CHAR_NOT = 4, - -# // modifies a preceding LLAMA_GRETYPE_CHAR or LLAMA_GRETYPE_CHAR_ALT to -# // be an inclusive range ([a-z]) -# LLAMA_GRETYPE_CHAR_RNG_UPPER = 5, +# typedef struct llama_logit_bias { +# llama_token token; +# float bias; +# } llama_logit_bias; +class llama_logit_bias(ctypes.Structure): + """Used to store logit bias -# // modifies a preceding LLAMA_GRETYPE_CHAR or -# // LLAMA_GRETYPE_CHAR_RNG_UPPER to add an alternate char to match ([ab], [a-zA]) -# LLAMA_GRETYPE_CHAR_ALT = 6, + Attributes: + token (llama_token): token id + bias (float): bias""" -# // any character (.) 
-# LLAMA_GRETYPE_CHAR_ANY = 7, -# }; -LLAMA_GRETYPE_END = 0 -LLAMA_GRETYPE_ALT = 1 -LLAMA_GRETYPE_RULE_REF = 2 -LLAMA_GRETYPE_CHAR = 3 -LLAMA_GRETYPE_CHAR_NOT = 4 -LLAMA_GRETYPE_CHAR_RNG_UPPER = 5 -LLAMA_GRETYPE_CHAR_ALT = 6 -LLAMA_GRETYPE_CHAR_ANY = 7 - - -# typedef struct llama_grammar_element { -# enum llama_gretype type; -# uint32_t value; // Unicode code point or rule ID -# } llama_grammar_element; -class llama_grammar_element(ctypes.Structure): if TYPE_CHECKING: - type: int - value: int + token: llama_token + bias: float _fields_ = [ - ("type", ctypes.c_int), - ("value", ctypes.c_uint32), + ("token", llama_token), + ("bias", ctypes.c_float), ] -llama_grammar_element_p = ctypes.POINTER(llama_grammar_element) +llama_logit_bias_p = ctypes.POINTER(llama_logit_bias) -# // performance timing information -# struct llama_timings { -# double t_start_ms; -# double t_end_ms; -# double t_load_ms; -# double t_sample_ms; -# double t_p_eval_ms; -# double t_eval_ms; +# typedef struct llama_sampler_chain_params { +# bool no_perf; // whether to measure performance timings +# } llama_sampler_chain_params; +class llama_sampler_chain_params(ctypes.Structure): + """Parameters for llama_sampler_chain + + Attributes: + no_perf (bool): whether to measure performance timings""" -# int32_t n_sample; -# int32_t n_p_eval; -# int32_t n_eval; -# }; -class llama_timings(ctypes.Structure): if TYPE_CHECKING: - t_start_ms: float - t_end_ms: float - t_load_ms: float - t_sample_ms: float - t_p_eval_ms: float - t_eval_ms: float - n_sample: int - n_p_eval: int - n_eval: int + no_perf: bool _fields_ = [ - ("t_start_ms", ctypes.c_double), - ("t_end_ms", ctypes.c_double), - ("t_load_ms", ctypes.c_double), - ("t_sample_ms", ctypes.c_double), - ("t_p_eval_ms", ctypes.c_double), - ("t_eval_ms", ctypes.c_double), - ("n_sample", ctypes.c_int32), - ("n_p_eval", ctypes.c_int32), - ("n_eval", ctypes.c_int32), + ("no_perf", ctypes.c_bool), ] @@ -1069,7 +966,7 @@ class llama_chat_message(ctypes.Structure): # // Helpers for getting default parameters -# LLAMA_API struct llama_model_params llama_model_default_params(void); +# LLAMA_API struct llama_model_params llama_model_default_params(void); @ctypes_function( "llama_model_default_params", [], @@ -1080,7 +977,7 @@ def llama_model_default_params() -> llama_model_params: ... -# LLAMA_API struct llama_context_params llama_context_default_params(void); +# LLAMA_API struct llama_context_params llama_context_default_params(void); @ctypes_function( "llama_context_default_params", [], @@ -1091,6 +988,17 @@ def llama_context_default_params() -> llama_context_params: ... +# LLAMA_API struct llama_sampler_chain_params llama_sampler_chain_default_params(void); +@ctypes_function( + "llama_sampler_chain_default_params", + [], + llama_sampler_chain_params, +) +def llama_sampler_chain_default_params() -> llama_sampler_chain_params: + """Get default parameters for llama_sampler_chain""" + ... + + # LLAMA_API struct llama_model_quantize_params llama_model_quantize_default_params(void); @ctypes_function( "llama_model_quantize_default_params", @@ -1171,7 +1079,7 @@ def llama_backend_free(): # LLAMA_API struct llama_model * llama_load_model_from_file( # const char * path_model, -# struct llama_model_params params); +# struct llama_model_params params); @ctypes_function( "llama_load_model_from_file", [ctypes.c_char_p, llama_model_params], @@ -1253,9 +1161,9 @@ def llama_supports_gpu_offload() -> bool: ... 
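Editor's note: the hunks above drop the grammar-element plumbing in favour of `llama_logit_bias` and `llama_sampler_chain_params`, and keep the default-parameter helpers. A small sketch of how those helpers might be exercised from Python, assuming only this module's import path:

```python
# Hedged sketch of the default-parameter helpers bound in this hunk.
from nexa.gguf.llama import llama_cpp  # assumed import path for this file

mparams = llama_cpp.llama_model_default_params()
cparams = llama_cpp.llama_context_default_params()
sparams = llama_cpp.llama_sampler_chain_default_params()

cparams.n_ctx = 2048              # llama_context_params no longer carries a `seed` field
print(cparams.n_ctx, sparams.no_perf)  # no_perf is the single field of llama_sampler_chain_params
```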
-# LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); -@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) -def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: +# LLAMA_API bool llama_supports_rpc (void); +@ctypes_function("llama_supports_rpc", [], ctypes.c_bool) +def llama_supports_rpc() -> bool: ... @@ -1283,24 +1191,6 @@ def llama_n_seq_max(ctx: llama_context_p, /) -> int: ... -# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); -@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) -def llama_pooling_type(ctx: llama_context_p, /) -> int: - ... - - -# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); -@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_vocab_type(model: llama_model_p, /) -> int: - ... - - -# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); -@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) -def llama_rope_type(model: llama_model_p, /) -> int: - ... - - # LLAMA_API int32_t llama_n_vocab (const struct llama_model * model); @ctypes_function("llama_n_vocab", [llama_model_p_ctypes], ctypes.c_int32) def llama_n_vocab(model: llama_model_p, /) -> int: @@ -1325,6 +1215,36 @@ def llama_n_layer(model: llama_model_p, /) -> int: ... +# LLAMA_API int32_t llama_n_head (const struct llama_model * model); +@ctypes_function("llama_n_head", [llama_model_p_ctypes], ctypes.c_int32) +def llama_n_head(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API const struct llama_model * llama_get_model(const struct llama_context * ctx); +@ctypes_function("llama_get_model", [llama_context_p_ctypes], llama_model_p_ctypes) +def llama_get_model(ctx: llama_context_p, /) -> Optional[llama_model_p]: + ... + + +# LLAMA_API enum llama_pooling_type llama_pooling_type(const struct llama_context * ctx); +@ctypes_function("llama_pooling_type", [llama_context_p_ctypes], ctypes.c_int) +def llama_pooling_type(ctx: llama_context_p, /) -> int: + ... + + +# LLAMA_API enum llama_vocab_type llama_vocab_type (const struct llama_model * model); +@ctypes_function("llama_vocab_type", [llama_model_p_ctypes], ctypes.c_int) +def llama_vocab_type(model: llama_model_p, /) -> int: + ... + + +# LLAMA_API enum llama_rope_type llama_rope_type (const struct llama_model * model); +@ctypes_function("llama_rope_type", [llama_model_p_ctypes], ctypes.c_int) +def llama_rope_type(model: llama_model_p, /) -> int: + ... + + # // Get the model's RoPE frequency scaling factor # LLAMA_API float llama_rope_freq_scale_train(const struct llama_model * model); @ctypes_function("llama_rope_freq_scale_train", [llama_model_p_ctypes], ctypes.c_float) @@ -1492,10 +1412,10 @@ def llama_model_decoder_start_token(model: llama_model_p, /) -> int: # // Returns true if the model is recurrent (like Mamba, RWKV, etc.) # LLAMA_API bool llama_model_is_recurrent(const struct llama_model * model); -# @ctypes_function("llama_model_is_recurrent", [llama_model_p_ctypes], ctypes.c_bool) -# def llama_model_is_recurrent(model: llama_model_p, /) -> bool: -# """Returns true if the model is recurrent (like Mamba, RWKV, etc.)""" -# ... +@ctypes_function("llama_model_is_recurrent", [llama_model_p_ctypes], ctypes.c_bool) +def llama_model_is_recurrent(model: llama_model_p, /) -> bool: + """Returns true if the model is recurrent (like Mamba, RWKV, etc.)""" + ... 
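Editor's note: besides reordering the metadata getters, this hunk exposes `llama_n_head` and enables the previously commented-out `llama_model_is_recurrent`. A hedged probe of a loaded model might look like the sketch below; `llama_backend_init` and `llama_free_model` are assumed to be bound elsewhere in this module, and the GGUF path is a placeholder.

```python
# Hedged sketch: querying model metadata through the getters in this hunk.
from nexa.gguf.llama import llama_cpp  # assumed import path

llama_cpp.llama_backend_init()  # assumed to be bound earlier in this module
model = llama_cpp.llama_load_model_from_file(
    b"./model.gguf", llama_cpp.llama_model_default_params()
)
if model is not None:
    print("n_vocab   :", llama_cpp.llama_n_vocab(model))
    print("n_layer   :", llama_cpp.llama_n_layer(model))
    print("n_head    :", llama_cpp.llama_n_head(model))              # newly exposed above
    print("recurrent :", llama_cpp.llama_model_is_recurrent(model))  # now enabled above
    llama_cpp.llama_free_model(model)  # assumed to be bound elsewhere in this module
llama_cpp.llama_backend_free()
```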
# // Returns 0 on success @@ -1983,7 +1903,7 @@ def llama_kv_cache_update(ctx: llama_context_p, /): # // Returns the *actual* size in bytes of the state -# // (rng, logits, embedding and kv_cache) +# // (logits, embedding and kv_cache) # // Only use when saving the state, not when restoring it, otherwise the size may be too small. # LLAMA_API size_t llama_state_get_size(struct llama_context * ctx); @ctypes_function("llama_state_get_size", [llama_context_p_ctypes], ctypes.c_size_t) @@ -2332,30 +2252,26 @@ def llama_state_seq_load_file( # // -# // Return batch for single sequence of tokens starting at pos_0 +# // Return batch for single sequence of tokens +# // The sequence ID will be fixed to 0 +# // The position of the tokens will be tracked automatically by llama_decode # // # // NOTE: this is a helper function to facilitate transition to the new batch API - avoid using it # // # LLAMA_API struct llama_batch llama_batch_get_one( # llama_token * tokens, -# int32_t n_tokens, -# llama_pos pos_0, -# llama_seq_id seq_id); +# int32_t n_tokens); @ctypes_function( "llama_batch_get_one", [ llama_token_p, - ctypes.c_int, - llama_pos, - llama_seq_id, + ctypes.c_int32, ], llama_batch, ) def llama_batch_get_one( tokens: CtypesArray[llama_token], n_tokens: Union[ctypes.c_int, int], - pos_0: Union[llama_pos, int], - seq_id: llama_seq_id, /, ) -> llama_batch: """Return batch for single sequence of tokens starting at pos_0 @@ -2602,7 +2518,8 @@ def llama_get_embeddings_ith( # // Get the embeddings for a sequence id # // Returns NULL if pooling_type is LLAMA_POOLING_TYPE_NONE -# // shape: [n_embd] (1-dimensional) +# // when pooling_type == LLAMA_POOLING_TYPE_RANK, returns float[1] with the rank of the sequence +# // otherwise: float[n_embd] (1-dimensional) # LLAMA_API float * llama_get_embeddings_seq(struct llama_context * ctx, llama_seq_id seq_id); @ctypes_function( "llama_get_embeddings_seq", @@ -2692,6 +2609,13 @@ def llama_token_eos(model: llama_model_p, /) -> int: ... +# LLAMA_API llama_token llama_token_eot(const struct llama_model * model); // end-of-turn +@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) +def llama_token_eot(model: llama_model_p, /) -> int: + """end-of-turn""" + ... + + # LLAMA_API llama_token llama_token_cls(const struct llama_model * model); // classification @ctypes_function("llama_token_cls", [llama_model_p_ctypes], llama_token) def llama_token_cls(model: llama_model_p, /) -> int: @@ -2726,34 +2650,60 @@ def llama_add_eos_token(model: llama_model_p, /) -> bool: # // Codellama infill tokens -# LLAMA_API llama_token llama_token_prefix(const struct llama_model * model); // Beginning of infill prefix +# DEPRECATED(LLAMA_API llama_token llama_token_prefix(const struct llama_model * model), "use llama_token_fim_pre instead"); @ctypes_function("llama_token_prefix", [llama_model_p_ctypes], llama_token) def llama_token_prefix(model: llama_model_p) -> int: """codellama infill tokens""" ... -# LLAMA_API llama_token llama_token_middle(const struct llama_model * model); // Beginning of infill middle +# DEPRECATED(LLAMA_API llama_token llama_token_middle(const struct llama_model * model), "use llama_token_fim_mid instead"); @ctypes_function("llama_token_middle", [llama_model_p_ctypes], llama_token) def llama_token_middle(model: llama_model_p, /) -> int: ... 
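Editor's note: `llama_batch_get_one` loses its `pos_0` and `seq_id` arguments above; positions are now tracked by `llama_decode` and the sequence id defaults to 0. A minimal sketch of the new call shape, with made-up token ids:

```python
# Hedged sketch of the new llama_batch_get_one(tokens, n_tokens) helper.
import ctypes
from nexa.gguf.llama import llama_cpp  # assumed import path

token_ids = [1, 15043, 3186]                               # placeholder token ids
tokens = (ctypes.c_int32 * len(token_ids))(*token_ids)     # llama_token is int32_t

# pos_0 / seq_id are gone: positions are tracked by llama_decode and the
# sequence id defaults to 0, per the comments in this hunk.
batch = llama_cpp.llama_batch_get_one(tokens, len(token_ids))
print(batch.n_tokens)                                      # -> 3
```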
-# LLAMA_API llama_token llama_token_suffix(const struct llama_model * model); // Beginning of infill suffix +# DEPRECATED(LLAMA_API llama_token llama_token_suffix(const struct llama_model * model), "use llama_token_fim_suf instead"); @ctypes_function("llama_token_suffix", [llama_model_p_ctypes], llama_token) def llama_token_suffix(model: llama_model_p, /) -> int: ... -# LLAMA_API llama_token llama_token_eot (const struct llama_model * model); // End of infill middle -@ctypes_function("llama_token_eot", [llama_model_p_ctypes], llama_token) -def llama_token_eot(model: llama_model_p, /) -> int: +# LLAMA_API llama_token llama_token_fim_pre(const struct llama_model * model); +@ctypes_function("llama_token_fim_pre", [llama_model_p_ctypes], llama_token) +def llama_token_fim_pre(model: llama_model_p, /) -> int: ... +# LLAMA_API llama_token llama_token_fim_suf(const struct llama_model * model); +@ctypes_function("llama_token_fim_suf", [llama_model_p_ctypes], llama_token) +def llama_token_fim_suf(model: llama_model_p, /) -> int: + ... + +# LLAMA_API llama_token llama_token_fim_mid(const struct llama_model * model); +@ctypes_function("llama_token_fim_mid", [llama_model_p_ctypes], llama_token) +def llama_token_fim_mid(model: llama_model_p, /) -> int: + ... + +# LLAMA_API llama_token llama_token_fim_pad(const struct llama_model * model); +@ctypes_function("llama_token_fim_pad", [llama_model_p_ctypes], llama_token) +def llama_token_fim_pad(model: llama_model_p, /) -> int: + ... + +# LLAMA_API llama_token llama_token_fim_rep(const struct llama_model * model); +@ctypes_function("llama_token_fim_rep", [llama_model_p_ctypes], llama_token) +def llama_token_fim_rep(model: llama_model_p, /) -> int: + ... + +# LLAMA_API llama_token llama_token_fim_sep(const struct llama_model * model); +@ctypes_function("llama_token_fim_sep", [llama_model_p_ctypes], llama_token) +def llama_token_fim_sep(model: llama_model_p, /) -> int: + ... # // # // Tokenization # // +# // The API is thread-safe. +# // # /// @details Convert the provided text into tokens. @@ -2860,6 +2810,23 @@ def llama_token_to_piece( ... +# # // check if token0 is contained as a prefix in token1 +# # LLAMA_API bool llama_token_is_prefix( +# # const struct llama_model * model, +# # llama_token token0, +# # llama_token token1); +# @ctypes_function( +# "llama_token_is_prefix", +# [llama_model_p_ctypes, llama_token, llama_token], +# ctypes.c_bool, +# ) +# def llama_token_is_prefix( +# model: llama_model_p, token0: Union[llama_token, int], token1: Union[llama_token, int], / +# ) -> bool: +# """Check if token0 is contained as a prefix in token1""" +# ... + + # /// @details Convert the provided tokens into text (inverse of llama_tokenize()). # /// @param text The char pointer must be large enough to hold the resulting text. # /// @return Returns the number of chars/bytes on success, no more than text_len_max. 
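Editor's note: the deprecated `llama_token_prefix/middle/suffix` getters are superseded by the `llama_token_fim_*` family above. The sketch below assembles a fill-in-the-middle token sequence; the `<pre> prefix <suf> suffix <mid>` ordering is the usual FIM convention rather than something this patch defines, and the prefix/suffix token lists are assumed to come from the tokenizer.

```python
# Hedged sketch: building a FIM token sequence with the new llama_token_fim_* getters.
from nexa.gguf.llama import llama_cpp  # assumed import path

def build_fim_tokens(model, prefix_tokens, suffix_tokens):
    fim_pre = llama_cpp.llama_token_fim_pre(model)
    fim_suf = llama_cpp.llama_token_fim_suf(model)
    fim_mid = llama_cpp.llama_token_fim_mid(model)
    if min(fim_pre, fim_suf, fim_mid) < 0:  # LLAMA_TOKEN_NULL (-1): token not defined by the model
        raise ValueError("model does not define FIM special tokens")
    return [fim_pre, *prefix_tokens, fim_suf, *suffix_tokens, fim_mid]
```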
@@ -2954,413 +2921,315 @@ def llama_chat_apply_template( # // -# // Grammar +# // Sampling API +# // +# // Sample usage: +# // +# // // prepare the sampling chain at the start +# // auto sparams = llama_sampler_chain_default_params(); +# // +# // llama_sampler * smpl = llama_sampler_chain_init(sparams); +# // +# // llama_sampler_chain_add(smpl, llama_sampler_init_top_k(50)); +# // llama_sampler_chain_add(smpl, llama_sampler_init_top_p(0.9, 1)); +# // llama_sampler_chain_add(smpl, llama_sampler_init_temp (0.8)); +# // +# // // typically, the chain should end with a sampler such as "greedy", "dist" or "mirostat" +# // // this sampler will be responsible to select the actual token +# // llama_sampler_chain_add(smpl, llama_sampler_init_dist(seed)); +# // +# // ... +# // +# // // decoding loop: +# // while (...) { +# // ... +# // +# // llama_decode(ctx, batch); +# // +# // // sample from the logits of the last token in the batch +# // const llama_token id = llama_sampler_sample(smpl, ctx, -1); +# // +# // // accepting the token updates the internal state of certain samplers (e.g. grammar, repetition, etc.) +# // llama_sampler_accept(smpl, id); +# // ... +# // } +# // +# // llama_sampler_free(smpl); +# // +# // TODO: In the future, llama_sampler will be utilized to offload the sampling to the backends (e.g. GPU). +# // TODO: in the future, the entire sampling API that uses llama_model should start using llama_vocab # // +# typedef void * llama_sampler_context_t; +llama_sampler_context_t = ctypes.c_void_p -# LLAMA_API struct llama_grammar * llama_grammar_init( -# const llama_grammar_element ** rules, -# size_t n_rules, -# size_t start_rule_index); -@ctypes_function( - "llama_grammar_init", - [ - ctypes.POINTER(llama_grammar_element_p), - ctypes.c_size_t, - ctypes.c_size_t, - ], - llama_grammar_p, -) -def llama_grammar_init( - rules: CtypesArray[ - CtypesPointer[llama_grammar_element] - ], # NOTE: This might be wrong type sig - n_rules: Union[ctypes.c_size_t, int], - start_rule_index: Union[ctypes.c_size_t, int], - /, -) -> Optional[llama_grammar_p]: - """Initialize a grammar from a set of rules.""" + +# // user code can implement the interface below in order to create custom llama_sampler +# struct llama_sampler_i { +# const char * (*name) (const struct llama_sampler * smpl); // can be NULL +# void (*accept)( struct llama_sampler * smpl, llama_token token); // can be NULL +# void (*apply) ( struct llama_sampler * smpl, llama_token_data_array * cur_p); // required +# void (*reset) ( struct llama_sampler * smpl); // can be NULL +# struct llama_sampler * (*clone) (const struct llama_sampler * smpl); // can be NULL if ctx is NULL +# void (*free) ( struct llama_sampler * smpl); // can be NULL if ctx is NULL +# +# // TODO: API for internal libllama usage for appending the sampling to an existing ggml_cgraph +# //void (*apply_ggml) (struct llama_sampler * smpl, ...); +# }; +class llama_sampler_i(ctypes.Structure): ... 
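Editor's note: the usage comment above describes the new chain-based sampling flow in C. A minimal Python mirror of the chain-construction part, using only the sampler bindings declared in this file, might look like the sketch below; the seed and sampling values are illustrative.

```python
# Hedged sketch mirroring the C usage comment above with the ctypes bindings
# declared in this file. Error handling is omitted.
from nexa.gguf.llama import llama_cpp  # assumed import path

sparams = llama_cpp.llama_sampler_chain_default_params()
chain = llama_cpp.llama_sampler_chain_init(sparams)

llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(50))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_p(0.9, 1))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_temp(0.8))
# the chain should end with a selector such as dist / greedy / mirostat
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_dist(1234))

# per the comment above, sampling and accepting tokens happens inside the decode loop;
# freeing the chain also frees the samplers that were added to it.
llama_cpp.llama_sampler_free(chain)
```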
-# LLAMA_API void llama_grammar_free(struct llama_grammar * grammar); -@ctypes_function( - "llama_grammar_free", - [llama_grammar_p], - None, +# struct llama_sampler { +# struct llama_sampler_i * iface; +# llama_sampler_context_t ctx; +# }; +class llama_sampler(ctypes.Structure): + _fields_ = [ + ("iface", ctypes.POINTER(llama_sampler_i)), + ("ctx", llama_sampler_context_t), + ] + + +if TYPE_CHECKING: + llama_sampler_p = CtypesPointer[llama_sampler] + +llama_sampler_p_ctypes = ctypes.POINTER(llama_sampler) + +llama_sampler_i_name = ctypes.CFUNCTYPE(ctypes.c_char_p, llama_sampler_p_ctypes) +llama_sampler_i_accept = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes, llama_token) +llama_sampler_i_apply = ctypes.CFUNCTYPE( + None, llama_sampler_p_ctypes, llama_token_data_array_p ) -def llama_grammar_free(grammar: llama_grammar_p, /): - """Free a grammar.""" - ... +llama_sampler_i_reset = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) +llama_sampler_i_clone = ctypes.CFUNCTYPE(llama_sampler_p_ctypes, llama_sampler_p_ctypes) +llama_sampler_i_free = ctypes.CFUNCTYPE(None, llama_sampler_p_ctypes) + +llama_sampler_i._fields_ = [ + ("name", llama_sampler_i_name), + ("accept", llama_sampler_i_accept), + ("apply", llama_sampler_i_apply), + ("reset", llama_sampler_i_reset), + ("clone", llama_sampler_i_clone), + ("free", llama_sampler_i_free), +] -# LLAMA_API struct llama_grammar * llama_grammar_copy(const struct llama_grammar * grammar); +# // mirror of llama_sampler_i: +# LLAMA_API const char * llama_sampler_name (const struct llama_sampler * smpl); @ctypes_function( - "llama_grammar_copy", - [llama_grammar_p], - llama_grammar_p, + "llama_sampler_name", + [llama_sampler_p_ctypes], + ctypes.c_char_p, ) -def llama_grammar_copy(grammar: llama_grammar_p, /) -> llama_grammar_p: - """Copy a grammar.""" +def llama_sampler_name(smpl: llama_sampler_p, /) -> bytes: ... -# /// @details Apply constraints from grammar -# LLAMA_API void llama_grammar_sample( -# const struct llama_grammar * grammar, -# const struct llama_context * ctx, -# llama_token_data_array * candidates); +# LLAMA_API void llama_sampler_accept( struct llama_sampler * smpl, llama_token token); @ctypes_function( - "llama_grammar_sample", - [ - llama_grammar_p, - llama_context_p_ctypes, - llama_token_data_array_p, - ], + "llama_sampler_accept", + [llama_sampler_p_ctypes, llama_token], None, ) -def llama_grammar_sample( - grammar: llama_grammar_p, - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - /, -): - """Apply constraints from grammar""" +def llama_sampler_accept(smpl: llama_sampler_p, token: Union[llama_token, int], /): ... 
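Editor's note: the functions above mirror the `llama_sampler_i` interface one-to-one. A short lifecycle sketch with a standalone sampler, assuming `llama_sampler_init_greedy`, `llama_sampler_reset`, and `llama_sampler_free` bound further down in this file:

```python
# Hedged sketch: exercising the mirror-of-iface helpers on a standalone sampler.
from nexa.gguf.llama import llama_cpp  # assumed import path

smpl = llama_cpp.llama_sampler_init_greedy()           # declared further down in this file
print(llama_cpp.llama_sampler_name(smpl).decode())     # expected to print something like "greedy"
llama_cpp.llama_sampler_accept(smpl, 42)               # updates stateful samplers; no-op for greedy
llama_cpp.llama_sampler_reset(smpl)
llama_cpp.llama_sampler_free(smpl)                     # safe here: the sampler was never chained
```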
-# LLAMA_API DEPRECATED(void llama_sample_grammar( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# const struct llama_grammar * grammar), -# "use llama_grammar_sample instead"); +# LLAMA_API void llama_sampler_apply ( struct llama_sampler * smpl, llama_token_data_array * cur_p); @ctypes_function( - "llama_sample_grammar", - [llama_context_p_ctypes, llama_token_data_array_p, llama_grammar_p], + "llama_sampler_apply", + [llama_sampler_p_ctypes, llama_token_data_array_p], None, ) -def llama_sample_grammar( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - grammar, # type: llama_grammar_p - /, +def llama_sampler_apply( + smpl: llama_sampler_p, cur_p: CtypesArray[llama_token_data_array], / ): - """Apply constraints from grammar - - Parameters: - candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - grammar: A grammar object containing the rules and constraints to apply to the generated text. - """ ... -# /// @details Accepts the sampled token into the grammar -# LLAMA_API void llama_grammar_accept_token( -# struct llama_grammar * grammar, -# struct llama_context * ctx, -# llama_token token); +# LLAMA_API void llama_sampler_reset ( struct llama_sampler * smpl); @ctypes_function( - "llama_grammar_accept_token", - [llama_grammar_p, llama_context_p_ctypes, llama_token], + "llama_sampler_reset", + [llama_sampler_p_ctypes], None, ) -def llama_grammar_accept_token( - grammar: llama_grammar_p, - ctx: llama_context_p, - token: Union[llama_token, int], - /, -): - """Accepts the sampled token into the grammar""" +def llama_sampler_reset(smpl: llama_sampler_p, /): ... -# // -# // Sampling functions -# // +# LLAMA_API struct llama_sampler * llama_sampler_clone (const struct llama_sampler * smpl); +@ctypes_function( + "llama_sampler_clone", + [llama_sampler_p_ctypes], + llama_sampler_p_ctypes, +) +def llama_sampler_clone(smpl: llama_sampler_p, /) -> llama_sampler_p: + ... -# // Sets the current rng seed. -# LLAMA_API void llama_set_rng_seed(struct llama_context * ctx, uint32_t seed); +# // important: do not free if the sampler has been added to a llama_sampler_chain (via llama_sampler_chain_add) +# LLAMA_API void llama_sampler_free ( struct llama_sampler * smpl); @ctypes_function( - "llama_set_rng_seed", - [llama_context_p_ctypes, ctypes.c_uint32], + "llama_sampler_free", + [llama_sampler_p_ctypes], None, ) -def llama_set_rng_seed(ctx: llama_context_p, seed: Union[ctypes.c_uint32, int], /): - """Sets the current rng seed.""" +def llama_sampler_free(smpl: llama_sampler_p, /): ... -# /// @details Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. -# /// @details Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. 
-# LLAMA_API void llama_sample_repetition_penalties( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# const llama_token * last_tokens, -# size_t penalty_last_n, -# float penalty_repeat, -# float penalty_freq, -# float penalty_present); -@ctypes_function( - "llama_sample_repetition_penalties", - [ - llama_context_p_ctypes, - llama_token_data_array_p, - llama_token_p, - ctypes.c_size_t, - ctypes.c_float, - ctypes.c_float, - ctypes.c_float, - ], - None, +# // llama_sampler_chain +# // a type of llama_sampler that can chain multiple samplers one after another +# +# LLAMA_API struct llama_sampler * llama_sampler_chain_init(struct llama_sampler_chain_params params); +@ctypes_function( + "llama_sampler_chain_init", + [llama_sampler_chain_params], + llama_sampler_p_ctypes, ) -def llama_sample_repetition_penalties( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - last_tokens_data: CtypesArray[llama_token], - penalty_last_n: Union[ctypes.c_size_t, int], - penalty_repeat: Union[ctypes.c_float, float], - penalty_freq: Union[ctypes.c_float, float], - penalty_present: Union[ctypes.c_float, float], - /, -): - """Repetition penalty described in CTRL academic paper https://arxiv.org/abs/1909.05858, with negative logit fix. - Frequency and presence penalties described in OpenAI API https://platform.openai.com/docs/api-reference/parameter-details. - """ +def llama_sampler_chain_init(params: llama_sampler_chain_params, /) -> llama_sampler_p: ... -# /// @details Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806 -# /// @param logits Logits extracted from the original generation context. -# /// @param logits_guidance Logits extracted from a separate context from the same model. Other than a negative prompt at the beginning, it should have all generated and user input tokens copied from the main context. -# /// @param scale Guidance strength. 1.0f means no guidance. Higher values mean stronger guidance. -# LLAMA_API void llama_sample_apply_guidance( -# struct llama_context * ctx, -# float * logits, -# float * logits_guidance, -# float scale); +# // important: takes ownership of the sampler object and will free it when llama_sampler_free is called +# LLAMA_API void llama_sampler_chain_add( struct llama_sampler * chain, struct llama_sampler * smpl); @ctypes_function( - "llama_sample_apply_guidance", - [ - llama_context_p_ctypes, - ctypes.POINTER(ctypes.c_float), - ctypes.POINTER(ctypes.c_float), - ctypes.c_float, - ], + "llama_sampler_chain_add", + [llama_sampler_p_ctypes, llama_sampler_p_ctypes], None, ) -def llama_sample_apply_guidance( - ctx: llama_context_p, - logits: CtypesArray[ctypes.c_float], - logits_guidance: CtypesArray[ctypes.c_float], - scale: Union[ctypes.c_float, float], - /, -): - """Apply classifier-free guidance to the logits as described in academic paper "Stay on topic with Classifier-Free Guidance" https://arxiv.org/abs/2306.17806""" +def llama_sampler_chain_add(chain: llama_sampler_p, smpl: llama_sampler_p, /): ... -# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. 
-# LLAMA_API void llama_sample_softmax( -# struct llama_context * ctx, -# llama_token_data_array * candidates); +# LLAMA_API struct llama_sampler * llama_sampler_chain_get(const struct llama_sampler * chain, int32_t i); @ctypes_function( - "llama_sample_softmax", - [llama_context_p_ctypes, llama_token_data_array_p], - None, + "llama_sampler_chain_get", + [llama_sampler_p_ctypes, ctypes.c_int32], + llama_sampler_p_ctypes, ) -def llama_sample_softmax( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - /, -): - """Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits.""" +def llama_sampler_chain_get( + chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / +) -> llama_sampler_p: ... -# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_k( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# int32_t k, -# size_t min_keep); +# LLAMA_API int llama_sampler_chain_n (const struct llama_sampler * chain); @ctypes_function( - "llama_sample_top_k", - [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_int32, ctypes.c_size_t], - None, + "llama_sampler_chain_n", + [llama_sampler_p_ctypes], + ctypes.c_int, ) -def llama_sample_top_k( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - k: Union[ctypes.c_int, int], - min_keep: Union[ctypes.c_size_t, int], - /, -): - """Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" +def llama_sampler_chain_n(chain: llama_sampler_p, /) -> int: ... -# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 -# LLAMA_API void llama_sample_top_p( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float p, -# size_t min_keep); +# // after removing a sampler, the chain will no longer own it, and it will not be freed when the chain is freed +# LLAMA_API struct llama_sampler * llama_sampler_chain_remove( struct llama_sampler * chain, int32_t i); @ctypes_function( - "llama_sample_top_p", - [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float, ctypes.c_size_t], - None, + "llama_sampler_chain_remove", + [llama_sampler_p_ctypes, ctypes.c_int32], + llama_sampler_p_ctypes, ) -def llama_sample_top_p( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - p: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], - /, -): - """Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751""" +def llama_sampler_chain_remove( + chain: llama_sampler_p, i: Union[ctypes.c_int32, int], / +) -> llama_sampler_p: ... -# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 -# LLAMA_API void llama_sample_min_p( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float p, -# size_t min_keep); +# // available samplers: +# +# LLAMA_API struct llama_sampler * llama_sampler_init_greedy(void); +@ctypes_function("llama_sampler_init_greedy", [], llama_sampler_p_ctypes) +def llama_sampler_init_greedy() -> llama_sampler_p: + ... 
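Editor's note: the chain helpers above also expose bookkeeping (`llama_sampler_chain_n`, `_get`, `_remove`) with explicit ownership rules. A hedged sketch of how they compose:

```python
# Hedged sketch of chain bookkeeping: count samplers, fetch one by index, and
# detach one (after removal the chain no longer owns it, so it must be freed separately).
from nexa.gguf.llama import llama_cpp  # assumed import path

chain = llama_cpp.llama_sampler_chain_init(llama_cpp.llama_sampler_chain_default_params())
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_top_k(40))
llama_cpp.llama_sampler_chain_add(chain, llama_cpp.llama_sampler_init_greedy())

print(llama_cpp.llama_sampler_chain_n(chain))          # -> 2
top_k = llama_cpp.llama_sampler_chain_get(chain, 0)    # borrowed; still owned by the chain

detached = llama_cpp.llama_sampler_chain_remove(chain, 1)
llama_cpp.llama_sampler_free(detached)                 # ours to free after removal
llama_cpp.llama_sampler_free(chain)                    # frees the remaining top_k sampler too
```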
+ + +# LLAMA_API struct llama_sampler * llama_sampler_init_dist (uint32_t seed); +@ctypes_function("llama_sampler_init_dist", [ctypes.c_uint32], llama_sampler_p_ctypes) +def llama_sampler_init_dist(seed: int) -> llama_sampler_p: + ... + + +# /// @details Sorts candidate tokens by their logits in descending order and calculate probabilities based on logits. +# /// NOTE: Avoid using on the full vocabulary as the sorting can become slow. For example, apply top-k or top-p sampling first. +# DEPRECATED(LLAMA_API struct llama_sampler * llama_sampler_init_softmax (void), +# "will be removed in the future (see https://github.com/ggerganov/llama.cpp/pull/9896#discussion_r1800920915)"); +@ctypes_function("llama_sampler_init_softmax", [], llama_sampler_p_ctypes) +def llama_sampler_init_softmax() -> llama_sampler_p: + ... + + +# /// @details Top-K sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API struct llama_sampler * llama_sampler_init_top_k (int32_t k); +@ctypes_function("llama_sampler_init_top_k", [ctypes.c_int32], llama_sampler_p_ctypes) +def llama_sampler_init_top_k(k: int) -> llama_sampler_p: + ... + + +# /// @details Nucleus sampling described in academic paper "The Curious Case of Neural Text Degeneration" https://arxiv.org/abs/1904.09751 +# LLAMA_API struct llama_sampler * llama_sampler_init_top_p (float p, size_t min_keep); @ctypes_function( - "llama_sample_min_p", - [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float, ctypes.c_size_t], - None, + "llama_sampler_init_top_p", + [ctypes.c_float, ctypes.c_size_t], + llama_sampler_p_ctypes, ) -def llama_sample_min_p( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - p: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], - /, -): - """Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841""" +def llama_sampler_init_top_p(p: float, min_keep: int) -> llama_sampler_p: ... -# /// @details Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/. -# LLAMA_API void llama_sample_tail_free( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float z, -# size_t min_keep); +# /// @details Minimum P sampling as described in https://github.com/ggerganov/llama.cpp/pull/3841 +# LLAMA_API struct llama_sampler * llama_sampler_init_min_p (float p, size_t min_keep); @ctypes_function( - "llama_sample_tail_free", - [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float, ctypes.c_size_t], - None, + "llama_sampler_init_min_p", + [ctypes.c_float, ctypes.c_size_t], + llama_sampler_p_ctypes, ) -def llama_sample_tail_free( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - z: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], - /, -): - """Tail Free Sampling described in https://www.trentonbricken.com/Tail-Free-Sampling/.""" +def llama_sampler_init_min_p(p: float, min_keep: int) -> llama_sampler_p: ... # /// @details Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666. 
-# LLAMA_API void llama_sample_typical( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float p, -# size_t min_keep); +# LLAMA_API struct llama_sampler * llama_sampler_init_typical (float p, size_t min_keep); @ctypes_function( - "llama_sample_typical", - [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float, ctypes.c_size_t], - None, + "llama_sampler_init_typical", + [ctypes.c_float, ctypes.c_size_t], + llama_sampler_p_ctypes, ) -def llama_sample_typical( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - p: Union[ctypes.c_float, float], - min_keep: Union[ctypes.c_size_t, int], - /, -): - """Locally Typical Sampling implementation described in the paper https://arxiv.org/abs/2202.00666.""" +def llama_sampler_init_typical(p: float, min_keep: int) -> llama_sampler_p: ... -# /// @details Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772. -# LLAMA_API void llama_sample_entropy( -# struct llama_context * ctx, -# llama_token_data_array * candidates_p, -# float min_temp, -# float max_temp, -# float exponent_val); +# LLAMA_API struct llama_sampler * llama_sampler_init_temp (float t); +@ctypes_function("llama_sampler_init_temp", [ctypes.c_float], llama_sampler_p_ctypes) +def llama_sampler_init_temp(t: float) -> llama_sampler_p: + ... + + +# /// @details Dynamic temperature implementation (a.k.a. entropy) described in the paper https://arxiv.org/abs/2309.02772. +# LLAMA_API struct llama_sampler * llama_sampler_init_temp_ext (float t, float delta, float exponent); @ctypes_function( - "llama_sample_entropy", - [ - llama_context_p_ctypes, - llama_token_data_array_p, - ctypes.c_float, - ctypes.c_float, - ctypes.c_float, - ], - None, + "llama_sampler_init_temp_ext", + [ctypes.c_float, ctypes.c_float, ctypes.c_float], + llama_sampler_p_ctypes, ) -def llama_sample_entropy( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - min_temp: Union[ctypes.c_float, float], - max_temp: Union[ctypes.c_float, float], - exponent_val: Union[ctypes.c_float, float], - /, -): - """Dynamic temperature implementation described in the paper https://arxiv.org/abs/2309.02772.""" +def llama_sampler_init_temp_ext( + t: float, delta: float, exponent: float +) -> llama_sampler_p: ... -# LLAMA_API void llama_sample_temp( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float temp); +# /// @details XTC sampler as described in https://github.com/oobabooga/text-generation-webui/pull/6335 +# LLAMA_API struct llama_sampler * llama_sampler_init_xtc (float p, float t, size_t min_keep, uint32_t seed); @ctypes_function( - "llama_sample_temp", - [llama_context_p_ctypes, llama_token_data_array_p, ctypes.c_float], - None, + "llama_sampler_init_xtc", + [ctypes.c_float, ctypes.c_float, ctypes.c_size_t, ctypes.c_uint32], + llama_sampler_p_ctypes, ) -def llama_sample_temp( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - temp: Union[ctypes.c_float, float], - /, -): - """Temperature sampling described in academic paper "Generating Long Sequences with Sparse Transformers" https://arxiv.org/abs/1904.10509 - - Parameters: - candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. 
- temp: The temperature value to use for the sampling. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - """ +def llama_sampler_init_xtc( + p: float, t: float, min_keep: int, seed: int, / +) -> llama_sampler_p: ... @@ -3370,45 +3239,20 @@ def llama_sample_temp( # /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # /// @param m The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. # /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float tau, -# float eta, -# int32_t m, -# float * mu); +# LLAMA_API struct llama_sampler * llama_sampler_init_mirostat( +# int32_t n_vocab, +# uint32_t seed, +# float tau, +# float eta, +# int32_t m); @ctypes_function( - "llama_sample_token_mirostat", - [ - llama_context_p_ctypes, - llama_token_data_array_p, - ctypes.c_float, - ctypes.c_float, - ctypes.c_int32, - ctypes.POINTER(ctypes.c_float), - ], - llama_token, + "llama_sampler_init_mirostat", + [ctypes.c_int32, ctypes.c_uint32, ctypes.c_float, ctypes.c_float, ctypes.c_int32], + llama_sampler_p_ctypes, ) -def llama_sample_token_mirostat( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - tau: Union[ctypes.c_float, float], - eta: Union[ctypes.c_float, float], - m: Union[ctypes.c_int, int], - mu: CtypesPointerOrRef[ctypes.c_float], - /, -) -> int: - """Mirostat 1.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. - - Parameters: - candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - m: The number of tokens considered in the estimation of `s_hat`. This is an arbitrary value that is used to calculate `s_hat`, which in turn helps to calculate the value of `k`. In the paper, they use `m = 100`, but you can experiment with different values to see how it affects the performance of the algorithm. - mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. 
- """ +def llama_sampler_init_mirostat( + n_vocab: int, seed: int, tau: float, eta: float, m: int, / +) -> llama_sampler_p: ... @@ -3417,82 +3261,189 @@ def llama_sample_token_mirostat( # /// @param tau The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. # /// @param eta The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. # /// @param mu Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. -# LLAMA_API llama_token llama_sample_token_mirostat_v2( -# struct llama_context * ctx, -# llama_token_data_array * candidates, -# float tau, -# float eta, -# float * mu); +# LLAMA_API struct llama_sampler * llama_sampler_init_mirostat_v2( +# uint32_t seed, +# float tau, +# float eta); @ctypes_function( - "llama_sample_token_mirostat_v2", + "llama_sampler_init_mirostat_v2", + [ctypes.c_uint32, ctypes.c_float, ctypes.c_float], + llama_sampler_p_ctypes, +) +def llama_sampler_init_mirostat_v2( + seed: int, tau: float, eta: float, / +) -> llama_sampler_p: + ... + + +# LLAMA_API struct llama_sampler * llama_sampler_init_grammar( +# const struct llama_model * model, +# const char * grammar_str, +# const char * grammar_root); +@ctypes_function( + "llama_sampler_init_grammar", + [llama_model_p_ctypes, ctypes.c_char_p, ctypes.c_char_p], + llama_sampler_p_ctypes, +) +def llama_sampler_init_grammar( + model: llama_model_p, grammar_str: bytes, grammar_root: bytes, / +) -> llama_sampler_p: + ... + + +# LLAMA_API struct llama_sampler * llama_sampler_init_penalties( +# int32_t n_vocab, // llama_n_vocab() +# llama_token special_eos_id, // llama_token_eos() +# llama_token linefeed_id, // llama_token_nl() +# int32_t penalty_last_n, // last n tokens to penalize (0 = disable penalty, -1 = context size) +# float penalty_repeat, // 1.0 = disabled +# float penalty_freq, // 0.0 = disabled +# float penalty_present, // 0.0 = disabled +# bool penalize_nl, // consider newlines as a repeatable token +# bool ignore_eos); // ignore the end-of-sequence token +@ctypes_function( + "llama_sampler_init_penalties", [ - llama_context_p_ctypes, - llama_token_data_array_p, + ctypes.c_int32, + llama_token, + llama_token, + ctypes.c_int32, ctypes.c_float, ctypes.c_float, - ctypes.POINTER(ctypes.c_float), + ctypes.c_float, + ctypes.c_bool, + ctypes.c_bool, ], - llama_token, -) -def llama_sample_token_mirostat_v2( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] + llama_sampler_p_ctypes, +) +def llama_sampler_init_penalties( + n_vocab: int, + special_eos_id: int, + linefeed_id: int, + penalty_last_n: int, + penalty_repeat: float, + penalty_freq: float, + penalty_present: float, + penalize_nl: bool, + ignore_eos: bool, + /, +) -> llama_sampler_p: + ... 
+ + +# /// @details DRY sampler, designed by p-e-w, as described in: https://github.com/oobabooga/text-generation-webui/pull/5677, porting Koboldcpp implementation authored by pi6am: https://github.com/LostRuins/koboldcpp/pull/982 +# LLAMA_API struct llama_sampler * llama_sampler_init_dry( +# const struct llama_model * model, +# float dry_multiplier, +# float dry_base, +# int32_t dry_allowed_length, +# int32_t dry_penalty_last_n, +# const char ** seq_breakers, +# size_t num_breakers); +@ctypes_function( + "llama_sampler_init_dry", + [ + llama_model_p_ctypes, + ctypes.c_float, + ctypes.c_float, + ctypes.c_int32, + ctypes.c_int32, + ctypes.POINTER(ctypes.c_char_p), + ctypes.c_size_t, ], - tau: Union[ctypes.c_float, float], - eta: Union[ctypes.c_float, float], - mu: CtypesPointerOrRef[ctypes.c_float], + llama_sampler_p_ctypes, +) +def llama_sampler_init_dry( + model: llama_model_p, + dry_multiplier: float, + dry_base: float, + dry_allowed_length: int, + dry_penalty_last_n: int, + seq_breakers: CtypesArray[bytes], + num_breakers: int, /, -) -> int: - """Mirostat 2.0 algorithm described in the paper https://arxiv.org/abs/2007.14966. Uses tokens instead of words. +) -> llama_sampler_p: + ... - Parameters: - candidates: A vector of `llama_token_data` containing the candidate tokens, their probabilities (p), and log-odds (logit) for the current position in the generated text. - tau: The target cross-entropy (or surprise) value you want to achieve for the generated text. A higher value corresponds to more surprising or less predictable text, while a lower value corresponds to less surprising or more predictable text. - eta: The learning rate used to update `mu` based on the error between the target and observed surprisal of the sampled word. A larger learning rate will cause `mu` to be updated more quickly, while a smaller learning rate will result in slower updates. - mu: Maximum cross-entropy. This value is initialized to be twice the target cross-entropy (`2 * tau`) and is updated in the algorithm based on the error between the target and observed surprisal. - """ + +# LLAMA_API struct llama_sampler * llama_sampler_init_logit_bias( +# int32_t n_vocab, +# int32_t n_logit_bias, +# const llama_logit_bias * logit_bias); +@ctypes_function( + "llama_sampler_init_logit_bias", + [ctypes.c_int32, ctypes.c_int32, llama_logit_bias_p], + llama_sampler_p_ctypes, +) +def llama_sampler_init_logit_bias( + n_vocab: int, n_logit_bias: int, logit_bias: CtypesArray[llama_logit_bias], / +) -> llama_sampler_p: ... -# /// @details Selects the token with the highest probability. -# /// Does not compute the token probabilities. Use llama_sample_softmax() instead. -# LLAMA_API llama_token llama_sample_token_greedy( -# struct llama_context * ctx, -# llama_token_data_array * candidates); +# // this sampler is meant to be used for fill-in-the-middle infilling +# // it's supposed to be used after top_k + top_p sampling +# // +# // 1. if the sum of the EOG probs times the number of candidates is higher than the sum of the other probs -> pick EOG +# // 2. combine probs of tokens that have the same prefix +# // +# // example: +# // +# // - before: +# // "hel": 0.5 +# // "hell": 0.2 +# // "hello": 0.1 +# // "dummy": 0.1 +# // +# // - after: +# // "hel": 0.8 +# // "dummy": 0.1 +# // +# // 3. discard non-EOG tokens with low prob +# // 4. 
if no tokens are left -> pick EOT +# // +# LLAMA_API struct llama_sampler * llama_sampler_init_infill(const struct llama_model * model); @ctypes_function( - "llama_sample_token_greedy", - [llama_context_p_ctypes, llama_token_data_array_p], - llama_token, + "llama_sampler_init_infill", + [llama_model_p_ctypes], + llama_sampler_p_ctypes, ) -def llama_sample_token_greedy( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - /, -) -> int: - """Selects the token with the highest probability.""" +def llama_sampler_init_infill(model: llama_model_p, /) -> llama_sampler_p: + """This sampler is meant to be used for fill-in-the-middle infilling. + """ ... -# /// @details Randomly selects a token from the candidates based on their probabilities using the RNG of ctx. -# LLAMA_API llama_token llama_sample_token( -# struct llama_context * ctx, -# llama_token_data_array * candidates); +# // Returns the seed used by the sampler if applicable, LLAMA_DEFAULT_SEED otherwise +# LLAMA_API uint32_t llama_sampler_get_seed(const struct llama_sampler * smpl); @ctypes_function( - "llama_sample_token", - [llama_context_p_ctypes, llama_token_data_array_p], + "llama_sampler_get_seed", + [llama_sampler_p_ctypes], + ctypes.c_uint32, +) +def llama_sampler_get_seed(smpl: llama_sampler_p, /) -> int: + ... + + +# /// @details Sample and accept a token from the idx-th output of the last evaluation +# // +# // Shorthand for: +# // const auto * logits = llama_get_logits_ith(ctx, idx); +# // llama_token_data_array cur_p = { ... init from logits ... }; +# // llama_sampler_apply(smpl, &cur_p); +# // auto token = cur_p.data[cur_p.selected].id; +# // llama_sampler_accept(smpl, token); +# // return token; +# // Returns the sampled token +# LLAMA_API llama_token llama_sampler_sample(struct llama_sampler * smpl, struct llama_context * ctx, int32_t idx); +@ctypes_function( + "llama_sampler_sample", + [llama_sampler_p_ctypes, llama_context_p_ctypes, ctypes.c_int32], llama_token, ) -def llama_sample_token( - ctx: llama_context_p, - candidates: Union[ - CtypesArray[llama_token_data_array], CtypesPointerOrRef[llama_token_data_array] - ], - /, +def llama_sampler_sample( + smpl: llama_sampler_p, ctx: llama_context_p, idx: int, / ) -> int: - """Randomly selects a token from the candidates based on their probabilities.""" ... @@ -3543,79 +3494,139 @@ def llama_split_prefix( ... -# Performance information +# // Print system information +# LLAMA_API const char * llama_print_system_info(void); +@ctypes_function("llama_print_system_info", [], ctypes.c_char_p) +def llama_print_system_info() -> bytes: + ... -# LLAMA_API struct llama_timings llama_get_timings(struct llama_context * ctx); +# // Set callback for all future logging events. +# // If this is not called, or NULL is supplied, everything is output on stderr. +# LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); @ctypes_function( - "llama_get_timings", + "llama_log_set", + [ctypes.c_void_p, ctypes.c_void_p], + None, +) +def llama_log_set( + log_callback: Optional[CtypesFuncPointer], + user_data: ctypes.c_void_p, + /, +): + """Set callback for all future logging events. + + If this is not called, or NULL is supplied, everything is output on stderr.""" + ... + + +# // +# // Performance utils +# // +# // NOTE: Used by llama.cpp examples, avoid using in third-party apps. Instead, do your own performance measurements. 
+# // + + +# struct llama_perf_context_data { +# double t_start_ms; +# double t_load_ms; +# double t_p_eval_ms; +# double t_eval_ms; +# +# int32_t n_p_eval; +# int32_t n_eval; +# }; +class llama_perf_context_data(ctypes.Structure): + _fields_ = [ + ("t_start_ms", ctypes.c_double), + ("t_load_ms", ctypes.c_double), + ("t_p_eval_ms", ctypes.c_double), + ("t_eval_ms", ctypes.c_double), + ("n_p_eval", ctypes.c_int32), + ("n_eval", ctypes.c_int32), + ] + + +# struct llama_perf_sampler_data { +# double t_sample_ms; +# +# int32_t n_sample; +# }; +class llama_perf_sampler_data(ctypes.Structure): + _fields_ = [ + ("t_sample_ms", ctypes.c_double), + ("n_sample", ctypes.c_int32), + ] + + +# LLAMA_API struct llama_perf_context_data llama_perf_context (const struct llama_context * ctx); +@ctypes_function( + "llama_perf_context", [llama_context_p_ctypes], - llama_timings, + llama_perf_context_data, ) -def llama_get_timings(ctx: llama_context_p, /) -> llama_timings: - """Get performance information""" +def llama_perf_context(ctx: llama_context_p, /) -> llama_perf_context_data: ... -# LLAMA_API void llama_print_timings(struct llama_context * ctx); +# LLAMA_API void llama_perf_context_print(const struct llama_context * ctx); @ctypes_function( - "llama_print_timings", + "llama_perf_context_print", [llama_context_p_ctypes], None, ) -def llama_print_timings(ctx: llama_context_p, /): - """Print performance information""" +def llama_perf_context_print(ctx: llama_context_p, /): ... -# LLAMA_API void llama_reset_timings(struct llama_context * ctx); +# LLAMA_API void llama_perf_context_reset( struct llama_context * ctx); @ctypes_function( - "llama_reset_timings", + "llama_perf_context_reset", [llama_context_p_ctypes], None, ) -def llama_reset_timings(ctx: llama_context_p, /): - """Reset performance information""" +def llama_perf_context_reset(ctx: llama_context_p, /): ... -# Print system information -# LLAMA_API const char * llama_print_system_info(void); +# // NOTE: the following work only with samplers constructed via llama_sampler_chain_init +# LLAMA_API struct llama_perf_sampler_data llama_perf_sampler (const struct llama_sampler * chain); @ctypes_function( - "llama_print_system_info", - [], - ctypes.c_char_p, + "llama_perf_sampler", + [llama_sampler_p_ctypes], + llama_perf_sampler_data, ) -def llama_print_system_info() -> bytes: - """Print system information""" +def llama_perf_sampler(chain: llama_sampler_p, /) -> llama_perf_sampler_data: ... -# NOTE: THIS IS CURRENTLY BROKEN AS ggml_log_callback IS NOT EXPOSED IN LLAMA.H -# // Set callback for all future logging events. -# // If this is not called, or NULL is supplied, everything is output on stderr. -# LLAMA_API void llama_log_set(ggml_log_callback log_callback, void * user_data); +# LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); @ctypes_function( - "llama_log_set", - [ctypes.c_void_p, ctypes.c_void_p], + "llama_perf_sampler_print", + [llama_sampler_p_ctypes], None, ) -def llama_log_set( - log_callback: Optional[CtypesFuncPointer], - user_data: ctypes.c_void_p, - /, -): - """Set callback for all future logging events. +def llama_perf_sampler_print(chain: llama_sampler_p, /): + ... - If this is not called, or NULL is supplied, everything is output on stderr.""" + +# LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); +@ctypes_function( + "llama_perf_sampler_reset", + [llama_sampler_p_ctypes], + None, +) +def llama_perf_sampler_reset(chain: llama_sampler_p, /): ... 
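A short sketch of how the new perf bindings might be queried after a few decode calls; `ctx` and `chain` are assumed to be existing `llama_context` and sampler-chain handles (not created here), and the field names come from the `llama_perf_context_data` / `llama_perf_sampler_data` structs defined above:

```python
# Sketch only: query and reset the new perf counters.
import nexa.gguf.llama.llama_cpp as llama_cpp

def report_perf(ctx, chain):
    ctx_data = llama_cpp.llama_perf_context(ctx)      # llama_perf_context_data
    smpl_data = llama_cpp.llama_perf_sampler(chain)   # chain must come from llama_sampler_chain_init
    print(f"prompt eval: {ctx_data.n_p_eval} tokens in {ctx_data.t_p_eval_ms:.2f} ms")
    print(f"eval:        {ctx_data.n_eval} tokens in {ctx_data.t_eval_ms:.2f} ms")
    print(f"sampling:    {smpl_data.n_sample} draws in {smpl_data.t_sample_ms:.2f} ms")
    llama_cpp.llama_perf_context_reset(ctx)
    llama_cpp.llama_perf_sampler_reset(chain)
```
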
-# LLAMA_API void llama_dump_timing_info_yaml(FILE * stream, const struct llama_context * ctx); +# LLAMA_API void llama_perf_dump_yaml(FILE * stream, const struct llama_context * ctx); @ctypes_function( - "llama_dump_timing_info_yaml", - [ctypes.c_void_p, llama_context_p_ctypes], + "llama_perf_dump_yaml", + [ctypes.POINTER(ctypes.c_void_p), llama_context_p_ctypes], None, ) -def llama_dump_timing_info_yaml(stream: ctypes.c_void_p, ctx: llama_context_p, /): - ... \ No newline at end of file +def llama_perf_dump_yaml( + stream: ctypes.POINTER(ctypes.c_void_p), ctx: llama_context_p, / +): + ... diff --git a/nexa/gguf/llama/llama_grammar.py b/nexa/gguf/llama/llama_grammar.py index 2fc20d05..b95c77ab 100644 --- a/nexa/gguf/llama/llama_grammar.py +++ b/nexa/gguf/llama/llama_grammar.py @@ -2,11 +2,6 @@ # flake8: noqa from pathlib import Path -import sys -import ctypes -import enum -import typing -import dataclasses from itertools import groupby from typing import ( @@ -18,883 +13,18 @@ Union, ) -import nexa.gguf.llama.llama_cpp as llama_cpp - -class GrammarElementType(enum.IntEnum): - END = llama_cpp.LLAMA_GRETYPE_END - ALT = llama_cpp.LLAMA_GRETYPE_ALT - RULE_REF = llama_cpp.LLAMA_GRETYPE_RULE_REF - CHAR = llama_cpp.LLAMA_GRETYPE_CHAR - CHAR_NOT = llama_cpp.LLAMA_GRETYPE_CHAR_NOT - CHAR_RNG_UPPER = llama_cpp.LLAMA_GRETYPE_CHAR_RNG_UPPER - CHAR_ALT = llama_cpp.LLAMA_GRETYPE_CHAR_ALT - CHAR_ANY = llama_cpp.LLAMA_GRETYPE_CHAR_ANY - - -@dataclasses.dataclass -class GrammarElement: - type: GrammarElementType - value: int - - -@dataclasses.dataclass -class ParseState: - symbol_ids: typing.Dict[str, int] = dataclasses.field(default_factory=dict) - rules: typing.List[typing.List[GrammarElement]] = dataclasses.field(default_factory=list) - - -# static std::pair decode_utf8(const char * src) { -# static const int lookup[] = { 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4 }; -# uint8_t first_byte = static_cast(*src); -# uint8_t highbits = first_byte >> 4; -# int len = lookup[highbits]; -# uint8_t mask = (1 << (8 - len)) - 1; -# uint32_t value = first_byte & mask; -# const char * end = src + len; // may overrun! 
-# const char * pos = src + 1; -# for ( ; pos < end && *pos; pos++) { -# value = (value << 6) + (static_cast(*pos) & 0x3F); -# } -# return std::make_pair(value, pos); -# } -def decode_utf8(src: str) -> typing.Tuple[int, str]: - lookup: list[int] = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 3, 4] - first_byte: int = ord(src[0]) - highbits: int = first_byte >> 4 - length: int = lookup[highbits] - mask: int = (1 << (8 - length)) - 1 - value: int = first_byte & mask - end: int = min(len(src), length) # Prevent overrun - - pos: int = 1 - for pos in range(1, end): - if not src[pos]: - break - value = (value << 6) + (ord(src[pos]) & 0x3F) - - return value, src[pos:] if pos < len(src) else "" - - -# static uint32_t get_symbol_id(parse_state & state, const char * src, size_t len) { -# uint32_t next_id = static_cast(state.symbol_ids.size()); -# auto result = state.symbol_ids.emplace(std::string(src, len), next_id); -# return result.first->second; -# } -def get_symbol_id(state: ParseState, name: str) -> int: - next_id = len(state.symbol_ids) - return state.symbol_ids.setdefault(name, next_id) - - -# static uint32_t generate_symbol_id(parse_state & state, const std::string & base_name) { -# uint32_t next_id = static_cast(state.symbol_ids.size()); -# state.symbol_ids[base_name + '_' + std::to_string(next_id)] = next_id; -# return next_id; -# } -def generate_symbol_id(state: ParseState, base_name: str) -> int: - next_id = len(state.symbol_ids) - state.symbol_ids[f"{base_name}_{next_id}"] = next_id - return next_id - - -# static void add_rule( -# parse_state & state, -# uint32_t rule_id, -# const std::vector & rule) { -# if (state.rules.size() <= rule_id) { -# state.rules.resize(rule_id + 1); -# } -# state.rules[rule_id] = rule; -# } -def add_rule(state: ParseState, rule_id: int, rule: typing.List[GrammarElement]) -> None: - if len(state.rules) <= rule_id: - state.rules.extend([[]] * (rule_id + 1 - len(state.rules))) - state.rules[rule_id] = rule - - -# static bool is_digit_char(char c) { -# return '0' <= c && c <= '9'; -# } -def is_digit_char(c: str) -> bool: - return "0" <= c <= "9" - - -# static bool is_word_char(char c) { -# return ('a' <= c && c <= 'z') || ('A' <= c && c <= 'Z') || c == '-' || is_digit_char(c); -# } -def is_word_char(c: str) -> bool: - return ("a" <= c <= "z") or ("A" <= c <= "Z") or c == "-" or is_digit_char(c) - - -# static std::pair parse_hex(const char * src, int size) { -# const char * pos = src; -# const char * end = src + size; -# uint32_t value = 0; -# for ( ; pos < end && *pos; pos++) { -# value <<= 4; -# char c = *pos; -# if ('a' <= c && c <= 'f') { -# value += c - 'a' + 10; -# } else if ('A' <= c && c <= 'F') { -# value += c - 'A' + 10; -# } else if ('0' <= c && c <= '9') { -# value += c - '0'; -# } else { -# break; -# } -# } -# if (pos != end) { -# throw std::runtime_error("expecting " + std::to_string(size) + " hex chars at " + src); -# } -# return std::make_pair(value, pos); -# } -def parse_hex(src: str, size: int) -> typing.Tuple[int, str]: - pos = 0 - value = 0 - for _ in range(size): - value <<= 4 - c = src[pos] - if "a" <= c <= "f": - value += ord(c) - ord("a") + 10 - elif "A" <= c <= "F": - value += ord(c) - ord("A") + 10 - elif "0" <= c <= "9": - value += ord(c) - ord("0") - else: - break - pos += 1 - if pos != size: - raise ValueError(f"expecting {size} hex chars at {src}") - return value, src[pos:] - - -# static const char * parse_space(const char * src, bool newline_ok) { -# const char * pos = src; -# while (*pos == ' ' || *pos == '\t' || *pos == '#' || -# 
(newline_ok && (*pos == '\r' || *pos == '\n'))) { -# if (*pos == '#') { -# while (*pos && *pos != '\r' && *pos != '\n') { -# pos++; -# } -# } else { -# pos++; -# } -# } -# return pos; -# } -def parse_space(src: str, newline_ok: bool) -> str: - pos = src - while pos and (pos[0] in (' ', '\t', '#') or (newline_ok and pos[0] in ('\r', '\n'))): - if pos[0] == "#": - while pos and pos[0] not in ("\r", "\n"): - pos = pos[1:] - else: - pos = pos[1:] - return pos - - -# static const char * parse_name(const char * src) { -# const char * pos = src; -# while (is_word_char(*pos)) { -# pos++; -# } -# if (pos == src) { -# throw std::runtime_error(std::string("expecting name at ") + src); -# } -# return pos; -# } -def parse_name(src: str) -> typing.Tuple[str, str]: - pos = src - while pos and is_word_char(pos[0]): - pos = pos[1:] - if pos == src: - raise ValueError(f"expecting name at {src}") - return src[:len(src) - len(pos)], pos - -# static const char * parse_int(const char * src) { -# const char * pos = src; -# while (is_digit_char(*pos)) { -# pos++; -# } -# if (pos == src) { -# throw std::runtime_error(std::string("expecting integer at ") + src); -# } -# return pos; -# } -def parse_int(src: str) -> typing.Tuple[int, str]: - pos = src - while pos and is_digit_char(pos[0]): - pos = pos[1:] - if pos == src: - raise ValueError(f"expecting integer at {src}") - return int(src[:len(src) - len(pos)]), pos - - -# static std::pair parse_char(const char * src) { -# if (*src == '\\') { -# switch (src[1]) { -# case 'x': return parse_hex(src + 2, 2); -# case 'u': return parse_hex(src + 2, 4); -# case 'U': return parse_hex(src + 2, 8); -# case 't': return std::make_pair('\t', src + 2); -# case 'r': return std::make_pair('\r', src + 2); -# case 'n': return std::make_pair('\n', src + 2); -# case '\\': -# case '"': -# case '[': -# case ']': -# return std::make_pair(src[1], src + 2); -# default: -# throw std::runtime_error(std::string("unknown escape at ") + src); -# } -# } else if (*src) { -# return decode_utf8(src); -# } -# throw std::runtime_error("unexpected end of input"); -# } -def parse_char(src: str) -> typing.Tuple[int, str]: - if not src: - raise ValueError("unexpected end of input") - if src[0] == "\\": - if src[1] == "x": - return parse_hex(src[2:], 2) - elif src[1] == "u": - return parse_hex(src[2:], 4) - elif src[1] == "U": - return parse_hex(src[2:], 8) - elif src[1] == "t": - return ord("\t"), src[2:] - elif src[1] == "r": - return ord("\r"), src[2:] - elif src[1] == "n": - return ord("\n"), src[2:] - elif src[1] in ('\\', '"', '[', ']'): - return ord(src[1]), src[2:] - else: - raise ValueError(f"unknown escape at {src}") - return decode_utf8(src) - -# static const char * parse_sequence( -# parse_state & state, -# const char * src, -# const std::string & rule_name, -# std::vector & out_elements, -# bool is_nested) { -# size_t last_sym_start = out_elements.size(); -# const char * pos = src; -# -# auto handle_repetitions = [&](int min_times, int max_times) { -# -# if (last_sym_start == out_elements.size()) { -# throw std::runtime_error(std::string("expecting preceding item to */+/?/{ at ") + pos); -# } -# -# // apply transformation to previous symbol (last_sym_start to end) according to -# // the following rewrite rules: -# // S{m,n} --> S S S (m times) S'(n-m) -# // S'(x) ::= S S'(x-1) | -# // (... n-m definitions of these S' rules ...) 
-# // S'(1) ::= S | -# // S{m,} --> S S S (m times) S' -# // S' ::= S S' | -# // S* --> S{0,} -# // --> S' ::= S S' | -# // S+ --> S{1,} -# // --> S S' -# // S' ::= S S' | -# // S? --> S{0,1} -# // --> S' -# // S' ::= S | -# -# std::vector previous_elements(out_elements.begin() + last_sym_start, out_elements.end()); -# if (min_times == 0) { -# out_elements.resize(last_sym_start); -# } else { -# // Repeat the previous elements (min_times - 1) times -# for (int i = 1; i < min_times; i++) { -# out_elements.insert(out_elements.end(), previous_elements.begin(), previous_elements.end()); -# } -# } -# -# uint32_t last_rec_rule_id = 0; -# auto n_opt = max_times < 0 ? 1 : max_times - min_times; -# -# std::vector rec_rule(previous_elements); -# for (int i = 0; i < n_opt; i++) { -# rec_rule.resize(previous_elements.size()); -# uint32_t rec_rule_id = generate_symbol_id(state, rule_name); -# if (i > 0 || max_times < 0) { -# rec_rule.push_back({LLAMA_GRETYPE_RULE_REF, max_times < 0 ? rec_rule_id : last_rec_rule_id}); -# } -# rec_rule.push_back({LLAMA_GRETYPE_ALT, 0}); -# rec_rule.push_back({LLAMA_GRETYPE_END, 0}); -# add_rule(state, rec_rule_id, rec_rule); -# last_rec_rule_id = rec_rule_id; -# } -# if (n_opt > 0) { -# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, last_rec_rule_id}); -# } -# }; -# -# while (*pos) { -# if (*pos == '"') { // literal string -# pos++; -# last_sym_start = out_elements.size(); -# while (*pos != '"') { -# if (!*pos) { -# throw std::runtime_error("unexpected end of input"); -# } -# auto char_pair = parse_char(pos); -# pos = char_pair.second; -# out_elements.push_back({LLAMA_GRETYPE_CHAR, char_pair.first}); -# } -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == '[') { // char range(s) -# pos++; -# enum llama_gretype start_type = LLAMA_GRETYPE_CHAR; -# if (*pos == '^') { -# pos++; -# start_type = LLAMA_GRETYPE_CHAR_NOT; -# } -# last_sym_start = out_elements.size(); -# while (*pos != ']') { -# if (!*pos) { -# throw std::runtime_error("unexpected end of input"); -# } -# auto char_pair = parse_char(pos); -# pos = char_pair.second; -# enum llama_gretype type = last_sym_start < out_elements.size() -# ? 
LLAMA_GRETYPE_CHAR_ALT -# : start_type; -# -# out_elements.push_back({type, char_pair.first}); -# if (pos[0] == '-' && pos[1] != ']') { -# if (!pos[1]) { -# throw std::runtime_error("unexpected end of input"); -# } -# auto endchar_pair = parse_char(pos + 1); -# pos = endchar_pair.second; -# out_elements.push_back({LLAMA_GRETYPE_CHAR_RNG_UPPER, endchar_pair.first}); -# } -# } -# pos = parse_space(pos + 1, is_nested); -# } else if (is_word_char(*pos)) { // rule reference -# const char * name_end = parse_name(pos); -# uint32_t ref_rule_id = get_symbol_id(state, pos, name_end - pos); -# pos = parse_space(name_end, is_nested); -# last_sym_start = out_elements.size(); -# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, ref_rule_id}); -# } else if (*pos == '(') { // grouping -# // parse nested alternates into synthesized rule -# pos = parse_space(pos + 1, true); -# uint32_t sub_rule_id = generate_symbol_id(state, rule_name); -# pos = parse_alternates(state, pos, rule_name, sub_rule_id, true); -# last_sym_start = out_elements.size(); -# // output reference to synthesized rule -# out_elements.push_back({LLAMA_GRETYPE_RULE_REF, sub_rule_id}); -# if (*pos != ')') { -# throw std::runtime_error(std::string("expecting ')' at ") + pos); -# } -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == '.') { // any char -# last_sym_start = out_elements.size(); -# out_elements.push_back({LLAMA_GRETYPE_CHAR_ANY, 0}); -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == '*') { -# pos = parse_space(pos + 1, is_nested); -# handle_repetitions(0, -1); -# } else if (*pos == '+') { -# pos = parse_space(pos + 1, is_nested); -# handle_repetitions(1, -1); -# } else if (*pos == '?') { -# pos = parse_space(pos + 1, is_nested); -# handle_repetitions(0, 1); -# } else if (*pos == '{') { -# pos = parse_space(pos + 1, is_nested); -# -# if (!is_digit_char(*pos)) { -# throw std::runtime_error(std::string("expecting an int at ") + pos); -# } -# const char * int_end = parse_int(pos); -# int min_times = std::stoul(std::string(pos, int_end - pos)); -# pos = parse_space(int_end, is_nested); -# -# int max_times = -1; -# -# if (*pos == '}') { -# max_times = min_times; -# pos = parse_space(pos + 1, is_nested); -# } else if (*pos == ',') { -# pos = parse_space(pos + 1, is_nested); -# -# if (is_digit_char(*pos)) { -# const char * int_end = parse_int(pos); -# max_times = std::stoul(std::string(pos, int_end - pos)); -# pos = parse_space(int_end, is_nested); -# } -# -# if (*pos != '}') { -# throw std::runtime_error(std::string("expecting '}' at ") + pos); -# } -# pos = parse_space(pos + 1, is_nested); -# } else { -# throw std::runtime_error(std::string("expecting ',' at ") + pos); -# } -# handle_repetitions(min_times, max_times); -# } else { -# break; -# } -# } -# return pos; -# } -def parse_sequence(state: ParseState, src: str, rule_name: str, out_elements: typing.List[GrammarElement], is_nested: bool) -> str: - last_sym_start = len(out_elements) - pos = src - - def handle_repetitions(min_times: int, max_times: int) -> None: - nonlocal state, src, rule_name, out_elements, is_nested, last_sym_start, pos - - if last_sym_start == len(out_elements): - raise ValueError(f"expecting preceding item to */+/?/{{ at {pos}") - - previous_elements = out_elements[last_sym_start:] - if min_times == 0: - del out_elements[last_sym_start:] - else: - for i in range(1, min_times): - out_elements.extend(previous_elements) - - last_rec_rule_id = 0 - n_opt = 1 if max_times < 0 else max_times - min_times - - rec_rule = previous_elements[:] - for i 
in range(n_opt): - rec_rule = rec_rule[:len(previous_elements)] - rec_rule_id = generate_symbol_id(state, rule_name) - if i > 0 or max_times < 0: - rec_rule.append(GrammarElement(GrammarElementType.RULE_REF, rec_rule_id if max_times < 0 else last_rec_rule_id)) - rec_rule.append(GrammarElement(GrammarElementType.ALT, 0)) - rec_rule.append(GrammarElement(GrammarElementType.END, 0)) - add_rule(state, rec_rule_id, rec_rule) - last_rec_rule_id = rec_rule_id - if n_opt > 0: - out_elements.append(GrammarElement(GrammarElementType.RULE_REF, last_rec_rule_id)) - - while pos: - if pos[0] == '"': - pos = pos[1:] - last_sym_start = len(out_elements) - while not pos.startswith('"'): - if not pos: - raise ValueError("unexpected end of input") - char, pos = parse_char(pos) - out_elements.append(GrammarElement(GrammarElementType.CHAR, char)) - pos = parse_space(pos[1:], is_nested) - elif pos[0] == "[": - pos = pos[1:] - start_type = GrammarElementType.CHAR - if pos[0] == "^": - pos = pos[1:] - start_type = GrammarElementType.CHAR_NOT - last_sym_start = len(out_elements) - while pos[0] != "]": - if not pos: - raise ValueError("unexpected end of input") - char, pos = parse_char(pos) - type = GrammarElementType.CHAR_ALT if last_sym_start < len(out_elements) else start_type - out_elements.append(GrammarElement(type, char)) - if pos[0] == "-" and pos[1] != "]": - if not pos[1]: - raise ValueError("unexpected end of input") - endchar, pos = parse_char(pos[1:]) - out_elements.append(GrammarElement(GrammarElementType.CHAR_RNG_UPPER, endchar)) - pos = parse_space(pos[1:], is_nested) - elif pos and is_word_char(pos[0]): - name, rest = parse_name(pos) - ref_rule_id = get_symbol_id(state, name) - pos = parse_space(rest, is_nested) - last_sym_start = len(out_elements) - out_elements.append(GrammarElement(GrammarElementType.RULE_REF, ref_rule_id)) - elif pos.startswith("("): - pos = parse_space(pos[1:], newline_ok=True) - sub_rule_id = generate_symbol_id(state, rule_name) - pos = parse_alternates(state, pos, rule_name, sub_rule_id, is_nested=True) - last_sym_start = len(out_elements) - out_elements.append(GrammarElement(GrammarElementType.RULE_REF, sub_rule_id)) - if pos[0] != ")": - raise ValueError(f"expecting ')' at {pos}") - pos = parse_space(pos[1:], is_nested) - elif pos.startswith("."): - last_sym_start = len(out_elements) - out_elements.append(GrammarElement(GrammarElementType.CHAR_ANY, 0)) - pos = parse_space(pos[1:], is_nested) - elif pos.startswith("*"): - pos = parse_space(pos[1:], is_nested) - handle_repetitions(0, -1) - elif pos.startswith("+"): - pos = parse_space(pos[1:], is_nested) - handle_repetitions(1, -1) - elif pos.startswith("?"): - pos = parse_space(pos[1:], is_nested) - handle_repetitions(0, 1) - elif pos.startswith("{"): - pos = parse_space(pos[1:], is_nested) - - if not is_digit_char(pos): - raise ValueError(f"expecting an int at {pos}") - min_times, pos = parse_int(pos) - pos = parse_space(pos, is_nested) - - max_times = -1 - - if pos[0] == "}": - max_times = min_times - pos = parse_space(pos[1:], is_nested) - elif pos[0] == ",": - pos = parse_space(pos[1:], is_nested) - - if is_digit_char(pos): - max_times, pos = parse_int(pos) - pos = parse_space(pos, is_nested) - - if pos[0] != "}": - raise ValueError("expecting '}' at {}".format(pos)) - - pos = parse_space(pos[1:], is_nested) - else: - raise ValueError(f"expecting ',' at {pos}") - handle_repetitions(min_times, max_times) - else: - break - return pos - - -# const char * parse_alternates( -# parse_state & state, -# const char * src, -# 
const std::string & rule_name, -# uint32_t rule_id, -# bool is_nested) { -# std::vector rule; -# const char * pos = parse_sequence(state, src, rule_name, rule, is_nested); -# while (*pos == '|') { -# rule.push_back({LLAMA_GRETYPE_ALT, 0}); -# pos = parse_space(pos + 1, true); -# pos = parse_sequence(state, pos, rule_name, rule, is_nested); -# } -# rule.push_back({LLAMA_GRETYPE_END, 0}); -# add_rule(state, rule_id, rule); -# return pos; -# } -def parse_alternates(state: ParseState, src: str, rule_name: str, rule_id: int, is_nested: bool) -> str: - rule = [] - pos = parse_sequence(state, src, rule_name, rule, is_nested) - while pos.startswith("|"): - rule.append(GrammarElement(GrammarElementType.ALT, 0)) - pos = parse_space(pos[1:], newline_ok=True) - pos = parse_sequence(state, pos, rule_name, rule, is_nested) - rule.append(GrammarElement(GrammarElementType.END, 0)) - add_rule(state, rule_id, rule) - return pos - - -# static const char * parse_rule(parse_state & state, const char * src) { -# const char * name_end = parse_name(src); -# const char * pos = parse_space(name_end, false); -# size_t name_len = name_end - src; -# uint32_t rule_id = get_symbol_id(state, src, name_len); -# const std::string name(src, name_len); -# -# if (!(pos[0] == ':' && pos[1] == ':' && pos[2] == '=')) { -# throw std::runtime_error(std::string("expecting ::= at ") + pos); -# } -# pos = parse_space(pos + 3, true); -# -# pos = parse_alternates(state, pos, name, rule_id, false); -# -# if (*pos == '\r') { -# pos += pos[1] == '\n' ? 2 : 1; -# } else if (*pos == '\n') { -# pos++; -# } else if (*pos) { -# throw std::runtime_error(std::string("expecting newline or end at ") + pos); -# } -# return parse_space(pos, true); -# } -def parse_rule(state: ParseState, src: str) -> str: - pos = src - name, pos = parse_name(pos) - pos = parse_space(pos, newline_ok=False) - rule_id = get_symbol_id(state, name) - - if not pos.startswith("::="): - raise ValueError(f"expecting ::= at {pos}") - - pos = parse_space(pos[3:], newline_ok=True) - - pos = parse_alternates(state, pos, name, rule_id, is_nested=False) - - if pos.startswith("\r"): - pos = pos[2:] if pos[1] == "\n" else pos[1:] - elif pos.startswith("\n"): - pos = pos[1:] - elif pos: - raise ValueError(f"expecting newline or end at {pos}") - return parse_space(pos, newline_ok=True) - - -# parse_state parse(const char * src) { -# try { -# parse_state state; -# const char * pos = parse_space(src, true); -# while (*pos) { -# pos = parse_rule(state, pos); -# } -# // Validate the state to ensure that all rules are defined -# for (const auto & rule : state.rules) { -# for (const auto & elem : rule) { -# if (elem.type == LLAMA_GRETYPE_RULE_REF) { -# // Ensure that the rule at that location exists -# if (elem.value >= state.rules.size() || state.rules[elem.value].empty()) { -# // Get the name of the rule that is missing -# for (const auto & kv : state.symbol_ids) { -# if (kv.second == elem.value) { -# throw std::runtime_error("Undefined rule identifier '" + kv.first + "'"); -# } -# } -# } -# } -# } -# } -# return state; -# } catch (const std::exception & err) { -# fprintf(stderr, "%s: error parsing grammar: %s\n", __func__, err.what()); -# return parse_state(); -# } -# } -def parse(src: str) -> ParseState: - state = ParseState() - pos = src - pos = parse_space(pos, newline_ok=True) - while pos: - pos = parse_rule(state, pos) - # validate - for rule in state.rules: - for elem in rule: - if elem.type == GrammarElementType.RULE_REF: - if elem.value >= len(state.rules) or not 
state.rules[elem.value]: - for k, v in state.symbol_ids.items(): - if v == elem.value: - raise ValueError(f"Undefined rule identifier '{k}'") - return state - - -# static bool is_char_element(llama_grammar_element elem) { -# switch (elem.type) { -# case LLAMA_GRETYPE_CHAR: return true; -# case LLAMA_GRETYPE_CHAR_NOT: return true; -# case LLAMA_GRETYPE_CHAR_ALT: return true; -# case LLAMA_GRETYPE_CHAR_RNG_UPPER: return true; -# case LLAMA_GRETYPE_CHAR_ANY: return true; -# default: return false; -# } -# } -def is_char_element(elem: GrammarElement) -> bool: - return elem.type in ( - GrammarElementType.CHAR, - GrammarElementType.CHAR_NOT, - GrammarElementType.CHAR_ALT, - GrammarElementType.CHAR_RNG_UPPER, - GrammarElementType.CHAR_ANY - ) - - -def print_grammar_char(file: typing.TextIO, c: int) -> None: - if 0x20 <= c <= 0x7f: - print(chr(c), end="", file=file) - else: - print(f"", end="", file=file) - - -# static void print_rule( -# FILE * file, -# uint32_t rule_id, -# const std::vector & rule, -# const std::map & symbol_id_names) { -# if (rule.empty() || rule.back().type != LLAMA_GRETYPE_END) { -# throw std::runtime_error( -# "malformed rule, does not end with LLAMA_GRETYPE_END: " + std::to_string(rule_id)); -# } -# fprintf(file, "%s ::= ", symbol_id_names.at(rule_id).c_str()); -# for (size_t i = 0, end = rule.size() - 1; i < end; i++) { -# llama_grammar_element elem = rule[i]; -# switch (elem.type) { -# case LLAMA_GRETYPE_END: -# throw std::runtime_error( -# "unexpected end of rule: " + std::to_string(rule_id) + "," + -# std::to_string(i)); -# case LLAMA_GRETYPE_ALT: -# fprintf(file, "| "); -# break; -# case LLAMA_GRETYPE_RULE_REF: -# fprintf(file, "%s ", symbol_id_names.at(elem.value).c_str()); -# break; -# case LLAMA_GRETYPE_CHAR: -# fprintf(file, "["); -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_NOT: -# fprintf(file, "[^"); -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_RNG_UPPER: -# if (i == 0 || !is_char_element(rule[i - 1])) { -# throw std::runtime_error( -# "LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: " + -# std::to_string(rule_id) + "," + std::to_string(i)); -# } -# fprintf(file, "-"); -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_ALT: -# if (i == 0 || !is_char_element(rule[i - 1])) { -# throw std::runtime_error( -# "LLAMA_GRETYPE_CHAR_ALT without preceding char: " + -# std::to_string(rule_id) + "," + std::to_string(i)); -# } -# print_grammar_char(file, elem.value); -# break; -# case LLAMA_GRETYPE_CHAR_ANY: -# fprintf(file, "."); -# break; -# } -# if (is_char_element(elem)) { -# switch (rule[i + 1].type) { -# case LLAMA_GRETYPE_CHAR_ALT: -# case LLAMA_GRETYPE_CHAR_RNG_UPPER: -# case LLAMA_GRETYPE_CHAR_ANY: -# break; -# default: -# fprintf(file, "] "); -# } -# } -# } -# fprintf(file, "\n"); -# } -def print_rule( - file: typing.TextIO, - rule_id: int, - rule: typing.List[GrammarElement], - symbol_id_names: typing.Dict[int, str], -) -> None: - if not rule or rule[-1].type != GrammarElementType.END: - raise ValueError(f"malformed rule, does not end with LLAMA_GRETYPE_END: {rule_id}") - - print(f"{symbol_id_names[rule_id]} ::=", end=" ", file=file) - - for i, elem in enumerate(rule[:-1]): - if elem.type == GrammarElementType.END: - raise ValueError(f"unexpected end of rule: {rule_id}, {i}") - if elem.type == GrammarElementType.ALT: - print("| ", end="", file=file) - elif elem.type == GrammarElementType.RULE_REF: - print(f"{symbol_id_names[elem.value]} ", end="", file=file) - elif 
elem.type == GrammarElementType.CHAR: - print("[", end="", file=file) - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_NOT: - print("[^", end="", file=file) - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_RNG_UPPER: - if i == 0 or not is_char_element(rule[i - 1]): - raise ValueError(f"LLAMA_GRETYPE_CHAR_RNG_UPPER without preceding char: {rule_id}, {i}") - print(f"-", end="", file=file) - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_ALT: - if i == 0 or not is_char_element(rule[i - 1]): - raise ValueError(f"LLAMA_GRETYPE_CHAR_ALT without preceding char: {rule_id}, {i}") - print_grammar_char(file, elem.value) - elif elem.type == GrammarElementType.CHAR_ANY: - print(".", end="", file=file) - if is_char_element(elem): - if rule[i + 1].type in (GrammarElementType.CHAR_ALT, GrammarElementType.CHAR_RNG_UPPER, GrammarElementType.CHAR_ANY): - continue - print("] ", end="", file=file) - print(file=file) - - -def print_grammar(file: typing.TextIO, state: ParseState) -> None: - try: - symbol_id_names = {v: k for k, v in state.symbol_ids.items()} - for i, rule in enumerate(state.rules): - print_rule(file, i, rule, symbol_id_names) - except Exception as err: - print(f"\nerror printing grammar: {err}", file=file) - raise err +LLAMA_GRAMMAR_DEFAULT_ROOT = "root" class LlamaGrammar: - def __init__(self, parse_state: ParseState): - self.parse_state = parse_state - - self._grammar_rules = parse_state.rules - self._n_rules = len(self._grammar_rules) - self._start_rule_index = parse_state.symbol_ids["root"] - - self._element_lists = [ - [ - llama_cpp.llama_grammar_element(ctypes.c_int(elem.type), ctypes.c_uint32(elem.value)) - for elem in subvector - ] - for subvector in self._grammar_rules - ] - - # Step 2: Convert each list to llama_grammar_element array and get pointer - self._element_arrays = [ - (llama_cpp.llama_grammar_element * len(sublist))(*sublist) - for sublist in self._element_lists - ] - - # Step 3: Get pointer of each array - self._element_array_pointers = [ - ctypes.cast(subarray, llama_cpp.llama_grammar_element_p) for subarray in self._element_arrays - ] - - # Step 4: Make array of these pointers and get its pointer - self._rules = (llama_cpp.llama_grammar_element_p * len(self._element_array_pointers))( - *self._element_array_pointers - ) - - self.grammar = None - self._init_grammar() - - - def _init_grammar(self): - grammar = llama_cpp.llama_grammar_init( - self._rules, ctypes.c_size_t(self._n_rules), ctypes.c_size_t(self._start_rule_index) - ) - - if grammar is None: - raise ValueError("Failed to create grammar") - - self.grammar = grammar - - def __del__(self): - if self.grammar is not None: - llama_cpp.llama_grammar_free(self.grammar) - self.grammar = None - - def reset(self): - if self.grammar is not None: - llama_cpp.llama_grammar_free(self.grammar) - self._init_grammar() + def __init__(self, *args, _grammar: str, **kwargs): + self._grammar = _grammar + self._root = LLAMA_GRAMMAR_DEFAULT_ROOT @classmethod def from_string(cls, grammar: str, verbose: bool = True) -> "LlamaGrammar": - parsed_grammar = parse(grammar) - if verbose: - print_grammar(file=sys.stdout, state=parsed_grammar) - return cls(parsed_grammar) - + return cls(_grammar=grammar) + @classmethod def from_file(cls, file: Union[str, Path], verbose: bool = True) -> "LlamaGrammar": try: @@ -1820,4 +950,4 @@ def json_schema_to_gbnf(schema: str, prop_order: Optional[List[str]] = None): ) schema = converter.resolve_refs(schema, 
"stdin") converter.visit(schema, "") - return converter.format_grammar() \ No newline at end of file + return converter.format_grammar() diff --git a/nexa/gguf/llama/llama_speculative.py b/nexa/gguf/llama/llama_speculative.py index 6188cb26..39dfb903 100644 --- a/nexa/gguf/llama/llama_speculative.py +++ b/nexa/gguf/llama/llama_speculative.py @@ -61,4 +61,4 @@ def __call__( input_ids=input_ids, max_ngram_size=self.max_ngram_size, num_pred_tokens=self.num_pred_tokens, - ) \ No newline at end of file + ) diff --git a/nexa/gguf/llama/llama_tokenizer.py b/nexa/gguf/llama/llama_tokenizer.py index f89fadd8..a0105cc8 100644 --- a/nexa/gguf/llama/llama_tokenizer.py +++ b/nexa/gguf/llama/llama_tokenizer.py @@ -7,7 +7,7 @@ Any, ) -import nexa.gguf.llama.llama_cpp as llama_cpp +import llama_cpp from nexa.gguf.llama.llama_types import List @@ -27,7 +27,10 @@ def tokenize( @abc.abstractmethod def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False + self, + tokens: List[int], + prev_tokens: Optional[List[int]] = None, + special: bool = False, ) -> bytes: """Detokenize the tokens into text. @@ -49,7 +52,10 @@ def tokenize( return self._model.tokenize(text, add_bos=add_bos, special=special) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False + self, + tokens: List[int], + prev_tokens: Optional[List[int]] = None, + special: bool = False, ) -> bytes: return self._model.detokenize(tokens, special=special) @@ -80,19 +86,24 @@ def tokenize( ) def detokenize( - self, tokens: List[int], prev_tokens: Optional[List[int]] = None, special: bool = False + self, + tokens: List[int], + prev_tokens: Optional[List[int]] = None, + special: bool = False, ) -> bytes: - skip_special_tokens = not special + skip_special_tokens = not special if prev_tokens is not None: - text = self.hf_tokenizer.decode(prev_tokens + tokens, skip_special_tokens=skip_special_tokens).encode( - "utf-8", errors="ignore" - ) - prev_text = self.hf_tokenizer.decode(prev_tokens, skip_special_tokens=skip_special_tokens).encode( - "utf-8", errors="ignore" - ) + text = self.hf_tokenizer.decode( + prev_tokens + tokens, skip_special_tokens=skip_special_tokens + ).encode("utf-8", errors="ignore") + prev_text = self.hf_tokenizer.decode( + prev_tokens, skip_special_tokens=skip_special_tokens + ).encode("utf-8", errors="ignore") return text[len(prev_text) :] else: - return self.hf_tokenizer.decode(tokens, skip_special_tokens=skip_special_tokens).encode("utf-8", errors="ignore") + return self.hf_tokenizer.decode( + tokens, skip_special_tokens=skip_special_tokens + ).encode("utf-8", errors="ignore") @classmethod def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenizer": @@ -106,4 +117,4 @@ def from_pretrained(cls, pretrained_model_name_or_path: str) -> "LlamaHFTokenize hf_tokenizer = AutoTokenizer.from_pretrained( pretrained_model_name_or_path=pretrained_model_name_or_path ) - return cls(hf_tokenizer) \ No newline at end of file + return cls(hf_tokenizer) diff --git a/nexa/gguf/llama/llama_types.py b/nexa/gguf/llama/llama_types.py index 3cc2122e..bbb58afc 100644 --- a/nexa/gguf/llama/llama_types.py +++ b/nexa/gguf/llama/llama_types.py @@ -295,4 +295,4 @@ class ChatCompletionNamedToolChoice(TypedDict): ChatCompletionChunk = CreateChatCompletionStreamResponse ChatCompletionStreamResponse = CreateChatCompletionStreamResponse ChatCompletionResponseFunction = ChatCompletionFunction -ChatCompletionFunctionCall = 
ChatCompletionResponseFunctionCall \ No newline at end of file +ChatCompletionFunctionCall = ChatCompletionResponseFunctionCall diff --git a/nexa/gguf/llama/llava_cpp.py b/nexa/gguf/llama/llava_cpp.py index 9671eafb..9989407a 100644 --- a/nexa/gguf/llama/llava_cpp.py +++ b/nexa/gguf/llama/llava_cpp.py @@ -1,9 +1,6 @@ from __future__ import annotations -import sys import os -import ctypes -import functools from ctypes import ( c_bool, c_char_p, @@ -17,69 +14,29 @@ ) import pathlib from typing import ( - List, Union, NewType, Optional, - TypeVar, - Callable, - Any, TYPE_CHECKING, - Generic, ) -from typing_extensions import TypeAlias import nexa.gguf.llama.llama_cpp as llama_cpp + from nexa.gguf.lib_utils import load_library +from nexa.gguf.llama._ctypes_extensions import ctypes_function_for_shared_library + +if TYPE_CHECKING: + from nexa.gguf.llama._ctypes_extensions import ( + CtypesArray, + ) + # Specify the base name of the shared library to load _libllava_base_name = "llava_shared" # Load the library _libllava = load_library(_libllava_base_name) -# ctypes helper - -if TYPE_CHECKING: - CtypesCData = TypeVar("CtypesCData", bound=ctypes._CData) # type: ignore - - CtypesArray: TypeAlias = ctypes.Array[CtypesCData] # type: ignore - - CtypesPointer: TypeAlias = ctypes._Pointer[CtypesCData] # type: ignore - - CtypesVoidPointer: TypeAlias = ctypes.c_void_p - - class CtypesRef(Generic[CtypesCData]): - pass - - CtypesPointerOrRef: TypeAlias = Union[ - CtypesPointer[CtypesCData], CtypesRef[CtypesCData] - ] - - CtypesFuncPointer: TypeAlias = ctypes._FuncPointer # type: ignore - -F = TypeVar("F", bound=Callable[..., Any]) - - -def ctypes_function_for_shared_library(lib: ctypes.CDLL): - def ctypes_function( - name: str, argtypes: List[Any], restype: Any, enabled: bool = True - ): - def decorator(f: F) -> F: - if enabled: - func = getattr(lib, name) - func.argtypes = argtypes - func.restype = restype - functools.wraps(f)(func) - return func - else: - return f - - return decorator - - return ctypes_function - - ctypes_function = ctypes_function_for_shared_library(_libllava) @@ -112,7 +69,8 @@ class llava_image_embed(Structure): ) def llava_validate_embed_size( ctx_llama: llama_cpp.llama_context_p, ctx_clip: clip_ctx_p, / -) -> bool: ... +) -> bool: + ... # /** build an image embed from image file bytes */ @@ -128,7 +86,8 @@ def llava_image_embed_make_with_bytes( image_bytes: CtypesArray[c_uint8], image_bytes_length: Union[c_int, int], /, -) -> "_Pointer[llava_image_embed]": ... +) -> "_Pointer[llava_image_embed]": + ... # /** build an image embed from a path to an image filename */ @@ -140,13 +99,15 @@ def llava_image_embed_make_with_bytes( ) def llava_image_embed_make_with_filename( ctx_clip: clip_ctx_p, n_threads: Union[c_int, int], image_path: bytes, / -) -> "_Pointer[llava_image_embed]": ... +) -> "_Pointer[llava_image_embed]": + ... # LLAVA_API void llava_image_embed_free(struct llava_image_embed * embed); # /** free an embedding made with llava_image_embed_make_* */ @ctypes_function("llava_image_embed_free", [POINTER(llava_image_embed)], None) -def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): ... +def llava_image_embed_free(embed: "_Pointer[llava_image_embed]", /): + ... # /** write the image represented by embed into the llama context with batch size n_batch, starting at context pos n_past. on completion, n_past points to the next position in the context after the image embed. 
*/ @@ -167,7 +128,8 @@ def llava_eval_image_embed( n_batch: Union[c_int, int], n_past: "_Pointer[c_int]", /, -) -> bool: ... +) -> bool: + ... ################################################ @@ -180,10 +142,13 @@ def llava_eval_image_embed( @ctypes_function("clip_model_load", [c_char_p, c_int], clip_ctx_p_ctypes) def clip_model_load( fname: bytes, verbosity: Union[c_int, int], / -) -> Optional[clip_ctx_p]: ... +) -> Optional[clip_ctx_p]: + ... # /** free mmproj model */ # CLIP_API void clip_free(struct clip_ctx * ctx); @ctypes_function("clip_free", [clip_ctx_p_ctypes], None) -def clip_free(ctx: clip_ctx_p, /): ... \ No newline at end of file +def clip_free(ctx: clip_ctx_p, /): + ... + From c1a43b93ddf576cbbfe7f0046719420ab1b0f019 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Mon, 9 Dec 2024 20:17:31 +0800 Subject: [PATCH 10/20] upgrade --- nexa/gguf/lib_utils.py | 3 +- nexa/gguf/llama/__init__.py | 4 + nexa/gguf/llama/_ctypes_extensions.py | 56 ++ nexa/gguf/llama/_internals_transformers.py | 863 --------------------- nexa/gguf/llama/_logger_transformers.py | 41 - nexa/gguf/llama/_utils_spinner.py | 79 -- nexa/gguf/llama/_utils_transformers.py | 78 -- nexa/gguf/llama/audio_lm_cpp.py | 116 --- nexa/gguf/llama/kv_cache.py | 86 -- nexa/gguf/llama/llama.py | 6 +- nexa/gguf/llama/llama_chat_format.py | 6 +- nexa/gguf/llama/llama_cpp.py | 12 +- nexa/gguf/llama/llama_tokenizer.py | 2 +- nexa/gguf/llama/llava_cpp.py | 14 +- nexa/gguf/llama/omni_vlm_cpp.py | 79 -- nexa/gguf/nexa_inference_audio_lm.py | 2 +- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_vlm_omni.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- nexa/gguf/server/nexa_service.py | 2 +- nexa/gguf/streamlit/streamlit_audio_lm.py | 2 +- tests/test_text_generation.py | 3 + 24 files changed, 96 insertions(+), 1368 deletions(-) delete mode 100644 nexa/gguf/llama/_internals_transformers.py delete mode 100644 nexa/gguf/llama/_logger_transformers.py delete mode 100644 nexa/gguf/llama/_utils_spinner.py delete mode 100644 nexa/gguf/llama/_utils_transformers.py delete mode 100644 nexa/gguf/llama/audio_lm_cpp.py delete mode 100644 nexa/gguf/llama/kv_cache.py delete mode 100644 nexa/gguf/llama/omni_vlm_cpp.py diff --git a/nexa/gguf/lib_utils.py b/nexa/gguf/lib_utils.py index ec030b9d..8397e026 100644 --- a/nexa/gguf/lib_utils.py +++ b/nexa/gguf/lib_utils.py @@ -17,7 +17,8 @@ def is_gpu_available(): # Load the library def load_library(lib_base_name: str): # Construct the paths to the possible shared library names - _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + # _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" + _base_path = pathlib.Path('D:/repo/nexa-ai/llama-cpp-python/llama_cpp/lib') # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths: List[pathlib.Path] = [] diff --git a/nexa/gguf/llama/__init__.py b/nexa/gguf/llama/__init__.py index e69de29b..6f1a2122 100644 --- a/nexa/gguf/llama/__init__.py +++ b/nexa/gguf/llama/__init__.py @@ -0,0 +1,4 @@ +from nexa.gguf.llama.llama_cpp import * +from nexa.gguf.llama.llama import * + +# __version__ = "0.3.2" diff --git a/nexa/gguf/llama/_ctypes_extensions.py b/nexa/gguf/llama/_ctypes_extensions.py index 2ff7e38e..e88ed387 100644 --- a/nexa/gguf/llama/_ctypes_extensions.py +++ b/nexa/gguf/llama/_ctypes_extensions.py @@ -18,6 
+18,61 @@ ) from typing_extensions import TypeAlias + +# Load the library +def load_shared_library(lib_base_name: str, base_path: pathlib.Path): + """Platform independent shared library loader""" + # Searching for the library in the current directory under the name "libllama" (default name + # for llamacpp) and "llama" (default name for this repo) + lib_paths: List[pathlib.Path] = [] + # Determine the file extension based on the platform + if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): + lib_paths += [ + base_path / f"lib{lib_base_name}.so", + ] + elif sys.platform == "darwin": + lib_paths += [ + base_path / f"lib{lib_base_name}.so", + base_path / f"lib{lib_base_name}.dylib", + ] + elif sys.platform == "win32": + lib_paths += [ + base_path / f"{lib_base_name}.dll", + base_path / f"lib{lib_base_name}.dll", + ] + else: + raise RuntimeError("Unsupported platform") + + cdll_args = dict() # type: ignore + + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32": + os.add_dll_directory(str(base_path)) + os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"] + + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(base_path)) + if "CUDA_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) + if "HIP_PATH" in os.environ: + os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) + os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) + cdll_args["winmode"] = ctypes.RTLD_GLOBAL + + # Try to load the shared library, handling potential errors + for lib_path in lib_paths: + if lib_path.exists(): + try: + return ctypes.CDLL(str(lib_path), **cdll_args) # type: ignore + except Exception as e: + raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}") + + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + + # ctypes sane type hint helpers # # - Generic Pointer and Array types @@ -46,6 +101,7 @@ class CtypesRef(Generic[CtypesCData]): F = TypeVar("F", bound=Callable[..., Any]) + def ctypes_function_for_shared_library(lib: ctypes.CDLL): """Decorator for defining ctypes functions with type hints""" diff --git a/nexa/gguf/llama/_internals_transformers.py b/nexa/gguf/llama/_internals_transformers.py deleted file mode 100644 index bbd215d1..00000000 --- a/nexa/gguf/llama/_internals_transformers.py +++ /dev/null @@ -1,863 +0,0 @@ -from __future__ import annotations - -import os -import ctypes - -from typing import ( - Dict, - List, - Tuple, - Optional, - Sequence, -) -from dataclasses import dataclass, field -from contextlib import ExitStack - -import numpy as np -import numpy.typing as npt - -from nexa.gguf.llama.llama_types import * -from nexa.gguf.llama.llama_grammar import LlamaGrammar -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr - -import nexa.gguf.llama.llama_cpp as llama_cpp - - -# Python wrappers over llama.h structs - - -class LlamaModel: - """Intermediate Python wrapper for a llama.cpp llama_model. 
- NOTE: For stability it's recommended you use the Llama class instead.""" - - def __init__( - self, - *, - path_model: str, - params: llama_cpp.llama_model_params, - verbose: bool = True, - ): - self.path_model = path_model - self.params = params - self.verbose = verbose - self._exit_stack = ExitStack() - - model = None - - if not os.path.exists(path_model): - raise ValueError(f"Model path does not exist: {path_model}") - - with suppress_stdout_stderr(disable=verbose): - model = llama_cpp.llama_load_model_from_file( - self.path_model.encode("utf-8"), self.params - ) - - if model is None: - raise ValueError(f"Failed to load model from file: {path_model}") - - self.model = model - - def free_model(): - if self.model is None: - return - llama_cpp.llama_free_model(self.model) - self.model = None - - self._exit_stack.callback(free_model) - - def close(self): - self._exit_stack.close() - - def __del__(self): - self.close() - - def vocab_type(self) -> int: - return llama_cpp.llama_vocab_type(self.model) - - def n_vocab(self) -> int: - return llama_cpp.llama_n_vocab(self.model) - - def n_ctx_train(self) -> int: - return llama_cpp.llama_n_ctx_train(self.model) - - def n_embd(self) -> int: - return llama_cpp.llama_n_embd(self.model) - - def rope_freq_scale_train(self) -> float: - return llama_cpp.llama_rope_freq_scale_train(self.model) - - def desc(self) -> str: - buf = ctypes.create_string_buffer(1024) - llama_cpp.llama_model_desc(self.model, buf, 1024) - return buf.value.decode("utf-8") - - def size(self) -> int: - return llama_cpp.llama_model_size(self.model) - - def n_params(self) -> int: - return llama_cpp.llama_model_n_params(self.model) - - def get_tensor(self, name: str) -> ctypes.c_void_p: - return llama_cpp.llama_get_model_tensor(self.model, name.encode("utf-8")) - - # Vocab - - def token_get_text(self, token: int) -> str: - return llama_cpp.llama_token_get_text(self.model, token).decode("utf-8") - - def token_get_score(self, token: int) -> float: - return llama_cpp.llama_token_get_score(self.model, token) - - def token_get_attr(self, token: int) -> int: - return llama_cpp.llama_token_get_attr(self.model, token) - - # Special tokens - - def token_bos(self) -> int: - return llama_cpp.llama_token_bos(self.model) - - def token_eos(self) -> int: - return llama_cpp.llama_token_eos(self.model) - - def token_cls(self) -> int: - return llama_cpp.llama_token_cls(self.model) - - def token_sep(self) -> int: - return llama_cpp.llama_token_sep(self.model) - - def token_nl(self) -> int: - return llama_cpp.llama_token_nl(self.model) - - def token_prefix(self) -> int: - return llama_cpp.llama_token_prefix(self.model) - - def token_middle(self) -> int: - return llama_cpp.llama_token_middle(self.model) - - def token_suffix(self) -> int: - return llama_cpp.llama_token_suffix(self.model) - - def token_eot(self) -> int: - return llama_cpp.llama_token_eot(self.model) - - def add_bos_token(self) -> bool: - return llama_cpp.llama_add_bos_token(self.model) - - def add_eos_token(self) -> bool: - return llama_cpp.llama_add_eos_token(self.model) - - # Tokenization - - def tokenize(self, text: bytes, add_bos: bool, special: bool): - n_ctx = self.n_ctx_train() - tokens = (llama_cpp.llama_token * n_ctx)() - n_tokens = llama_cpp.llama_tokenize( - self.model, text, len(text), tokens, n_ctx, add_bos, special - ) - if n_tokens < 0: - n_tokens = abs(n_tokens) - tokens = (llama_cpp.llama_token * n_tokens)() - n_tokens = llama_cpp.llama_tokenize( - self.model, text, len(text), tokens, n_tokens, add_bos, special - ) - if 
n_tokens < 0: - raise RuntimeError( - f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' - ) - return list(tokens[:n_tokens]) - - def token_to_piece(self, token: int, special: bool = False) -> bytes: - buf = ctypes.create_string_buffer(32) - llama_cpp.llama_token_to_piece(self.model, token, buf, 32, 0, special) - return bytes(buf) - - def detokenize(self, tokens: List[int], special: bool = False) -> bytes: - output = b"" - size = 32 - buffer = (ctypes.c_char * size)() - for token in tokens: - n = llama_cpp.llama_token_to_piece( - self.model, llama_cpp.llama_token(token), buffer, size, 0, special - ) - assert n <= size - output += bytes(buffer[:n]) - # NOTE: Llama1 models automatically added a space at the start of the prompt - # this line removes a leading space if the first token is a beginning of sentence token - return ( - output[1:] - if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" " - else output - ) - - # Extra - def metadata(self) -> Dict[str, str]: - metadata: Dict[str, str] = {} - buffer_size = 1024 - buffer = ctypes.create_string_buffer(buffer_size) - # zero the buffer - buffer.value = b"\0" * buffer_size - # iterate over model keys - for i in range(llama_cpp.llama_model_meta_count(self.model)): - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) - if nbytes > buffer_size: - buffer_size = nbytes + 1 - buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_key_by_index( - self.model, i, buffer, buffer_size - ) - key = buffer.value.decode("utf-8") - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) - if nbytes > buffer_size: - buffer_size = nbytes + 1 - buffer = ctypes.create_string_buffer(buffer_size) - nbytes = llama_cpp.llama_model_meta_val_str_by_index( - self.model, i, buffer, buffer_size - ) - value = buffer.value.decode("utf-8") - metadata[key] = value - return metadata - - @staticmethod - def default_params(): - """Get the default llama_model_params.""" - return llama_cpp.llama_model_default_params() - - -class LlamaContext: - """Intermediate Python wrapper for a llama.cpp llama_context. 
- NOTE: For stability it's recommended you use the Llama class instead.""" - - def __init__( - self, - *, - model: LlamaModel, - params: llama_cpp.llama_context_params, - verbose: bool = True, - ): - self.model = model - self.params = params - self.verbose = verbose - self._exit_stack = ExitStack() - - ctx = llama_cpp.llama_new_context_with_model(self.model.model, self.params) - - if ctx is None: - raise ValueError("Failed to create llama_context") - - self.ctx = ctx - - def free_ctx(): - if self.ctx is None: - return - llama_cpp.llama_free(self.ctx) - self.ctx = None - - self._exit_stack.callback(free_ctx) - - def close(self): - self._exit_stack.close() - - def __del__(self): - self.close() - - def n_ctx(self) -> int: - return llama_cpp.llama_n_ctx(self.ctx) - - def pooling_type(self) -> int: - return llama_cpp.llama_pooling_type(self.ctx) - - def kv_cache_clear(self): - llama_cpp.llama_kv_cache_clear(self.ctx) - - def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) - - def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): - llama_cpp.llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) - - def kv_cache_seq_keep(self, seq_id: int): - llama_cpp.llama_kv_cache_seq_keep(self.ctx, seq_id) - - def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): - llama_cpp.llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) - - def get_state_size(self) -> int: - return llama_cpp.llama_get_state_size(self.ctx) - - # TODO: copy_state_data - - # TODO: set_state_data - - # TODO: llama_load_session_file - - # TODO: llama_save_session_file - - def decode(self, batch: LlamaBatch): - return_code = llama_cpp.llama_decode( - self.ctx, - batch.batch, - ) - if return_code != 0: - raise RuntimeError(f"llama_decode returned {return_code}") - - def set_n_threads(self, n_threads: int, n_threads_batch: int): - llama_cpp.llama_set_n_threads(self.ctx, n_threads, n_threads_batch) - - def get_logits(self): - return llama_cpp.llama_get_logits(self.ctx) - - def get_logits_ith(self, i: int): - return llama_cpp.llama_get_logits_ith(self.ctx, i) - - def get_embeddings(self): - return llama_cpp.llama_get_embeddings(self.ctx) - - # Sampling functions - - def set_rng_seed(self, seed: int): - # TODO: Fix - llama_cpp.llama_set_rng_seed(self.ctx, seed) - - def sample_repetition_penalties( - self, - candidates: "_LlamaTokenDataArray", - last_tokens_data: "llama_cpp.Array[llama_cpp.llama_token]", - penalty_last_n: int, - penalty_repeat: float, - penalty_freq: float, - penalty_present: float, - ): - llama_cpp.llama_sample_repetition_penalties( - self.ctx, - llama_cpp.byref(candidates.candidates), - last_tokens_data, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - ) - - def sample_softmax(self, candidates: "_LlamaTokenDataArray"): - llama_cpp.llama_sample_softmax( - self.ctx, - llama_cpp.byref(candidates.candidates), - ) - - def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): - llama_cpp.llama_sample_top_k( - self.ctx, llama_cpp.byref(candidates.candidates), k, min_keep - ) - - def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - llama_cpp.llama_sample_top_p( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - ) - - def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): - llama_cpp.llama_sample_min_p( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - ) - - def 
sample_typical( - self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int - ): - llama_cpp.llama_sample_typical( - self.ctx, llama_cpp.byref(candidates.candidates), p, min_keep - ) - - def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): - llama_cpp.llama_sample_temp( - self.ctx, llama_cpp.byref(candidates.candidates), temp - ) - - def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): - llama_cpp.llama_sample_grammar( - self.ctx, - llama_cpp.byref(candidates.candidates), - grammar.grammar, - ) - - def sample_token_mirostat( - self, - candidates: "_LlamaTokenDataArray", - tau: float, - eta: float, - m: int, - mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], - ) -> int: - return llama_cpp.llama_sample_token_mirostat( - self.ctx, - llama_cpp.byref(candidates.candidates), - tau, - eta, - m, - mu, - ) - - def sample_token_mirostat_v2( - self, - candidates: "_LlamaTokenDataArray", - tau: float, - eta: float, - mu: llama_cpp.CtypesPointerOrRef[ctypes.c_float], - ) -> int: - return llama_cpp.llama_sample_token_mirostat_v2( - self.ctx, - llama_cpp.byref(candidates.candidates), - tau, - eta, - mu, - ) - - def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: - return llama_cpp.llama_sample_token_greedy( - self.ctx, - llama_cpp.byref(candidates.candidates), - ) - - def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: - return llama_cpp.llama_sample_token( - self.ctx, - llama_cpp.byref(candidates.candidates), - ) - - # Grammar - def grammar_accept_token(self, grammar: LlamaGrammar, token: int): - llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token) - - def reset_timings(self): - llama_cpp.llama_perf_context_reset(self.ctx) - - def print_timings(self): - llama_cpp.llama_perf_context_print(self.ctx) - - # Utility functions - @staticmethod - def default_params(): - """Get the default llama_context_params.""" - return llama_cpp.llama_context_default_params() - - -class LlamaBatch: - def __init__( - self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True - ): - self._n_tokens = n_tokens - self.embd = embd - self.n_seq_max = n_seq_max - self.verbose = verbose - self._exit_stack = ExitStack() - - batch = llama_cpp.llama_batch_init(self._n_tokens, self.embd, self.n_seq_max) - - if batch is None: - raise ValueError("Failed to create llama_batch") - - self.batch = batch - - def free_batch(): - if self.batch is None: - return - llama_cpp.llama_batch_free(self.batch) - self.batch = None - - self._exit_stack.callback(free_batch) - - def close(self): - self._exit_stack.close() - - def __del__(self): - self.close() - - def n_tokens(self) -> int: - return self.batch.n_tokens - - def reset(self): - self.batch.n_tokens = 0 - - def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): - n_tokens = len(batch) - self.batch.n_tokens = n_tokens - for i in range(n_tokens): - self.batch.token[i] = batch[i] - self.batch.pos[i] = n_past + i - self.batch.seq_id[i][0] = 0 - self.batch.n_seq_id[i] = 1 - self.batch.logits[i] = logits_all - self.batch.logits[n_tokens - 1] = True - - def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): - n_tokens = len(batch) - n_tokens0 = self.batch.n_tokens - self.batch.n_tokens += n_tokens - for i in range(n_tokens): - j = n_tokens0 + i - self.batch.token[j] = batch[i] - self.batch.pos[j] = i - self.batch.seq_id[j][0] = seq_id - self.batch.n_seq_id[j] = 1 - self.batch.logits[j] = logits_all - self.batch.logits[n_tokens - 1] = True 
- - -class LlamaTokenDataArray: - def __init__(self, *, n_vocab: int): - self.n_vocab = n_vocab - self.candidates_data = np.recarray( - (self.n_vocab,), - dtype=np.dtype( - [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True - ), - ) - self.candidates = llama_cpp.llama_token_data_array( - data=self.candidates_data.ctypes.data_as(llama_cpp.llama_token_data_p), - size=self.n_vocab, - sorted=False, - ) - self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore - self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) - - def copy_logits(self, logits: npt.NDArray[np.single]): - self.candidates_data.id[:] = self.default_candidates_data_id - self.candidates_data.logit[:] = logits - self.candidates_data.p[:] = self.default_candidates_data_p - self.candidates.sorted = False - self.candidates.size = self.n_vocab - - -# Embedding functions - - -def normalize_embedding(embedding): - norm = float(np.linalg.norm(embedding)) - if norm == 0.0: - return embedding - return [v / norm for v in embedding] - - -# Python wrappers over common/sampling structs - - -@dataclass -class LlamaSamplingParams: - n_prev: int = 64 - n_probs: int = 0 - top_k: int = 40 - top_p: float = 0.95 - min_p: float = 0.05 - tfs_z: float = 1.00 - typical_p: float = 1.00 - temp: float = 0.80 - penalty_last_n: int = 64 - penalty_repeat: float = 1.0 - penalty_freq: float = 0.00 - penalty_present: float = 0.00 - mirostat: int = 0 - mirostat_tau: float = 5.00 - mirostat_eta: float = 0.10 - penalize_nl: bool = True - - grammar: str = "" - - cfg_negative_prompt: str = "" - cfg_scale: float = 1.00 - - logit_bias: dict[int, float] = field(default_factory=dict) - - -@dataclass -class LlamaSamplingContext: - params: LlamaSamplingParams = field(default_factory=LlamaSamplingParams) - mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float) - grammar: Optional[LlamaGrammar] = None - # NOTE: Missing parsed_grammar - prev: list[int] = field(default_factory=list) - cur: list[llama_cpp.llama_token_data] = field(default_factory=list) - - def reset(self): - self.prev = [] - self.cur = [] - if self.grammar is not None: - self.grammar.reset() - - def cp(self): - return LlamaSamplingContext( - params=self.params, - mirostat_mu=self.mirostat_mu, - grammar=self.grammar, - prev=self.prev.copy(), - cur=self.cur.copy(), - ) - - def last(self) -> Optional[int]: - if len(self.prev) > 0: - return self.prev[-1] - else: - return None - - def prev_str(self, ctx_main: LlamaContext, n: int) -> str: - return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8") - - def sample( - self, - ctx_main: LlamaContext, - idx: int = 0, - logits_array: Optional[npt.NDArray[np.single]] = None, - ): - n_vocab = ctx_main.model.n_vocab() - id: int = 0 - - if logits_array is None: - logits = ctx_main.get_logits_ith(idx) - logits_array = np.array( - ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents, - dtype=np.single, - ) - - # apply logit_bias - for token, logit_bias in self.params.logit_bias.items(): - logits_array[token] += logit_bias - - token_data_array = LlamaTokenDataArray( - n_vocab=n_vocab - ) # TODO: Only create this once - token_data_array.copy_logits(logits_array) - - # apply penalties - if len(self.prev) > 0: - nl_token = ctx_main.model.token_nl() - nl_logit = logits_array[nl_token] - last_tokens = self.prev[-self.params.penalty_last_n :] - last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) - if last_tokens_size > 0: - last_tokens_p = (llama_cpp.llama_token 
* len(last_tokens))(*last_tokens) - ctx_main.sample_repetition_penalties( - token_data_array, - last_tokens_p, - last_tokens_size, - self.params.penalty_repeat, - self.params.penalty_freq, - self.params.penalty_present, - ) - if not self.params.penalize_nl: - token_data_array.candidates_data.logit[nl_token] = nl_logit - - if self.grammar is not None: - ctx_main.sample_grammar(token_data_array, self.grammar) - - if self.params.temp < 0: - ctx_main.sample_softmax(token_data_array) - id = token_data_array.candidates_data.id[0] - elif self.params.temp == 0: - id = ctx_main.sample_token_greedy(token_data_array) - else: - if self.params.mirostat == 1: - mirostat_m = 100 - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - mirostat_m, - ctypes.pointer(self.mirostat_mu), - ) - elif self.params.mirostat == 2: - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token_mirostat_v2( - token_data_array, - self.params.mirostat_tau, - self.params.mirostat_eta, - ctypes.pointer(self.mirostat_mu), - ) - else: - min_keep = max(1, self.params.n_probs) - ctx_main.sample_top_k( - token_data_array, self.params.top_k, min_keep=min_keep - ) - ctx_main.sample_typical( - token_data_array, self.params.typical_p, min_keep=min_keep - ) - ctx_main.sample_top_p( - token_data_array, self.params.top_p, min_keep=min_keep - ) - ctx_main.sample_min_p( - token_data_array, self.params.min_p, min_keep=min_keep - ) - ctx_main.sample_temp(token_data_array, self.params.temp) - id = ctx_main.sample_token(token_data_array) - return id - - def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): - if apply_grammar and self.grammar is not None: - ctx_main.grammar_accept_token(self.grammar, id) - self.prev.append(id) - - -from typing import List, Callable, Optional, Union -import ctypes -import llama_cpp - - -class CustomSampler: - def __init__( - self, apply_func: typing.Callable[[llama_cpp.llama_token_data_array], None] - ): - self.apply_func = apply_func - - def apply_wrapper( - sampler: llama_cpp.llama_sampler_p, - cur_p: llama_cpp.llama_token_data_array_p, - ): - self.apply_func(cur_p) - - def free_wrapper(sampler: llama_cpp.llama_sampler_p): - pass - - sampler_i = llama_cpp.llama_sampler_i() - sampler_i.apply = llama_cpp.llama_sampler_i_apply(apply_wrapper) - self._apply_wrapper_ref = apply_wrapper - - sampler_i.name = llama_cpp.llama_sampler_i_name(0) - sampler_i.accept = llama_cpp.llama_sampler_i_accept(0) - sampler_i.reset = llama_cpp.llama_sampler_i_reset(0) - sampler_i.clone = llama_cpp.llama_sampler_i_clone(0) - sampler_i.free = llama_cpp.llama_sampler_i_free(0) - - self.sampler = llama_cpp.llama_sampler() - self.sampler.iface = ctypes.pointer(sampler_i) - self.sampler.ctx = None - - def get_sampler(self) -> llama_cpp.llama_sampler_p: - return ctypes.pointer(self.sampler) - - -class LlamaSampler: - def __init__(self): - params = llama_cpp.llama_sampler_chain_params() - self.sampler = llama_cpp.llama_sampler_chain_init(params) - self.samplers: List[llama_cpp.llama_sampler_p] = [] - self.custom_samplers: List[Tuple[int, CustomSampler]] = [] - - def add_greedy(self): - sampler = llama_cpp.llama_sampler_init_greedy() - self._add_sampler(sampler) - - def add_dist(self, seed: int): - sampler = llama_cpp.llama_sampler_init_dist(seed) - self._add_sampler(sampler) - - def add_softmax(self): - sampler = llama_cpp.llama_sampler_init_softmax() - self._add_sampler(sampler) 
- - def add_top_k(self, k: int): - sampler = llama_cpp.llama_sampler_init_top_k(k) - self._add_sampler(sampler) - - def add_top_p(self, p: float, min_keep: int): - sampler = llama_cpp.llama_sampler_init_top_p(p, min_keep) - self._add_sampler(sampler) - - def add_min_p(self, p: float, min_keep: int): - sampler = llama_cpp.llama_sampler_init_min_p(p, min_keep) - self._add_sampler(sampler) - - def add_typical(self, p: float, min_keep: int): - sampler = llama_cpp.llama_sampler_init_typical(p, min_keep) - self._add_sampler(sampler) - - def add_temp(self, temp: float): - sampler = llama_cpp.llama_sampler_init_temp(temp) - self._add_sampler(sampler) - - def add_temp_ext(self, t: float, delta: float, exponent: float): - sampler = llama_cpp.llama_sampler_init_temp_ext(t, delta, exponent) - self._add_sampler(sampler) - - def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): - sampler = llama_cpp.llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) - self._add_sampler(sampler) - - def add_mirostat_v2(self, seed: int, tau: float, eta: float): - sampler = llama_cpp.llama_sampler_init_mirostat_v2(seed, tau, eta) - self._add_sampler(sampler) - - def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): - sampler = llama_cpp.llama_sampler_init_grammar( - model.model, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") - ) - self._add_sampler(sampler) - - def add_penalties( - self, - n_vocab: int, - special_eos_id: int, - linefeed_id: int, - penalty_last_n: int, - penalty_repeat: float, - penalty_freq: float, - penalty_present: float, - penalize_nl: bool, - ignore_eos: bool, - ): - sampler = llama_cpp.llama_sampler_init_penalties( - n_vocab, - special_eos_id, - linefeed_id, - penalty_last_n, - penalty_repeat, - penalty_freq, - penalty_present, - penalize_nl, - ignore_eos, - ) - self._add_sampler(sampler) - - def init_logit_bias( - self, n_vocab: int, n_logit_bias, logit_bias: llama_cpp.llama_logit_bias_p - ): - sampler = llama_cpp.llama_sampler_init_logit_bias( - n_vocab, n_logit_bias, logit_bias - ) - self._add_sampler(sampler) - - def add_custom( - self, apply_func: Callable[[llama_cpp.llama_token_data_array], None] - ): - custom_sampler = CustomSampler(apply_func) - sampler = custom_sampler.get_sampler() - self._add_sampler(sampler) - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - self.custom_samplers.append( - (llama_cpp.llama_sampler_chain_n(self.sampler) - 1, custom_sampler) - ) - - def _add_sampler(self, sampler: llama_cpp.llama_sampler_p): - assert self.sampler is not None - llama_cpp.llama_sampler_chain_add(self.sampler, sampler) - self.samplers.append(sampler) - - def get_seed(self) -> int: - assert self.sampler is not None - return llama_cpp.llama_sampler_get_seed(self.sampler) - - def sample(self, ctx: LlamaContext, idx: int) -> int: - assert self.sampler is not None - return llama_cpp.llama_sampler_sample(self.sampler, ctx.ctx, idx) - - def close(self): - if self.sampler: - # NOTE: Must remove custom samplers before free or llama.cpp will try to free them - for i, _ in reversed(self.custom_samplers): - llama_cpp.llama_sampler_chain_remove(self.sampler, i) - llama_cpp.llama_sampler_free(self.sampler) - self.sampler = None - self.samplers.clear() - self.custom_samplers.clear() - - def __del__(self): - self.close() diff --git a/nexa/gguf/llama/_logger_transformers.py b/nexa/gguf/llama/_logger_transformers.py deleted file mode 100644 index 83721274..00000000 --- a/nexa/gguf/llama/_logger_transformers.py +++ 
/dev/null @@ -1,41 +0,0 @@ -import sys -import ctypes -import logging - -from nexa.gguf.llama import llama_cpp - -# Mapping ggml log levels to Python logging levels -GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { - 2: logging.ERROR, - 3: logging.WARNING, - 4: logging.INFO, - 5: logging.DEBUG, -} - -# Initialize the logger for llama-cpp-python -logger = logging.getLogger("nexa-transformers") - -# # Define the log callback function -# @llama_cpp.llama_log_callback -# def llama_log_callback( -# level: int, -# text: bytes, -# user_data: ctypes.c_void_p, -# ): -# # Check if the logger is set to log the provided level -# if logger.level <= GGML_LOG_LEVEL_TO_LOGGING_LEVEL[level]: -# # Print the log message to stderr -# print(text.decode("utf-8"), end="", flush=True, file=sys.stderr) - -# # Set the log callback function for llama_cpp -# llama_cpp.llama_log_set(llama_log_callback, ctypes.c_void_p(0)) - -# Utility function to set verbosity -def set_verbose(verbose: bool): - logger.setLevel(logging.DEBUG if verbose else logging.ERROR) - -# Example usage -if __name__ == "__main__": - # Set the verbosity based on a condition or user input - set_verbose(False) - # Rest of your application code here diff --git a/nexa/gguf/llama/_utils_spinner.py b/nexa/gguf/llama/_utils_spinner.py deleted file mode 100644 index f89ba2e1..00000000 --- a/nexa/gguf/llama/_utils_spinner.py +++ /dev/null @@ -1,79 +0,0 @@ -# For similar spinner animation implementation, refer to: nexa/utils.py - -import sys -import threading -import time -import os -import itertools -from contextlib import contextmanager - -def get_spinner_style(style="default"): - spinners = { - "default": ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] - } - return spinners.get(style, spinners["default"]) - -def _get_output_stream(): - """Get the appropriate output stream based on platform.""" - if sys.platform == "win32": - return open('CONOUT$', 'wb') - else: - try: - return os.open('/dev/tty', os.O_WRONLY) - except (FileNotFoundError, OSError): - return os.open('/dev/stdout', os.O_WRONLY) - -def show_spinner(stop_event, style="default", message=""): - spinner = itertools.cycle(get_spinner_style(style)) - fd = _get_output_stream() - is_windows = sys.platform == "win32" - - try: - while not stop_event.is_set(): - display = f"\r{message} {next(spinner)}" if message else f"\r{next(spinner)} " - - if is_windows: - fd.write(display.encode()) - fd.flush() - else: - os.write(fd, display.encode()) - time.sleep(0.1) - - # Clear the spinner - clear_msg = b"\r" + b" " * (len(message) + 2) + b"\r" - if is_windows: - fd.write(clear_msg) - fd.flush() - else: - os.write(fd, clear_msg) - - finally: - if is_windows: - fd.close() - else: - os.close(fd) - -def start_spinner(style="default", message=""): - stop_event = threading.Event() - spinner_thread = threading.Thread( - target=show_spinner, - args=(stop_event, style, message), - daemon=True - ) - spinner_thread.start() - return stop_event, spinner_thread - -def stop_spinner(stop_event, spinner_thread): - if stop_event and not stop_event.is_set(): - stop_event.set() - if spinner_thread and spinner_thread.is_alive(): - spinner_thread.join() - -@contextmanager -def spinning_cursor(message="", style="default"): - """Context manager for spinner animation.""" - stop_event, thread = start_spinner(style, message) - try: - yield - finally: - stop_spinner(stop_event, thread) \ No newline at end of file diff --git a/nexa/gguf/llama/_utils_transformers.py b/nexa/gguf/llama/_utils_transformers.py deleted file mode 100644 index 
29628193..00000000 --- a/nexa/gguf/llama/_utils_transformers.py +++ /dev/null @@ -1,78 +0,0 @@ -import os -import sys - -from typing import Any, Dict - -# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor -outnull_file = open(os.devnull, "w") -errnull_file = open(os.devnull, "w") - -STDOUT_FILENO = 1 -STDERR_FILENO = 2 - - -class suppress_stdout_stderr(object): - # NOTE: these must be "saved" here to avoid exceptions when using - # this context manager inside of a __del__ method - sys = sys - os = os - - def __init__(self, disable: bool = True): - self.disable = disable - - # Oddly enough this works better than the contextlib version - def __enter__(self): - if self.disable: - return self - - self.old_stdout_fileno_undup = STDOUT_FILENO - self.old_stderr_fileno_undup = STDERR_FILENO - - self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup) - self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup) - - self.old_stdout = self.sys.stdout - self.old_stderr = self.sys.stderr - - self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup) - self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup) - - self.sys.stdout = outnull_file - self.sys.stderr = errnull_file - return self - - def __exit__(self, *_): - if self.disable: - return - - # Check if sys.stdout and sys.stderr have fileno method - self.sys.stdout = self.old_stdout - self.sys.stderr = self.old_stderr - - self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) - self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) - - self.os.close(self.old_stdout_fileno) - self.os.close(self.old_stderr_fileno) - - -class MetaSingleton(type): - """ - Metaclass for implementing the Singleton pattern. - """ - - _instances: Dict[type, Any] = {} - - def __call__(cls, *args: Any, **kwargs: Any) -> Any: - if cls not in cls._instances: - cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs) - return cls._instances[cls] - - -class Singleton(object, metaclass=MetaSingleton): - """ - Base class for implementing the Singleton pattern. 
- """ - - def __init__(self): - super(Singleton, self).__init__() diff --git a/nexa/gguf/llama/audio_lm_cpp.py b/nexa/gguf/llama/audio_lm_cpp.py deleted file mode 100644 index 88db2a33..00000000 --- a/nexa/gguf/llama/audio_lm_cpp.py +++ /dev/null @@ -1,116 +0,0 @@ -import ctypes -import os -import sys -from pathlib import Path - - -# Load the library -def _load_shared_library(lib_base_name: str, base_path: Path = None): - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - lib_ext = ".so" - elif sys.platform == "darwin": - lib_ext = ".dylib" - elif sys.platform == "win32": - lib_ext = ".dll" - else: - raise RuntimeError("Unsupported platform") - # Construct the paths to the possible shared library names - if base_path is None: - _base_path = Path(__file__).parent.parent.resolve() - else: - _base_path = base_path - _lib_paths = [ - _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}", - ] - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(_base_path)) - # Try to load the shared library, handling potential errors - for _lib_path in _lib_paths: - if _lib_path.exists(): - try: - return ctypes.CDLL(str(_lib_path)) - except Exception as e: - print(f"Failed to load shared library '{_lib_path}': {e}") - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - -def _get_lib(is_qwen: bool = True): - # Specify the base name of the shared library to load - _lib_base_name = "nexa-qwen2-audio-lib_shared" if is_qwen else "nexa-omni-audio-lib_shared" - base_path = ( - Path(__file__).parent.parent.parent.parent.resolve() - / "nexa" - / "gguf" - / "lib" - ) - return _load_shared_library(_lib_base_name, base_path) - -# Initialize both libraries -_lib_omni = _get_lib(is_qwen=False) -_lib_qwen = _get_lib(is_qwen=True) - -# conda config --add channels conda-forge -# conda update libstdcxx-ng -# struct omni_context_params -# { -# char *model; -# char *mmproj; -# char *file; -# char *prompt; -# int32_t n_gpu_layers; -# }; -class omni_context_params(ctypes.Structure): - _fields_ = [ - ("model", ctypes.c_char_p), - ("mmproj", ctypes.c_char_p), - ("file", ctypes.c_char_p), - ("prompt", ctypes.c_char_p), - ("n_gpu_layers", ctypes.c_int32), - ] - -omni_context_params_p = ctypes.POINTER(omni_context_params) -omni_context_p = ctypes.c_void_p - -# OMNI_AUDIO_API omni_context_params omni_context_default_params(); -def context_default_params(is_qwen: bool = True) -> omni_context_params: - _lib = _lib_qwen if is_qwen else _lib_omni - return _lib.omni_context_default_params() - -# OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params ¶ms); -def init_context(params: omni_context_params_p, is_qwen: bool = True) -> omni_context_p: # type: ignore - _lib = _lib_qwen if is_qwen else _lib_omni - return _lib.omni_init_context(params) - -# OMNI_AUDIO_API void omni_process_full( -# struct omni_context *ctx_omni, -# omni_context_params ¶ms -# ); -def process_full(ctx: omni_context_p, params: omni_context_params_p, is_qwen: bool = True): # type: ignore - _lib = _lib_qwen if is_qwen else _lib_omni - return _lib.omni_process_full(ctx, params) - -# OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni); -def free(ctx: omni_context_p, is_qwen: bool = True): - _lib = _lib_qwen if is_qwen else _lib_omni - return _lib.omni_free(ctx) - -for lib in [_lib_omni, _lib_qwen]: - # Configure 
context_default_params - lib.omni_context_default_params.argtypes = [] - lib.omni_context_default_params.restype = omni_context_params - - # Configure init_context - lib.omni_init_context.argtypes = [omni_context_params_p] - lib.omni_init_context.restype = omni_context_p - - # Configure process_full - lib.omni_process_full.argtypes = [omni_context_p, omni_context_params_p] - lib.omni_process_full.restype = ctypes.c_char_p - - # Configure free - lib.omni_free.argtypes = [omni_context_p] - lib.omni_free.restype = None diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py deleted file mode 100644 index aa01630d..00000000 --- a/nexa/gguf/llama/kv_cache.py +++ /dev/null @@ -1,86 +0,0 @@ -from nexa.gguf.llama.llama_cache import LlamaDiskCache -from typing import Any, Dict - -def run_inference_with_disk_cache( - model: Any, - cache_prompt: str, - total_prompt: str, - use_cache: bool = True, - cache_dir: str = "llama.cache", - **kwargs: Dict[str, Any] -) -> Any: - """ - Runs inference using a disk cache to store and retrieve model states. - - Parameters: - - model: The model object that supports caching and inference. - - cache_prompt: The prompt used to generate a cache key. - - total_prompt: The full prompt for generating output. - - use_cache: Flag to determine if caching should be used. - - cache_dir: Directory where cache files are stored. - - kwargs: Additional parameters for model inference. - - Returns: - - The output generated by the model. - """ - temperature = kwargs.get('temperature', 0.7) - max_tokens = kwargs.get('max_tokens', 2048) - top_p = kwargs.get('top_p', 0.8) - top_k = kwargs.get('top_k', 50) - repeat_penalty = kwargs.get('repeat_penalty', 1.0) - - if use_cache: - # Initialize disk cache with specified directory - cache_context = LlamaDiskCache(cache_dir=cache_dir) - model.set_cache(cache_context) - # Convert prompt to tokens for cache key - prompt_tokens = model.tokenize(cache_prompt.encode("utf-8")) - - try: - # Try to load existing cache - cached_state = cache_context[prompt_tokens] - model.load_state(cached_state) - - output = model( - total_prompt, - max_tokens=max_tokens, - temperature=temperature, - stream=True, - ) - except KeyError: - # If cache doesn't exist, create it - model.reset() - # Run initial inference to populate cache - _ = model( - cache_prompt, - max_tokens=1, # Minimal tokens for cache creation - temperature=temperature, - echo=False, - ) - # Save the state to cache - cache_context[prompt_tokens] = model.save_state() - - # Generate output after creating cache - output = model( - total_prompt, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - repeat_penalty=repeat_penalty, - stream=True, - ) - else: - model.reset() - model.set_cache(None) - - output = model( - total_prompt, - max_tokens=max_tokens, - temperature=temperature, - top_p=top_p, - top_k=top_k, - repeat_penalty=repeat_penalty, - stream=True, - ) - return output \ No newline at end of file diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index 4ceb378f..72490310 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -47,9 +47,9 @@ import numpy as np import numpy.typing as npt -import nexa.gguf.llama._internals_transformers as internals -from nexa.gguf.llama._logger_transformers import set_verbose -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +import nexa.gguf.llama._internals as internals +from nexa.gguf.llama._logger import set_verbose +from nexa.gguf.llama._utils import 
suppress_stdout_stderr class Llama: diff --git a/nexa/gguf/llama/llama_chat_format.py b/nexa/gguf/llama/llama_chat_format.py index aeee3399..bb0dc41f 100644 --- a/nexa/gguf/llama/llama_chat_format.py +++ b/nexa/gguf/llama/llama_chat_format.py @@ -32,8 +32,8 @@ import nexa.gguf.llama.llama_types as llama_types import nexa.gguf.llama.llama_grammar as llama_grammar -from nexa.gguf.llama._logger_transformers import logger -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr, Singleton +from nexa.gguf.llama._logger import logger +from nexa.gguf.llama._utils import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### @@ -2667,7 +2667,7 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import nexa.gguf.llama.llava_cpp as llava_cpp + import llama_cpp.llava_cpp as llava_cpp self.clip_model_path = clip_model_path self.verbose = verbose diff --git a/nexa/gguf/llama/llama_cpp.py b/nexa/gguf/llama/llama_cpp.py index 3f4b9baa..0f390932 100644 --- a/nexa/gguf/llama/llama_cpp.py +++ b/nexa/gguf/llama/llama_cpp.py @@ -12,13 +12,14 @@ TYPE_CHECKING, ) -from typing_extensions import ( +from nexa.gguf.llama._ctypes_extensions import ( + load_shared_library, byref, ctypes_function_for_shared_library, ) if TYPE_CHECKING: - from typing_extensions import ( + from nexa.gguf.llama._ctypes_extensions import ( CtypesCData, CtypesArray, CtypesPointer, @@ -28,13 +29,14 @@ CtypesFuncPointer, ) -from nexa.gguf.lib_utils import load_library -from nexa.gguf.llama._ctypes_extensions import ctypes_function_for_shared_library # Specify the base name of the shared library to load _lib_base_name = "llama" +# _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH") +_override_base_path = 'D:/repo/nexa-ai/llama-cpp-python/llama_cpp/lib' +_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) # Load the library -_lib = load_library(_lib_base_name) +_lib = load_shared_library(_lib_base_name, _base_path) ctypes_function = ctypes_function_for_shared_library(_lib) diff --git a/nexa/gguf/llama/llama_tokenizer.py b/nexa/gguf/llama/llama_tokenizer.py index a0105cc8..cefd3011 100644 --- a/nexa/gguf/llama/llama_tokenizer.py +++ b/nexa/gguf/llama/llama_tokenizer.py @@ -7,7 +7,7 @@ Any, ) -import llama_cpp +import nexa.gguf.llama from nexa.gguf.llama.llama_types import List diff --git a/nexa/gguf/llama/llava_cpp.py b/nexa/gguf/llama/llava_cpp.py index 9989407a..8ac2934b 100644 --- a/nexa/gguf/llama/llava_cpp.py +++ b/nexa/gguf/llama/llava_cpp.py @@ -22,20 +22,24 @@ import nexa.gguf.llama.llama_cpp as llama_cpp -from nexa.gguf.lib_utils import load_library - -from nexa.gguf.llama._ctypes_extensions import ctypes_function_for_shared_library +from nexa.gguf.llama._ctypes_extensions import ( + load_shared_library, + ctypes_function_for_shared_library, +) if TYPE_CHECKING: from nexa.gguf.llama._ctypes_extensions import ( CtypesArray, ) + # Specify the base name of the shared library to load -_libllava_base_name = "llava_shared" +_libllava_base_name = "llava" +_libllava_override_path = os.environ.get("LLAVA_CPP_LIB") +_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() # Load the library -_libllava = load_library(_libllava_base_name) +_libllava = load_shared_library(_libllava_base_name, _libllava_base_path) ctypes_function = ctypes_function_for_shared_library(_libllava) 
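
For reference, the two hunks above route both the `llama` and `llava` libraries through the `load_shared_library` helper added to `_ctypes_extensions.py` earlier in this patch. Below is a minimal, illustrative sketch (not part of the patch) of how that helper can be driven by the environment-variable override the hunk leaves commented out, instead of the hard-coded `D:/repo/nexa-ai/...` development path:

```python
# Illustrative only. Resolve the llama shared library the way the modified
# llama_cpp.py does, but via an environment-variable override rather than a
# hard-coded local path.
import os
import pathlib

from nexa.gguf.llama._ctypes_extensions import load_shared_library  # helper added in this patch

_lib_base_name = "llama"

# Hypothetical override; llama_cpp.py shows this lookup commented out.
_override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH")
_base_path = (
    pathlib.Path(_override_base_path)
    if _override_base_path is not None
    else pathlib.Path(__file__).parent / "lib"
)

# load_shared_library searches for lib{name}.so / lib{name}.dylib / {name}.dll
# under _base_path and raises FileNotFoundError if nothing matches.
_lib = load_shared_library(_lib_base_name, _base_path)
```

Two caveats: the `llava_cpp.py` hunk's override branch falls back to a bare `pathlib.Path()` rather than the value of `LLAVA_CPP_LIB`, so that variable appears to be effectively ignored; and PATCH 12/20 below removes `load_shared_library` from `_ctypes_extensions.py` again.
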
diff --git a/nexa/gguf/llama/omni_vlm_cpp.py b/nexa/gguf/llama/omni_vlm_cpp.py deleted file mode 100644 index ab7a5699..00000000 --- a/nexa/gguf/llama/omni_vlm_cpp.py +++ /dev/null @@ -1,79 +0,0 @@ -import ctypes -import os -import sys -from pathlib import Path - - -# Load the library -def _load_shared_library(lib_base_name: str, base_path: Path = None): - # Determine the file extension based on the platform - if sys.platform.startswith("linux"): - lib_ext = ".so" - elif sys.platform == "darwin": - lib_ext = ".dylib" - elif sys.platform == "win32": - lib_ext = ".dll" - else: - raise RuntimeError("Unsupported platform") - # Construct the paths to the possible shared library names - if base_path is None: - _base_path = Path(__file__).parent.parent.resolve() - else: - _base_path = base_path - _lib_paths = [ - _base_path / f"lib{lib_base_name}{lib_ext}", - _base_path / f"{lib_base_name}{lib_ext}", - ] - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(_base_path)) - # Try to load the shared library, handling potential errors - for _lib_path in _lib_paths: - if _lib_path.exists(): - try: - return ctypes.CDLL(str(_lib_path)) - except Exception as e: - print(f"Failed to load shared library '{_lib_path}': {e}") - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - -def _get_lib(): - # Specify the base name of the shared library to load - _lib_base_name = "omni_vlm_wrapper_shared" - base_path = ( - Path(__file__).parent.parent.parent.parent.resolve() - / "nexa" - / "gguf" - / "lib" - ) - return _load_shared_library(_lib_base_name, base_path) - -# Initialize both libraries -_lib = _get_lib() - -omni_char_p = ctypes.c_char_p - - -def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p): - return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version) - - -_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p] -_lib.omnivlm_init.restype = None - - -def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p): - return _lib.omnivlm_inference(prompt, image_path) - - -_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p] -_lib.omnivlm_inference.restype = omni_char_p - - -def omnivlm_free(): - return _lib.omnivlm_free() - - -_lib.omnivlm_free.argtypes = [] -_lib.omnivlm_free.restype = None \ No newline at end of file diff --git a/nexa/gguf/nexa_inference_audio_lm.py b/nexa/gguf/nexa_inference_audio_lm.py index fa63cd38..e9036315 100644 --- a/nexa/gguf/nexa_inference_audio_lm.py +++ b/nexa/gguf/nexa_inference_audio_lm.py @@ -15,7 +15,7 @@ ) from nexa.gguf.lib_utils import is_gpu_available from nexa.gguf.llama import audio_lm_cpp -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr from nexa.general import pull_model def is_qwen(model_name): diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 65ef0c22..5f237290 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -15,7 +15,7 @@ NEXA_RUN_T5XXL_MAP, ) from nexa.utils import SpinningCursorAnimation, nexa_prompt -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr from streamlit.web import cli as stcli from nexa.general import pull_model diff --git a/nexa/gguf/nexa_inference_text.py 
b/nexa/gguf/nexa_inference_text.py index c7802ace..c01a71e5 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -14,7 +14,7 @@ from nexa.gguf.lib_utils import is_gpu_available from nexa.general import pull_model from nexa.utils import SpinningCursorAnimation, nexa_prompt -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr logging.basicConfig( diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index fa62b589..eeb1e436 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -25,7 +25,7 @@ NanoLlavaChatHandler, ) from nexa.utils import SpinningCursorAnimation, nexa_prompt -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" diff --git a/nexa/gguf/nexa_inference_vlm_omni.py b/nexa/gguf/nexa_inference_vlm_omni.py index bd5b6b29..2d30f3ae 100644 --- a/nexa/gguf/nexa_inference_vlm_omni.py +++ b/nexa/gguf/nexa_inference_vlm_omni.py @@ -11,7 +11,7 @@ ) from nexa.gguf.lib_utils import is_gpu_available from nexa.gguf.llama import omni_vlm_cpp -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr from nexa.general import pull_model class NexaOmniVlmInference: diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index b3659776..510c6737 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -11,7 +11,7 @@ ) from nexa.general import pull_model from nexa.utils import nexa_prompt, SpinningCursorAnimation -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr logging.basicConfig(level=logging.INFO) diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py index 73772e31..31d4e135 100644 --- a/nexa/gguf/server/nexa_service.py +++ b/nexa/gguf/server/nexa_service.py @@ -35,7 +35,7 @@ Llava16ChatHandler, NanoLlavaChatHandler, ) -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr from nexa.general import pull_model from nexa.gguf.llama.llama import Llama from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/streamlit/streamlit_audio_lm.py b/nexa/gguf/streamlit/streamlit_audio_lm.py index 889a8870..85200388 100644 --- a/nexa/gguf/streamlit/streamlit_audio_lm.py +++ b/nexa/gguf/streamlit/streamlit_audio_lm.py @@ -7,7 +7,7 @@ from st_audiorec import st_audiorec from nexa.general import pull_model -from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr +from nexa.gguf.llama._utils import suppress_stdout_stderr from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference # Initialize session state diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index 4a5109f6..91e5d62d 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -1,5 +1,8 @@ from nexa.gguf import NexaTextInference from nexa.gguf.lib_utils import is_gpu_available +# import pdb; +# pdb.set_trace() + model = NexaTextInference( model_path="gemma", From 24dc443689c2e65ddbb00bde0ed8a4f38b75d2ea Mon Sep 17 00:00:00 2001 From: zhycheng614 Date: Mon, 9 Dec 2024 18:31:33 +0000 Subject: [PATCH 11/20] update 
executable version to 9.6 in readme --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 17887873..1bd83309 100644 --- a/README.md +++ b/README.md @@ -33,13 +33,13 @@ Welcome to submit your requests through [issues](https://github.com/NexaAI/nexa- ## Install Option 1: Executable Installer

- + macOS Installer

- + Windows Installer

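
As background on the modules being removed and restored around these patches: `nexa/gguf/llama/kv_cache.py` (deleted in PATCH 10 above and re-created by the patch that follows) exposes `run_inference_with_disk_cache`, which saves the model state for a shared prompt prefix to a disk cache and reuses it for later generations. A minimal usage sketch, with a placeholder model path and prompts:

```python
# Illustrative only; model path and prompts are placeholders.
from nexa.gguf.llama.llama import Llama
from nexa.gguf.llama.kv_cache import run_inference_with_disk_cache

llm = Llama(model_path="./models/example-q4_0.gguf", n_ctx=4096, verbose=False)

system_prompt = "You are a concise assistant."  # prefix whose evaluated state is cached
full_prompt = system_prompt + "\nUser: What is GGUF?\nAssistant:"

# The first call evaluates the prefix and stores its state under the default
# "llama.cache" directory; later calls with the same prefix reload that state
# instead of re-evaluating the prompt.
stream = run_inference_with_disk_cache(
    model=llm,
    cache_prompt=system_prompt,
    total_prompt=full_prompt,
    use_cache=True,
    max_tokens=128,
    temperature=0.7,
)
for chunk in stream:
    print(chunk["choices"][0]["text"], end="", flush=True)
```
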
From d3dc3a402445239bce953e5dbf0386dfdd891c63 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Tue, 10 Dec 2024 13:34:23 +0800 Subject: [PATCH 12/20] upgrade --- nexa/gguf/llama/__init__.py | 2 - nexa/gguf/llama/_ctypes_extensions.py | 54 -- nexa/gguf/llama/_ggml.py | 11 + nexa/gguf/llama/_internals_transformers.py | 862 +++++++++++++++++++++ nexa/gguf/llama/_logger_transformers.py | 42 + nexa/gguf/llama/_utils_spinner.py | 79 ++ nexa/gguf/llama/_utils_transformers.py | 78 ++ nexa/gguf/llama/audio_lm_cpp.py | 116 +++ nexa/gguf/llama/kv_cache.py | 86 ++ nexa/gguf/llama/llama.py | 6 +- nexa/gguf/llama/llama_cache.py | 20 +- nexa/gguf/llama/llama_chat_format.py | 4 +- nexa/gguf/llama/llama_cpp.py | 8 +- nexa/gguf/llama/llava_cpp.py | 8 +- nexa/gguf/llama/omni_vlm_cpp.py | 79 ++ nexa/gguf/nexa_inference_audio_lm.py | 2 +- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_vlm_omni.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- nexa/gguf/server/nexa_service.py | 2 +- nexa/gguf/streamlit/streamlit_audio_lm.py | 2 +- 23 files changed, 1382 insertions(+), 89 deletions(-) create mode 100644 nexa/gguf/llama/_ggml.py create mode 100644 nexa/gguf/llama/_internals_transformers.py create mode 100644 nexa/gguf/llama/_logger_transformers.py create mode 100644 nexa/gguf/llama/_utils_spinner.py create mode 100644 nexa/gguf/llama/_utils_transformers.py create mode 100644 nexa/gguf/llama/audio_lm_cpp.py create mode 100644 nexa/gguf/llama/kv_cache.py create mode 100644 nexa/gguf/llama/omni_vlm_cpp.py diff --git a/nexa/gguf/llama/__init__.py b/nexa/gguf/llama/__init__.py index 6f1a2122..b3dcd6ed 100644 --- a/nexa/gguf/llama/__init__.py +++ b/nexa/gguf/llama/__init__.py @@ -1,4 +1,2 @@ from nexa.gguf.llama.llama_cpp import * from nexa.gguf.llama.llama import * - -# __version__ = "0.3.2" diff --git a/nexa/gguf/llama/_ctypes_extensions.py b/nexa/gguf/llama/_ctypes_extensions.py index e88ed387..c27f5c04 100644 --- a/nexa/gguf/llama/_ctypes_extensions.py +++ b/nexa/gguf/llama/_ctypes_extensions.py @@ -19,60 +19,6 @@ from typing_extensions import TypeAlias -# Load the library -def load_shared_library(lib_base_name: str, base_path: pathlib.Path): - """Platform independent shared library loader""" - # Searching for the library in the current directory under the name "libllama" (default name - # for llamacpp) and "llama" (default name for this repo) - lib_paths: List[pathlib.Path] = [] - # Determine the file extension based on the platform - if sys.platform.startswith("linux") or sys.platform.startswith("freebsd"): - lib_paths += [ - base_path / f"lib{lib_base_name}.so", - ] - elif sys.platform == "darwin": - lib_paths += [ - base_path / f"lib{lib_base_name}.so", - base_path / f"lib{lib_base_name}.dylib", - ] - elif sys.platform == "win32": - lib_paths += [ - base_path / f"{lib_base_name}.dll", - base_path / f"lib{lib_base_name}.dll", - ] - else: - raise RuntimeError("Unsupported platform") - - cdll_args = dict() # type: ignore - - # Add the library directory to the DLL search path on Windows (if needed) - if sys.platform == "win32": - os.add_dll_directory(str(base_path)) - os.environ["PATH"] = str(base_path) + os.pathsep + os.environ["PATH"] - - if sys.platform == "win32" and sys.version_info >= (3, 8): - os.add_dll_directory(str(base_path)) - if "CUDA_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib")) 
- if "HIP_PATH" in os.environ: - os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin")) - os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib")) - cdll_args["winmode"] = ctypes.RTLD_GLOBAL - - # Try to load the shared library, handling potential errors - for lib_path in lib_paths: - if lib_path.exists(): - try: - return ctypes.CDLL(str(lib_path), **cdll_args) # type: ignore - except Exception as e: - raise RuntimeError(f"Failed to load shared library '{lib_path}': {e}") - - raise FileNotFoundError( - f"Shared library with base name '{lib_base_name}' not found" - ) - - # ctypes sane type hint helpers # # - Generic Pointer and Array types diff --git a/nexa/gguf/llama/_ggml.py b/nexa/gguf/llama/_ggml.py new file mode 100644 index 00000000..5b175d4c --- /dev/null +++ b/nexa/gguf/llama/_ggml.py @@ -0,0 +1,11 @@ +"""Internal module use at your own risk + +This module provides a minimal interface for working with ggml tensors from llama-cpp-python +""" +import os +import pathlib + +from nexa.gguf.lib_utils import load_library + +libggml = load_library("ggml") + diff --git a/nexa/gguf/llama/_internals_transformers.py b/nexa/gguf/llama/_internals_transformers.py new file mode 100644 index 00000000..4de2f41b --- /dev/null +++ b/nexa/gguf/llama/_internals_transformers.py @@ -0,0 +1,862 @@ +from __future__ import annotations + +import os +import ctypes + +from typing import ( + Dict, + List, + Tuple, + Optional, + Sequence, +) +from dataclasses import dataclass, field +from contextlib import ExitStack + +import numpy as np +import numpy.typing as npt + +from nexa.gguf.llama.llama_types import * +from nexa.gguf.llama.llama_grammar import LlamaGrammar +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr + +from nexa.gguf.llama.llama_cpp import * + +# Python wrappers over llama.h structs + + +class LlamaModel: + """Intermediate Python wrapper for a llama.cpp llama_model. 
+ NOTE: For stability it's recommended you use the Llama class instead.""" + + def __init__( + self, + *, + path_model: str, + params: llama_model_params, + verbose: bool = True, + ): + self.path_model = path_model + self.params = params + self.verbose = verbose + self._exit_stack = ExitStack() + + model = None + + if not os.path.exists(path_model): + raise ValueError(f"Model path does not exist: {path_model}") + + with suppress_stdout_stderr(disable=verbose): + model = llama_load_model_from_file( + self.path_model.encode("utf-8"), self.params + ) + + if model is None: + raise ValueError(f"Failed to load model from file: {path_model}") + + self.model = model + + def free_model(): + if self.model is None: + return + llama_free_model(self.model) + self.model = None + + self._exit_stack.callback(free_model) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() + + def vocab_type(self) -> int: + return llama_vocab_type(self.model) + + def n_vocab(self) -> int: + return llama_n_vocab(self.model) + + def n_ctx_train(self) -> int: + return llama_n_ctx_train(self.model) + + def n_embd(self) -> int: + return llama_n_embd(self.model) + + def rope_freq_scale_train(self) -> float: + return llama_rope_freq_scale_train(self.model) + + def desc(self) -> str: + buf = ctypes.create_string_buffer(1024) + llama_model_desc(self.model, buf, 1024) + return buf.value.decode("utf-8") + + def size(self) -> int: + return llama_model_size(self.model) + + def n_params(self) -> int: + return llama_model_n_params(self.model) + + def get_tensor(self, name: str) -> ctypes.c_void_p: + return llama_get_model_tensor(self.model, name.encode("utf-8")) + + # Vocab + + def token_get_text(self, token: int) -> str: + return llama_token_get_text(self.model, token).decode("utf-8") + + def token_get_score(self, token: int) -> float: + return llama_token_get_score(self.model, token) + + def token_get_attr(self, token: int) -> int: + return llama_token_get_attr(self.model, token) + + # Special tokens + + def token_bos(self) -> int: + return llama_token_bos(self.model) + + def token_eos(self) -> int: + return llama_token_eos(self.model) + + def token_cls(self) -> int: + return llama_token_cls(self.model) + + def token_sep(self) -> int: + return llama_token_sep(self.model) + + def token_nl(self) -> int: + return llama_token_nl(self.model) + + def token_prefix(self) -> int: + return llama_token_prefix(self.model) + + def token_middle(self) -> int: + return llama_token_middle(self.model) + + def token_suffix(self) -> int: + return llama_token_suffix(self.model) + + def token_eot(self) -> int: + return llama_token_eot(self.model) + + def add_bos_token(self) -> bool: + return llama_add_bos_token(self.model) + + def add_eos_token(self) -> bool: + return llama_add_eos_token(self.model) + + # Tokenization + + def tokenize(self, text: bytes, add_bos: bool, special: bool): + n_ctx = self.n_ctx_train() + tokens = (llama_token * n_ctx)() + n_tokens = llama_tokenize( + self.model, text, len(text), tokens, n_ctx, add_bos, special + ) + if n_tokens < 0: + n_tokens = abs(n_tokens) + tokens = (llama_token * n_tokens)() + n_tokens = llama_tokenize( + self.model, text, len(text), tokens, n_tokens, add_bos, special + ) + if n_tokens < 0: + raise RuntimeError( + f'Failed to tokenize: text="{text}" n_tokens={n_tokens}' + ) + return list(tokens[:n_tokens]) + + def token_to_piece(self, token: int, special: bool = False) -> bytes: + buf = ctypes.create_string_buffer(32) + llama_token_to_piece(self.model, token, buf, 32, 0, 
special) + return bytes(buf) + + def detokenize(self, tokens: List[int], special: bool = False) -> bytes: + output = b"" + size = 32 + buffer = (ctypes.c_char * size)() + for token in tokens: + n = llama_token_to_piece( + self.model, llama_token(token), buffer, size, 0, special + ) + assert n <= size + output += bytes(buffer[:n]) + # NOTE: Llama1 models automatically added a space at the start of the prompt + # this line removes a leading space if the first token is a beginning of sentence token + return ( + output[1:] + if len(tokens) > 0 and tokens[0] == self.token_bos() and output[0:1] == b" " + else output + ) + + # Extra + def metadata(self) -> Dict[str, str]: + metadata: Dict[str, str] = {} + buffer_size = 1024 + buffer = ctypes.create_string_buffer(buffer_size) + # zero the buffer + buffer.value = b"\0" * buffer_size + # iterate over model keys + for i in range(llama_model_meta_count(self.model)): + nbytes = llama_model_meta_key_by_index( + self.model, i, buffer, buffer_size + ) + if nbytes > buffer_size: + buffer_size = nbytes + 1 + buffer = ctypes.create_string_buffer(buffer_size) + nbytes = llama_model_meta_key_by_index( + self.model, i, buffer, buffer_size + ) + key = buffer.value.decode("utf-8") + nbytes = llama_model_meta_val_str_by_index( + self.model, i, buffer, buffer_size + ) + if nbytes > buffer_size: + buffer_size = nbytes + 1 + buffer = ctypes.create_string_buffer(buffer_size) + nbytes = llama_model_meta_val_str_by_index( + self.model, i, buffer, buffer_size + ) + value = buffer.value.decode("utf-8") + metadata[key] = value + return metadata + + @staticmethod + def default_params(): + """Get the default llama_model_params.""" + return llama_model_default_params() + + +class LlamaContext: + """Intermediate Python wrapper for a llama.cpp llama_context. 
+ NOTE: For stability it's recommended you use the Llama class instead.""" + + def __init__( + self, + *, + model: LlamaModel, + params: llama_context_params, + verbose: bool = True, + ): + self.model = model + self.params = params + self.verbose = verbose + self._exit_stack = ExitStack() + + ctx = llama_new_context_with_model(self.model.model, self.params) + + if ctx is None: + raise ValueError("Failed to create llama_context") + + self.ctx = ctx + + def free_ctx(): + if self.ctx is None: + return + llama_free(self.ctx) + self.ctx = None + + self._exit_stack.callback(free_ctx) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() + + def n_ctx(self) -> int: + return llama_n_ctx(self.ctx) + + def pooling_type(self) -> int: + return llama_pooling_type(self.ctx) + + def kv_cache_clear(self): + llama_kv_cache_clear(self.ctx) + + def kv_cache_seq_rm(self, seq_id: int, p0: int, p1: int): + llama_kv_cache_seq_rm(self.ctx, seq_id, p0, p1) + + def kv_cache_seq_cp(self, seq_id_src: int, seq_id_dst: int, p0: int, p1: int): + llama_kv_cache_seq_cp(self.ctx, seq_id_src, seq_id_dst, p0, p1) + + def kv_cache_seq_keep(self, seq_id: int): + llama_kv_cache_seq_keep(self.ctx, seq_id) + + def kv_cache_seq_shift(self, seq_id: int, p0: int, p1: int, shift: int): + llama_kv_cache_seq_add(self.ctx, seq_id, p0, p1, shift) + + def get_state_size(self) -> int: + return llama_get_state_size(self.ctx) + + # TODO: copy_state_data + + # TODO: set_state_data + + # TODO: llama_load_session_file + + # TODO: llama_save_session_file + + def decode(self, batch: LlamaBatch): + return_code = llama_decode( + self.ctx, + batch.batch, + ) + if return_code != 0: + raise RuntimeError(f"llama_decode returned {return_code}") + + def set_n_threads(self, n_threads: int, n_threads_batch: int): + llama_set_n_threads(self.ctx, n_threads, n_threads_batch) + + def get_logits(self): + return llama_get_logits(self.ctx) + + def get_logits_ith(self, i: int): + return llama_get_logits_ith(self.ctx, i) + + def get_embeddings(self): + return llama_get_embeddings(self.ctx) + + # Sampling functions + + def set_rng_seed(self, seed: int): + # TODO: Fix + llama_set_rng_seed(self.ctx, seed) + + def sample_repetition_penalties( + self, + candidates: "_LlamaTokenDataArray", + last_tokens_data: "Array[llama_token]", + penalty_last_n: int, + penalty_repeat: float, + penalty_freq: float, + penalty_present: float, + ): + llama_sample_repetition_penalties( + self.ctx, + byref(candidates.candidates), + last_tokens_data, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + ) + + def sample_softmax(self, candidates: "_LlamaTokenDataArray"): + llama_sample_softmax( + self.ctx, + byref(candidates.candidates), + ) + + def sample_top_k(self, candidates: "_LlamaTokenDataArray", k: int, min_keep: int): + llama_sample_top_k( + self.ctx, byref(candidates.candidates), k, min_keep + ) + + def sample_top_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + llama_sample_top_p( + self.ctx, byref(candidates.candidates), p, min_keep + ) + + def sample_min_p(self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int): + llama_sample_min_p( + self.ctx, byref(candidates.candidates), p, min_keep + ) + + def sample_typical( + self, candidates: "_LlamaTokenDataArray", p: float, min_keep: int + ): + llama_sample_typical( + self.ctx, byref(candidates.candidates), p, min_keep + ) + + def sample_temp(self, candidates: "_LlamaTokenDataArray", temp: float): + llama_sample_temp( + self.ctx, 
byref(candidates.candidates), temp + ) + + def sample_grammar(self, candidates: "_LlamaTokenDataArray", grammar: LlamaGrammar): + llama_sample_grammar( + self.ctx, + byref(candidates.candidates), + grammar.grammar, + ) + + def sample_token_mirostat( + self, + candidates: "_LlamaTokenDataArray", + tau: float, + eta: float, + m: int, + mu: CtypesPointerOrRef[ctypes.c_float], + ) -> int: + return llama_sample_token_mirostat( + self.ctx, + byref(candidates.candidates), + tau, + eta, + m, + mu, + ) + + def sample_token_mirostat_v2( + self, + candidates: "_LlamaTokenDataArray", + tau: float, + eta: float, + mu: CtypesPointerOrRef[ctypes.c_float], + ) -> int: + return llama_sample_token_mirostat_v2( + self.ctx, + byref(candidates.candidates), + tau, + eta, + mu, + ) + + def sample_token_greedy(self, candidates: "_LlamaTokenDataArray") -> int: + return llama_sample_token_greedy( + self.ctx, + byref(candidates.candidates), + ) + + def sample_token(self, candidates: "_LlamaTokenDataArray") -> int: + return llama_sample_token( + self.ctx, + byref(candidates.candidates), + ) + + # Grammar + def grammar_accept_token(self, grammar: LlamaGrammar, token: int): + llama_grammar_accept_token(grammar.grammar, self.ctx, token) + + def reset_timings(self): + llama_perf_context_reset(self.ctx) + + def print_timings(self): + llama_perf_context_print(self.ctx) + + # Utility functions + @staticmethod + def default_params(): + """Get the default llama_context_params.""" + return llama_context_default_params() + + +class LlamaBatch: + def __init__( + self, *, n_tokens: int, embd: int, n_seq_max: int, verbose: bool = True + ): + self._n_tokens = n_tokens + self.embd = embd + self.n_seq_max = n_seq_max + self.verbose = verbose + self._exit_stack = ExitStack() + + batch = llama_batch_init(self._n_tokens, self.embd, self.n_seq_max) + + if batch is None: + raise ValueError("Failed to create llama_batch") + + self.batch = batch + + def free_batch(): + if self.batch is None: + return + llama_batch_free(self.batch) + self.batch = None + + self._exit_stack.callback(free_batch) + + def close(self): + self._exit_stack.close() + + def __del__(self): + self.close() + + def n_tokens(self) -> int: + return self.batch.n_tokens + + def reset(self): + self.batch.n_tokens = 0 + + def set_batch(self, batch: Sequence[int], n_past: int, logits_all: bool): + n_tokens = len(batch) + self.batch.n_tokens = n_tokens + for i in range(n_tokens): + self.batch.token[i] = batch[i] + self.batch.pos[i] = n_past + i + self.batch.seq_id[i][0] = 0 + self.batch.n_seq_id[i] = 1 + self.batch.logits[i] = logits_all + self.batch.logits[n_tokens - 1] = True + + def add_sequence(self, batch: Sequence[int], seq_id: int, logits_all: bool): + n_tokens = len(batch) + n_tokens0 = self.batch.n_tokens + self.batch.n_tokens += n_tokens + for i in range(n_tokens): + j = n_tokens0 + i + self.batch.token[j] = batch[i] + self.batch.pos[j] = i + self.batch.seq_id[j][0] = seq_id + self.batch.n_seq_id[j] = 1 + self.batch.logits[j] = logits_all + self.batch.logits[n_tokens - 1] = True + + +class LlamaTokenDataArray: + def __init__(self, *, n_vocab: int): + self.n_vocab = n_vocab + self.candidates_data = np.recarray( + (self.n_vocab,), + dtype=np.dtype( + [("id", np.intc), ("logit", np.single), ("p", np.single)], align=True + ), + ) + self.candidates = llama_token_data_array( + data=self.candidates_data.ctypes.data_as(llama_token_data_p), + size=self.n_vocab, + sorted=False, + ) + self.default_candidates_data_id = np.arange(self.n_vocab, dtype=np.intc) # type: ignore + 
self.default_candidates_data_p = np.zeros(self.n_vocab, dtype=np.single) + + def copy_logits(self, logits: npt.NDArray[np.single]): + self.candidates_data.id[:] = self.default_candidates_data_id + self.candidates_data.logit[:] = logits + self.candidates_data.p[:] = self.default_candidates_data_p + self.candidates.sorted = False + self.candidates.size = self.n_vocab + + +# Embedding functions + + +def normalize_embedding(embedding): + norm = float(np.linalg.norm(embedding)) + if norm == 0.0: + return embedding + return [v / norm for v in embedding] + + +# Python wrappers over common/sampling structs + + +@dataclass +class LlamaSamplingParams: + n_prev: int = 64 + n_probs: int = 0 + top_k: int = 40 + top_p: float = 0.95 + min_p: float = 0.05 + tfs_z: float = 1.00 + typical_p: float = 1.00 + temp: float = 0.80 + penalty_last_n: int = 64 + penalty_repeat: float = 1.0 + penalty_freq: float = 0.00 + penalty_present: float = 0.00 + mirostat: int = 0 + mirostat_tau: float = 5.00 + mirostat_eta: float = 0.10 + penalize_nl: bool = True + + grammar: str = "" + + cfg_negative_prompt: str = "" + cfg_scale: float = 1.00 + + logit_bias: dict[int, float] = field(default_factory=dict) + + +@dataclass +class LlamaSamplingContext: + params: LlamaSamplingParams = field(default_factory=LlamaSamplingParams) + mirostat_mu: ctypes.c_float = field(default_factory=ctypes.c_float) + grammar: Optional[LlamaGrammar] = None + # NOTE: Missing parsed_grammar + prev: list[int] = field(default_factory=list) + cur: list[llama_token_data] = field(default_factory=list) + + def reset(self): + self.prev = [] + self.cur = [] + if self.grammar is not None: + self.grammar.reset() + + def cp(self): + return LlamaSamplingContext( + params=self.params, + mirostat_mu=self.mirostat_mu, + grammar=self.grammar, + prev=self.prev.copy(), + cur=self.cur.copy(), + ) + + def last(self) -> Optional[int]: + if len(self.prev) > 0: + return self.prev[-1] + else: + return None + + def prev_str(self, ctx_main: LlamaContext, n: int) -> str: + return ctx_main.model.detokenize(self.prev[-n:]).decode("utf-8") + + def sample( + self, + ctx_main: LlamaContext, + idx: int = 0, + logits_array: Optional[npt.NDArray[np.single]] = None, + ): + n_vocab = ctx_main.model.n_vocab() + id: int = 0 + + if logits_array is None: + logits = ctx_main.get_logits_ith(idx) + logits_array = np.array( + ctypes.cast(logits, ctypes.POINTER(ctypes.c_float * n_vocab)).contents, + dtype=np.single, + ) + + # apply logit_bias + for token, logit_bias in self.params.logit_bias.items(): + logits_array[token] += logit_bias + + token_data_array = LlamaTokenDataArray( + n_vocab=n_vocab + ) # TODO: Only create this once + token_data_array.copy_logits(logits_array) + + # apply penalties + if len(self.prev) > 0: + nl_token = ctx_main.model.token_nl() + nl_logit = logits_array[nl_token] + last_tokens = self.prev[-self.params.penalty_last_n :] + last_tokens_size = min(len(last_tokens), self.params.penalty_last_n) + if last_tokens_size > 0: + last_tokens_p = (llama_token * len(last_tokens))(*last_tokens) + ctx_main.sample_repetition_penalties( + token_data_array, + last_tokens_p, + last_tokens_size, + self.params.penalty_repeat, + self.params.penalty_freq, + self.params.penalty_present, + ) + if not self.params.penalize_nl: + token_data_array.candidates_data.logit[nl_token] = nl_logit + + if self.grammar is not None: + ctx_main.sample_grammar(token_data_array, self.grammar) + + if self.params.temp < 0: + ctx_main.sample_softmax(token_data_array) + id = token_data_array.candidates_data.id[0] + 
elif self.params.temp == 0: + id = ctx_main.sample_token_greedy(token_data_array) + else: + if self.params.mirostat == 1: + mirostat_m = 100 + ctx_main.sample_temp(token_data_array, self.params.temp) + id = ctx_main.sample_token_mirostat( + token_data_array, + self.params.mirostat_tau, + self.params.mirostat_eta, + mirostat_m, + ctypes.pointer(self.mirostat_mu), + ) + elif self.params.mirostat == 2: + ctx_main.sample_temp(token_data_array, self.params.temp) + id = ctx_main.sample_token_mirostat_v2( + token_data_array, + self.params.mirostat_tau, + self.params.mirostat_eta, + ctypes.pointer(self.mirostat_mu), + ) + else: + min_keep = max(1, self.params.n_probs) + ctx_main.sample_top_k( + token_data_array, self.params.top_k, min_keep=min_keep + ) + ctx_main.sample_typical( + token_data_array, self.params.typical_p, min_keep=min_keep + ) + ctx_main.sample_top_p( + token_data_array, self.params.top_p, min_keep=min_keep + ) + ctx_main.sample_min_p( + token_data_array, self.params.min_p, min_keep=min_keep + ) + ctx_main.sample_temp(token_data_array, self.params.temp) + id = ctx_main.sample_token(token_data_array) + return id + + def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): + if apply_grammar and self.grammar is not None: + ctx_main.grammar_accept_token(self.grammar, id) + self.prev.append(id) + + +from typing import List, Callable, Optional, Union +import ctypes +import llama_cpp + + +class CustomSampler: + def __init__( + self, apply_func: typing.Callable[[llama_token_data_array], None] + ): + self.apply_func = apply_func + + def apply_wrapper( + sampler: llama_sampler_p, + cur_p: llama_token_data_array_p, + ): + self.apply_func(cur_p) + + def free_wrapper(sampler: llama_sampler_p): + pass + + sampler_i = llama_sampler_i() + sampler_i.apply = llama_sampler_i_apply(apply_wrapper) + self._apply_wrapper_ref = apply_wrapper + + sampler_i.name = llama_sampler_i_name(0) + sampler_i.accept = llama_sampler_i_accept(0) + sampler_i.reset = llama_sampler_i_reset(0) + sampler_i.clone = llama_sampler_i_clone(0) + sampler_i.free = llama_sampler_i_free(0) + + self.sampler = llama_sampler() + self.sampler.iface = ctypes.pointer(sampler_i) + self.sampler.ctx = None + + def get_sampler(self) -> llama_sampler_p: + return ctypes.pointer(self.sampler) + + +class LlamaSampler: + def __init__(self): + params = llama_sampler_chain_params() + self.sampler = llama_sampler_chain_init(params) + self.samplers: List[llama_sampler_p] = [] + self.custom_samplers: List[Tuple[int, CustomSampler]] = [] + + def add_greedy(self): + sampler = llama_sampler_init_greedy() + self._add_sampler(sampler) + + def add_dist(self, seed: int): + sampler = llama_sampler_init_dist(seed) + self._add_sampler(sampler) + + def add_softmax(self): + sampler = llama_sampler_init_softmax() + self._add_sampler(sampler) + + def add_top_k(self, k: int): + sampler = llama_sampler_init_top_k(k) + self._add_sampler(sampler) + + def add_top_p(self, p: float, min_keep: int): + sampler = llama_sampler_init_top_p(p, min_keep) + self._add_sampler(sampler) + + def add_min_p(self, p: float, min_keep: int): + sampler = llama_sampler_init_min_p(p, min_keep) + self._add_sampler(sampler) + + def add_typical(self, p: float, min_keep: int): + sampler = llama_sampler_init_typical(p, min_keep) + self._add_sampler(sampler) + + def add_temp(self, temp: float): + sampler = llama_sampler_init_temp(temp) + self._add_sampler(sampler) + + def add_temp_ext(self, t: float, delta: float, exponent: float): + sampler = llama_sampler_init_temp_ext(t, delta, 
exponent) + self._add_sampler(sampler) + + def add_mirostat(self, n_vocab: int, seed: int, tau: float, eta: float, m: int): + sampler = llama_sampler_init_mirostat(n_vocab, seed, tau, eta, m) + self._add_sampler(sampler) + + def add_mirostat_v2(self, seed: int, tau: float, eta: float): + sampler = llama_sampler_init_mirostat_v2(seed, tau, eta) + self._add_sampler(sampler) + + def add_grammar(self, model: LlamaModel, grammar: LlamaGrammar): + sampler = llama_sampler_init_grammar( + model.model, grammar._grammar.encode("utf-8"), grammar._root.encode("utf-8") + ) + self._add_sampler(sampler) + + def add_penalties( + self, + n_vocab: int, + special_eos_id: int, + linefeed_id: int, + penalty_last_n: int, + penalty_repeat: float, + penalty_freq: float, + penalty_present: float, + penalize_nl: bool, + ignore_eos: bool, + ): + sampler = llama_sampler_init_penalties( + n_vocab, + special_eos_id, + linefeed_id, + penalty_last_n, + penalty_repeat, + penalty_freq, + penalty_present, + penalize_nl, + ignore_eos, + ) + self._add_sampler(sampler) + + def init_logit_bias( + self, n_vocab: int, n_logit_bias, logit_bias: llama_logit_bias_p + ): + sampler = llama_sampler_init_logit_bias( + n_vocab, n_logit_bias, logit_bias + ) + self._add_sampler(sampler) + + def add_custom( + self, apply_func: Callable[[llama_token_data_array], None] + ): + custom_sampler = CustomSampler(apply_func) + sampler = custom_sampler.get_sampler() + self._add_sampler(sampler) + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + self.custom_samplers.append( + (llama_sampler_chain_n(self.sampler) - 1, custom_sampler) + ) + + def _add_sampler(self, sampler: llama_sampler_p): + assert self.sampler is not None + llama_sampler_chain_add(self.sampler, sampler) + self.samplers.append(sampler) + + def get_seed(self) -> int: + assert self.sampler is not None + return llama_sampler_get_seed(self.sampler) + + def sample(self, ctx: LlamaContext, idx: int) -> int: + assert self.sampler is not None + return llama_sampler_sample(self.sampler, ctx.ctx, idx) + + def close(self): + if self.sampler: + # NOTE: Must remove custom samplers before free or llama.cpp will try to free them + for i, _ in reversed(self.custom_samplers): + llama_sampler_chain_remove(self.sampler, i) + llama_sampler_free(self.sampler) + self.sampler = None + self.samplers.clear() + self.custom_samplers.clear() + + def __del__(self): + self.close() diff --git a/nexa/gguf/llama/_logger_transformers.py b/nexa/gguf/llama/_logger_transformers.py new file mode 100644 index 00000000..2fb0b209 --- /dev/null +++ b/nexa/gguf/llama/_logger_transformers.py @@ -0,0 +1,42 @@ +import sys +import ctypes +import logging + +import nexa.gguf.llama as llama_cpp + +# enum ggml_log_level { +# GGML_LOG_LEVEL_NONE = 0, +# GGML_LOG_LEVEL_INFO = 1, +# GGML_LOG_LEVEL_WARN = 2, +# GGML_LOG_LEVEL_ERROR = 3, +# GGML_LOG_LEVEL_DEBUG = 4, +# GGML_LOG_LEVEL_CONT = 5, // continue previous log +# }; +GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { + 0: logging.CRITICAL, + 1: logging.INFO, + 2: logging.WARNING, + 3: logging.ERROR, + 4: logging.DEBUG, + 5: logging.DEBUG, +} +# Mapping ggml log levels to Python logging levels +GGML_LOG_LEVEL_TO_LOGGING_LEVEL = { + 2: logging.ERROR, + 3: logging.WARNING, + 4: logging.INFO, + 5: logging.DEBUG, +} + +# Initialize the logger for llama-cpp-python +logger = logging.getLogger("nexa-transformers") + +# Utility function to set verbosity +def set_verbose(verbose: bool): + logger.setLevel(logging.DEBUG if verbose else logging.ERROR) + +# Example usage +if 
__name__ == "__main__": + # Set the verbosity based on a condition or user input + set_verbose(False) + # Rest of your application code here diff --git a/nexa/gguf/llama/_utils_spinner.py b/nexa/gguf/llama/_utils_spinner.py new file mode 100644 index 00000000..f89ba2e1 --- /dev/null +++ b/nexa/gguf/llama/_utils_spinner.py @@ -0,0 +1,79 @@ +# For similar spinner animation implementation, refer to: nexa/utils.py + +import sys +import threading +import time +import os +import itertools +from contextlib import contextmanager + +def get_spinner_style(style="default"): + spinners = { + "default": ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"] + } + return spinners.get(style, spinners["default"]) + +def _get_output_stream(): + """Get the appropriate output stream based on platform.""" + if sys.platform == "win32": + return open('CONOUT$', 'wb') + else: + try: + return os.open('/dev/tty', os.O_WRONLY) + except (FileNotFoundError, OSError): + return os.open('/dev/stdout', os.O_WRONLY) + +def show_spinner(stop_event, style="default", message=""): + spinner = itertools.cycle(get_spinner_style(style)) + fd = _get_output_stream() + is_windows = sys.platform == "win32" + + try: + while not stop_event.is_set(): + display = f"\r{message} {next(spinner)}" if message else f"\r{next(spinner)} " + + if is_windows: + fd.write(display.encode()) + fd.flush() + else: + os.write(fd, display.encode()) + time.sleep(0.1) + + # Clear the spinner + clear_msg = b"\r" + b" " * (len(message) + 2) + b"\r" + if is_windows: + fd.write(clear_msg) + fd.flush() + else: + os.write(fd, clear_msg) + + finally: + if is_windows: + fd.close() + else: + os.close(fd) + +def start_spinner(style="default", message=""): + stop_event = threading.Event() + spinner_thread = threading.Thread( + target=show_spinner, + args=(stop_event, style, message), + daemon=True + ) + spinner_thread.start() + return stop_event, spinner_thread + +def stop_spinner(stop_event, spinner_thread): + if stop_event and not stop_event.is_set(): + stop_event.set() + if spinner_thread and spinner_thread.is_alive(): + spinner_thread.join() + +@contextmanager +def spinning_cursor(message="", style="default"): + """Context manager for spinner animation.""" + stop_event, thread = start_spinner(style, message) + try: + yield + finally: + stop_spinner(stop_event, thread) \ No newline at end of file diff --git a/nexa/gguf/llama/_utils_transformers.py b/nexa/gguf/llama/_utils_transformers.py new file mode 100644 index 00000000..29628193 --- /dev/null +++ b/nexa/gguf/llama/_utils_transformers.py @@ -0,0 +1,78 @@ +import os +import sys + +from typing import Any, Dict + +# Avoid "LookupError: unknown encoding: ascii" when open() called in a destructor +outnull_file = open(os.devnull, "w") +errnull_file = open(os.devnull, "w") + +STDOUT_FILENO = 1 +STDERR_FILENO = 2 + + +class suppress_stdout_stderr(object): + # NOTE: these must be "saved" here to avoid exceptions when using + # this context manager inside of a __del__ method + sys = sys + os = os + + def __init__(self, disable: bool = True): + self.disable = disable + + # Oddly enough this works better than the contextlib version + def __enter__(self): + if self.disable: + return self + + self.old_stdout_fileno_undup = STDOUT_FILENO + self.old_stderr_fileno_undup = STDERR_FILENO + + self.old_stdout_fileno = self.os.dup(self.old_stdout_fileno_undup) + self.old_stderr_fileno = self.os.dup(self.old_stderr_fileno_undup) + + self.old_stdout = self.sys.stdout + self.old_stderr = self.sys.stderr + + 
self.os.dup2(outnull_file.fileno(), self.old_stdout_fileno_undup) + self.os.dup2(errnull_file.fileno(), self.old_stderr_fileno_undup) + + self.sys.stdout = outnull_file + self.sys.stderr = errnull_file + return self + + def __exit__(self, *_): + if self.disable: + return + + # Check if sys.stdout and sys.stderr have fileno method + self.sys.stdout = self.old_stdout + self.sys.stderr = self.old_stderr + + self.os.dup2(self.old_stdout_fileno, self.old_stdout_fileno_undup) + self.os.dup2(self.old_stderr_fileno, self.old_stderr_fileno_undup) + + self.os.close(self.old_stdout_fileno) + self.os.close(self.old_stderr_fileno) + + +class MetaSingleton(type): + """ + Metaclass for implementing the Singleton pattern. + """ + + _instances: Dict[type, Any] = {} + + def __call__(cls, *args: Any, **kwargs: Any) -> Any: + if cls not in cls._instances: + cls._instances[cls] = super(MetaSingleton, cls).__call__(*args, **kwargs) + return cls._instances[cls] + + +class Singleton(object, metaclass=MetaSingleton): + """ + Base class for implementing the Singleton pattern. + """ + + def __init__(self): + super(Singleton, self).__init__() diff --git a/nexa/gguf/llama/audio_lm_cpp.py b/nexa/gguf/llama/audio_lm_cpp.py new file mode 100644 index 00000000..88db2a33 --- /dev/null +++ b/nexa/gguf/llama/audio_lm_cpp.py @@ -0,0 +1,116 @@ +import ctypes +import os +import sys +from pathlib import Path + + +# Load the library +def _load_shared_library(lib_base_name: str, base_path: Path = None): + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + lib_ext = ".so" + elif sys.platform == "darwin": + lib_ext = ".dylib" + elif sys.platform == "win32": + lib_ext = ".dll" + else: + raise RuntimeError("Unsupported platform") + # Construct the paths to the possible shared library names + if base_path is None: + _base_path = Path(__file__).parent.parent.resolve() + else: + _base_path = base_path + _lib_paths = [ + _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path / f"{lib_base_name}{lib_ext}", + ] + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path)) + except Exception as e: + print(f"Failed to load shared library '{_lib_path}': {e}") + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + +def _get_lib(is_qwen: bool = True): + # Specify the base name of the shared library to load + _lib_base_name = "nexa-qwen2-audio-lib_shared" if is_qwen else "nexa-omni-audio-lib_shared" + base_path = ( + Path(__file__).parent.parent.parent.parent.resolve() + / "nexa" + / "gguf" + / "lib" + ) + return _load_shared_library(_lib_base_name, base_path) + +# Initialize both libraries +_lib_omni = _get_lib(is_qwen=False) +_lib_qwen = _get_lib(is_qwen=True) + +# conda config --add channels conda-forge +# conda update libstdcxx-ng +# struct omni_context_params +# { +# char *model; +# char *mmproj; +# char *file; +# char *prompt; +# int32_t n_gpu_layers; +# }; +class omni_context_params(ctypes.Structure): + _fields_ = [ + ("model", ctypes.c_char_p), + ("mmproj", ctypes.c_char_p), + ("file", ctypes.c_char_p), + ("prompt", ctypes.c_char_p), + ("n_gpu_layers", ctypes.c_int32), + ] + +omni_context_params_p = ctypes.POINTER(omni_context_params) +omni_context_p = ctypes.c_void_p + +# 
OMNI_AUDIO_API omni_context_params omni_context_default_params(); +def context_default_params(is_qwen: bool = True) -> omni_context_params: + _lib = _lib_qwen if is_qwen else _lib_omni + return _lib.omni_context_default_params() + +# OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params ¶ms); +def init_context(params: omni_context_params_p, is_qwen: bool = True) -> omni_context_p: # type: ignore + _lib = _lib_qwen if is_qwen else _lib_omni + return _lib.omni_init_context(params) + +# OMNI_AUDIO_API void omni_process_full( +# struct omni_context *ctx_omni, +# omni_context_params ¶ms +# ); +def process_full(ctx: omni_context_p, params: omni_context_params_p, is_qwen: bool = True): # type: ignore + _lib = _lib_qwen if is_qwen else _lib_omni + return _lib.omni_process_full(ctx, params) + +# OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni); +def free(ctx: omni_context_p, is_qwen: bool = True): + _lib = _lib_qwen if is_qwen else _lib_omni + return _lib.omni_free(ctx) + +for lib in [_lib_omni, _lib_qwen]: + # Configure context_default_params + lib.omni_context_default_params.argtypes = [] + lib.omni_context_default_params.restype = omni_context_params + + # Configure init_context + lib.omni_init_context.argtypes = [omni_context_params_p] + lib.omni_init_context.restype = omni_context_p + + # Configure process_full + lib.omni_process_full.argtypes = [omni_context_p, omni_context_params_p] + lib.omni_process_full.restype = ctypes.c_char_p + + # Configure free + lib.omni_free.argtypes = [omni_context_p] + lib.omni_free.restype = None diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py new file mode 100644 index 00000000..aa01630d --- /dev/null +++ b/nexa/gguf/llama/kv_cache.py @@ -0,0 +1,86 @@ +from nexa.gguf.llama.llama_cache import LlamaDiskCache +from typing import Any, Dict + +def run_inference_with_disk_cache( + model: Any, + cache_prompt: str, + total_prompt: str, + use_cache: bool = True, + cache_dir: str = "llama.cache", + **kwargs: Dict[str, Any] +) -> Any: + """ + Runs inference using a disk cache to store and retrieve model states. + + Parameters: + - model: The model object that supports caching and inference. + - cache_prompt: The prompt used to generate a cache key. + - total_prompt: The full prompt for generating output. + - use_cache: Flag to determine if caching should be used. + - cache_dir: Directory where cache files are stored. + - kwargs: Additional parameters for model inference. + + Returns: + - The output generated by the model. 
+ """ + temperature = kwargs.get('temperature', 0.7) + max_tokens = kwargs.get('max_tokens', 2048) + top_p = kwargs.get('top_p', 0.8) + top_k = kwargs.get('top_k', 50) + repeat_penalty = kwargs.get('repeat_penalty', 1.0) + + if use_cache: + # Initialize disk cache with specified directory + cache_context = LlamaDiskCache(cache_dir=cache_dir) + model.set_cache(cache_context) + # Convert prompt to tokens for cache key + prompt_tokens = model.tokenize(cache_prompt.encode("utf-8")) + + try: + # Try to load existing cache + cached_state = cache_context[prompt_tokens] + model.load_state(cached_state) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + stream=True, + ) + except KeyError: + # If cache doesn't exist, create it + model.reset() + # Run initial inference to populate cache + _ = model( + cache_prompt, + max_tokens=1, # Minimal tokens for cache creation + temperature=temperature, + echo=False, + ) + # Save the state to cache + cache_context[prompt_tokens] = model.save_state() + + # Generate output after creating cache + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + else: + model.reset() + model.set_cache(None) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + return output \ No newline at end of file diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index 72490310..4ceb378f 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -47,9 +47,9 @@ import numpy as np import numpy.typing as npt -import nexa.gguf.llama._internals as internals -from nexa.gguf.llama._logger import set_verbose -from nexa.gguf.llama._utils import suppress_stdout_stderr +import nexa.gguf.llama._internals_transformers as internals +from nexa.gguf.llama._logger_transformers import set_verbose +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr class Llama: diff --git a/nexa/gguf/llama/llama_cache.py b/nexa/gguf/llama/llama_cache.py index 05c5a0fa..6b05e11e 100644 --- a/nexa/gguf/llama/llama_cache.py +++ b/nexa/gguf/llama/llama_cache.py @@ -32,7 +32,7 @@ def _find_longest_prefix_key( pass @abstractmethod - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "nexa.gguf.llama.LlamaState": raise NotImplementedError @abstractmethod @@ -41,7 +41,7 @@ def __contains__(self, key: Sequence[int]) -> bool: @abstractmethod def __setitem__( - self, key: Sequence[int], value: "llama_cpp.llama.LlamaState" + self, key: Sequence[int], value: "nexa.gguf.llama.LlamaState" ) -> None: raise NotImplementedError @@ -53,7 +53,7 @@ def __init__(self, capacity_bytes: int = (2 << 30)): super().__init__(capacity_bytes) self.capacity_bytes = capacity_bytes self.cache_state: OrderedDict[ - Tuple[int, ...], "llama_cpp.llama.LlamaState" + Tuple[int, ...], "nexa.gguf.llama.LlamaState" ] = OrderedDict() @property @@ -67,7 +67,7 @@ def _find_longest_prefix_key( min_len = 0 min_key = None keys = ( - (k, llama_cpp.llama.Llama.longest_token_prefix(k, key)) + (k, nexa.gguf.llama.Llama.longest_token_prefix(k, key)) for k in self.cache_state.keys() ) for k, prefix_len in keys: @@ -76,7 +76,7 @@ def _find_longest_prefix_key( min_key = k return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: 
Sequence[int]) -> "nexa.gguf.llama.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: @@ -88,7 +88,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "nexa.gguf.llama.LlamaState"): key = tuple(key) if key in self.cache_state: del self.cache_state[key] @@ -121,18 +121,18 @@ def _find_longest_prefix_key( min_len = 0 min_key: Optional[Tuple[int, ...]] = None for k in self.cache.iterkeys(): # type: ignore - prefix_len = llama_cpp.llama.Llama.longest_token_prefix(k, key) + prefix_len = nexa.gguf.llama.Llama.longest_token_prefix(k, key) if prefix_len > min_len: min_len = prefix_len min_key = k # type: ignore return min_key - def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": + def __getitem__(self, key: Sequence[int]) -> "nexa.gguf.llama.LlamaState": key = tuple(key) _key = self._find_longest_prefix_key(key) if _key is None: raise KeyError("Key not found") - value: "llama_cpp.llama.LlamaState" = self.cache.pop(_key) # type: ignore + value: "nexa.gguf.llama.LlamaState" = self.cache.pop(_key) # type: ignore # NOTE: This puts an integer as key in cache, which breaks, # Llama.longest_token_prefix(k, key) above since k is not a tuple of ints/tokens # self.cache.push(_key, side="front") # type: ignore @@ -141,7 +141,7 @@ def __getitem__(self, key: Sequence[int]) -> "llama_cpp.llama.LlamaState": def __contains__(self, key: Sequence[int]) -> bool: return self._find_longest_prefix_key(tuple(key)) is not None - def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"): + def __setitem__(self, key: Sequence[int], value: "nexa.gguf.llama.LlamaState"): print("LlamaDiskCache.__setitem__: called", file=sys.stderr) key = tuple(key) if key in self.cache: diff --git a/nexa/gguf/llama/llama_chat_format.py b/nexa/gguf/llama/llama_chat_format.py index bb0dc41f..f4e72617 100644 --- a/nexa/gguf/llama/llama_chat_format.py +++ b/nexa/gguf/llama/llama_chat_format.py @@ -32,8 +32,8 @@ import nexa.gguf.llama.llama_types as llama_types import nexa.gguf.llama.llama_grammar as llama_grammar -from nexa.gguf.llama._logger import logger -from nexa.gguf.llama._utils import suppress_stdout_stderr, Singleton +from nexa.gguf.llama._logger_transformers import logger +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr, Singleton ### Common Chat Templates and Special Tokens ### diff --git a/nexa/gguf/llama/llama_cpp.py b/nexa/gguf/llama/llama_cpp.py index 0f390932..b8b8702e 100644 --- a/nexa/gguf/llama/llama_cpp.py +++ b/nexa/gguf/llama/llama_cpp.py @@ -13,11 +13,12 @@ ) from nexa.gguf.llama._ctypes_extensions import ( - load_shared_library, byref, ctypes_function_for_shared_library, ) +from nexa.gguf.lib_utils import load_library + if TYPE_CHECKING: from nexa.gguf.llama._ctypes_extensions import ( CtypesCData, @@ -32,11 +33,8 @@ # Specify the base name of the shared library to load _lib_base_name = "llama" -# _override_base_path = os.environ.get("LLAMA_CPP_LIB_PATH") -_override_base_path = 'D:/repo/nexa-ai/llama-cpp-python/llama_cpp/lib' -_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _override_base_path is None else pathlib.Path(_override_base_path) # Load the library -_lib = load_shared_library(_lib_base_name, _base_path) +_lib 
= load_library(_lib_base_name) ctypes_function = ctypes_function_for_shared_library(_lib) diff --git a/nexa/gguf/llama/llava_cpp.py b/nexa/gguf/llama/llava_cpp.py index 8ac2934b..e6728f1c 100644 --- a/nexa/gguf/llama/llava_cpp.py +++ b/nexa/gguf/llama/llava_cpp.py @@ -23,10 +23,11 @@ import nexa.gguf.llama.llama_cpp as llama_cpp from nexa.gguf.llama._ctypes_extensions import ( - load_shared_library, ctypes_function_for_shared_library, ) +from nexa.gguf.lib_utils import load_library + if TYPE_CHECKING: from nexa.gguf.llama._ctypes_extensions import ( CtypesArray, @@ -35,11 +36,8 @@ # Specify the base name of the shared library to load _libllava_base_name = "llava" -_libllava_override_path = os.environ.get("LLAVA_CPP_LIB") -_libllava_base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" if _libllava_override_path is None else pathlib.Path() - # Load the library -_libllava = load_shared_library(_libllava_base_name, _libllava_base_path) +_libllava = load_library(_libllava_base_name) ctypes_function = ctypes_function_for_shared_library(_libllava) diff --git a/nexa/gguf/llama/omni_vlm_cpp.py b/nexa/gguf/llama/omni_vlm_cpp.py new file mode 100644 index 00000000..ab7a5699 --- /dev/null +++ b/nexa/gguf/llama/omni_vlm_cpp.py @@ -0,0 +1,79 @@ +import ctypes +import os +import sys +from pathlib import Path + + +# Load the library +def _load_shared_library(lib_base_name: str, base_path: Path = None): + # Determine the file extension based on the platform + if sys.platform.startswith("linux"): + lib_ext = ".so" + elif sys.platform == "darwin": + lib_ext = ".dylib" + elif sys.platform == "win32": + lib_ext = ".dll" + else: + raise RuntimeError("Unsupported platform") + # Construct the paths to the possible shared library names + if base_path is None: + _base_path = Path(__file__).parent.parent.resolve() + else: + _base_path = base_path + _lib_paths = [ + _base_path / f"lib{lib_base_name}{lib_ext}", + _base_path / f"{lib_base_name}{lib_ext}", + ] + # Add the library directory to the DLL search path on Windows (if needed) + if sys.platform == "win32" and sys.version_info >= (3, 8): + os.add_dll_directory(str(_base_path)) + # Try to load the shared library, handling potential errors + for _lib_path in _lib_paths: + if _lib_path.exists(): + try: + return ctypes.CDLL(str(_lib_path)) + except Exception as e: + print(f"Failed to load shared library '{_lib_path}': {e}") + raise FileNotFoundError( + f"Shared library with base name '{lib_base_name}' not found" + ) + +def _get_lib(): + # Specify the base name of the shared library to load + _lib_base_name = "omni_vlm_wrapper_shared" + base_path = ( + Path(__file__).parent.parent.parent.parent.resolve() + / "nexa" + / "gguf" + / "lib" + ) + return _load_shared_library(_lib_base_name, base_path) + +# Initialize both libraries +_lib = _get_lib() + +omni_char_p = ctypes.c_char_p + + +def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p): + return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version) + + +_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p] +_lib.omnivlm_init.restype = None + + +def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p): + return _lib.omnivlm_inference(prompt, image_path) + + +_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p] +_lib.omnivlm_inference.restype = omni_char_p + + +def omnivlm_free(): + return _lib.omnivlm_free() + + +_lib.omnivlm_free.argtypes = [] +_lib.omnivlm_free.restype = None \ No newline at end of 
file diff --git a/nexa/gguf/nexa_inference_audio_lm.py b/nexa/gguf/nexa_inference_audio_lm.py index e9036315..fa63cd38 100644 --- a/nexa/gguf/nexa_inference_audio_lm.py +++ b/nexa/gguf/nexa_inference_audio_lm.py @@ -15,7 +15,7 @@ ) from nexa.gguf.lib_utils import is_gpu_available from nexa.gguf.llama import audio_lm_cpp -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr from nexa.general import pull_model def is_qwen(model_name): diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 5f237290..65ef0c22 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -15,7 +15,7 @@ NEXA_RUN_T5XXL_MAP, ) from nexa.utils import SpinningCursorAnimation, nexa_prompt -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr from streamlit.web import cli as stcli from nexa.general import pull_model diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c01a71e5..c7802ace 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -14,7 +14,7 @@ from nexa.gguf.lib_utils import is_gpu_available from nexa.general import pull_model from nexa.utils import SpinningCursorAnimation, nexa_prompt -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr logging.basicConfig( diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index eeb1e436..fa62b589 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -25,7 +25,7 @@ NanoLlavaChatHandler, ) from nexa.utils import SpinningCursorAnimation, nexa_prompt -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr logging.basicConfig( level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s" diff --git a/nexa/gguf/nexa_inference_vlm_omni.py b/nexa/gguf/nexa_inference_vlm_omni.py index 2d30f3ae..bd5b6b29 100644 --- a/nexa/gguf/nexa_inference_vlm_omni.py +++ b/nexa/gguf/nexa_inference_vlm_omni.py @@ -11,7 +11,7 @@ ) from nexa.gguf.lib_utils import is_gpu_available from nexa.gguf.llama import omni_vlm_cpp -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr from nexa.general import pull_model class NexaOmniVlmInference: diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 510c6737..b3659776 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -11,7 +11,7 @@ ) from nexa.general import pull_model from nexa.utils import nexa_prompt, SpinningCursorAnimation -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr logging.basicConfig(level=logging.INFO) diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py index 31d4e135..73772e31 100644 --- a/nexa/gguf/server/nexa_service.py +++ b/nexa/gguf/server/nexa_service.py @@ -35,7 +35,7 @@ Llava16ChatHandler, NanoLlavaChatHandler, ) -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr from nexa.general import pull_model from nexa.gguf.llama.llama import Llama from nexa.gguf.sd.stable_diffusion import StableDiffusion diff 
--git a/nexa/gguf/streamlit/streamlit_audio_lm.py b/nexa/gguf/streamlit/streamlit_audio_lm.py index 85200388..889a8870 100644 --- a/nexa/gguf/streamlit/streamlit_audio_lm.py +++ b/nexa/gguf/streamlit/streamlit_audio_lm.py @@ -7,7 +7,7 @@ from st_audiorec import st_audiorec from nexa.general import pull_model -from nexa.gguf.llama._utils import suppress_stdout_stderr +from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference # Initialize session state From d82b11032d1515322800eb93c6d98de948fd1918 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Tue, 10 Dec 2024 18:24:19 +0800 Subject: [PATCH 13/20] upgrade llama cpp python --- CMakeLists.txt | 3 +++ dependency/llama.cpp | 2 +- pyproject.toml | 5 ++++- 3 files changed, 8 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 41738eb8..d11f63d0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,8 @@ cmake_minimum_required(VERSION 3.16) project(nexa_gguf) +if(MSVC) + add_compile_options(/utf-8) +endif() include(ExternalProject) diff --git a/dependency/llama.cpp b/dependency/llama.cpp index 64a6001a..5962b506 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit 64a6001a1a408129eb510f49840947876220c5fa +Subproject commit 5962b506bab3f46821e0fb74bcbe224cb6b10b68 diff --git a/pyproject.toml b/pyproject.toml index 24b6ee35..c225a1bf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,10 +127,13 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ + "-DCMAKE_BUILD_TYPE=Release", "-DCMAKE_BUILD_PARALLEL_LEVEL=16", "-DSTABLE_DIFFUSION_BUILD=ON", "-DLLAMA_BUILD=ON", - "-DBARK_BUILD=ON" + "-DBARK_BUILD=ON", + "-DBUILD_SHARED_LIBS=ON", + "-DLLAMA_BUILD_TESTS=OFF" ] [tool.scikit-build.metadata.version] From 2ae682874884b6c1c6fb77c0e009a8008c99e50e Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Tue, 10 Dec 2024 18:33:39 +0800 Subject: [PATCH 14/20] chore: update llama.cpp submodule --- dependency/llama.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/dependency/llama.cpp b/dependency/llama.cpp index 5962b506..5fc409ed 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit 5962b506bab3f46821e0fb74bcbe224cb6b10b68 +Subproject commit 5fc409ed770fe7165c1827137ad0d2153b36dace From 418d07786875a64b275ecbdebf84ee9716a8af3a Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Tue, 10 Dec 2024 21:33:13 +0800 Subject: [PATCH 15/20] update --- nexa/gguf/lib_utils.py | 3 +-- nexa/gguf/llama/llama_chat_format.py | 2 +- nexa/gguf/llama/llava_cpp.py | 2 +- pyproject.toml | 2 +- 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/nexa/gguf/lib_utils.py b/nexa/gguf/lib_utils.py index 8397e026..ec030b9d 100644 --- a/nexa/gguf/lib_utils.py +++ b/nexa/gguf/lib_utils.py @@ -17,8 +17,7 @@ def is_gpu_available(): # Load the library def load_library(lib_base_name: str): # Construct the paths to the possible shared library names - # _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" - _base_path = pathlib.Path('D:/repo/nexa-ai/llama-cpp-python/llama_cpp/lib') + _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib" # Searching for the library in the current directory under the name "libllama" (default name # for llamacpp) and "llama" (default name for this repo) _lib_paths: List[pathlib.Path] = [] diff --git 
a/nexa/gguf/llama/llama_chat_format.py b/nexa/gguf/llama/llama_chat_format.py index f4e72617..aeee3399 100644 --- a/nexa/gguf/llama/llama_chat_format.py +++ b/nexa/gguf/llama/llama_chat_format.py @@ -2667,7 +2667,7 @@ class Llava15ChatHandler: ) def __init__(self, clip_model_path: str, verbose: bool = True): - import llama_cpp.llava_cpp as llava_cpp + import nexa.gguf.llama.llava_cpp as llava_cpp self.clip_model_path = clip_model_path self.verbose = verbose diff --git a/nexa/gguf/llama/llava_cpp.py b/nexa/gguf/llama/llava_cpp.py index e6728f1c..945da826 100644 --- a/nexa/gguf/llama/llava_cpp.py +++ b/nexa/gguf/llama/llava_cpp.py @@ -35,7 +35,7 @@ # Specify the base name of the shared library to load -_libllava_base_name = "llava" +_libllava_base_name = "llava_shared" # Load the library _libllava = load_library(_libllava_base_name) diff --git a/pyproject.toml b/pyproject.toml index c225a1bf..57443c1c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -131,7 +131,7 @@ cmake.args = [ "-DCMAKE_BUILD_PARALLEL_LEVEL=16", "-DSTABLE_DIFFUSION_BUILD=ON", "-DLLAMA_BUILD=ON", - "-DBARK_BUILD=ON", + "-DBARK_BUILD=OFF", "-DBUILD_SHARED_LIBS=ON", "-DLLAMA_BUILD_TESTS=OFF" ] From 66cd9b1d6fd5f4cbe0b9ea4ac7f5ab22285d9dc7 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Wed, 11 Dec 2024 11:58:39 +0800 Subject: [PATCH 16/20] update --- dependency/llama.cpp | 2 +- nexa/gguf/llama/_internals_transformers.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/dependency/llama.cpp b/dependency/llama.cpp index 5fc409ed..5962b506 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit 5fc409ed770fe7165c1827137ad0d2153b36dace +Subproject commit 5962b506bab3f46821e0fb74bcbe224cb6b10b68 diff --git a/nexa/gguf/llama/_internals_transformers.py b/nexa/gguf/llama/_internals_transformers.py index 4de2f41b..5d625b8d 100644 --- a/nexa/gguf/llama/_internals_transformers.py +++ b/nexa/gguf/llama/_internals_transformers.py @@ -698,7 +698,7 @@ def accept(self, ctx_main: LlamaContext, id: int, apply_grammar: bool): from typing import List, Callable, Optional, Union import ctypes -import llama_cpp +import nexa.gguf.llama.llama_cpp class CustomSampler: From 98d1769db1da1f6d0081f50eb8c218cf969acb92 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Wed, 11 Dec 2024 13:56:41 +0800 Subject: [PATCH 17/20] revert some config --- CMakeLists.txt | 3 --- pyproject.toml | 3 --- 2 files changed, 6 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index d11f63d0..41738eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,8 +1,5 @@ cmake_minimum_required(VERSION 3.16) project(nexa_gguf) -if(MSVC) - add_compile_options(/utf-8) -endif() include(ExternalProject) diff --git a/pyproject.toml b/pyproject.toml index 57443c1c..b77f12a4 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -127,13 +127,10 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DCMAKE_BUILD_TYPE=Release", "-DCMAKE_BUILD_PARALLEL_LEVEL=16", "-DSTABLE_DIFFUSION_BUILD=ON", "-DLLAMA_BUILD=ON", - "-DBARK_BUILD=OFF", "-DBUILD_SHARED_LIBS=ON", - "-DLLAMA_BUILD_TESTS=OFF" ] [tool.scikit-build.metadata.version] From a8c690be7c594b05f8f04f3ff38b8b452e830dbb Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Wed, 11 Dec 2024 14:04:24 +0800 Subject: [PATCH 18/20] disable bark build --- pyproject.toml | 2 +- tests/test_text_generation.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/pyproject.toml b/pyproject.toml 
index b77f12a4..ef2be1f6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -130,7 +130,7 @@ cmake.args = [ "-DCMAKE_BUILD_PARALLEL_LEVEL=16", "-DSTABLE_DIFFUSION_BUILD=ON", "-DLLAMA_BUILD=ON", - "-DBUILD_SHARED_LIBS=ON", + "-DBARK_BUILD=OFF", ] [tool.scikit-build.metadata.version] diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index 91e5d62d..b5ff3cd4 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -1,7 +1,7 @@ from nexa.gguf import NexaTextInference from nexa.gguf.lib_utils import is_gpu_available -# import pdb; -# pdb.set_trace() +import pdb; +pdb.set_trace() model = NexaTextInference( From 20ca4fcffd8a682a57f145210c837d04f0aa0978 Mon Sep 17 00:00:00 2001 From: Te993 <3923106166@qq.com> Date: Wed, 11 Dec 2024 14:43:19 +0800 Subject: [PATCH 19/20] fix audio issue --- nexa/gguf/llama/audio_lm_cpp.py | 2 +- tests/test_text_generation.py | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/nexa/gguf/llama/audio_lm_cpp.py b/nexa/gguf/llama/audio_lm_cpp.py index 88db2a33..76187f8c 100644 --- a/nexa/gguf/llama/audio_lm_cpp.py +++ b/nexa/gguf/llama/audio_lm_cpp.py @@ -40,7 +40,7 @@ def _load_shared_library(lib_base_name: str, base_path: Path = None): def _get_lib(is_qwen: bool = True): # Specify the base name of the shared library to load - _lib_base_name = "nexa-qwen2-audio-lib_shared" if is_qwen else "nexa-omni-audio-lib_shared" + _lib_base_name = "nexa-qwen2-audio-lib_shared" if is_qwen else "omni_audio_shared" base_path = ( Path(__file__).parent.parent.parent.parent.resolve() / "nexa" diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index b5ff3cd4..33500c21 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -1,7 +1,5 @@ from nexa.gguf import NexaTextInference from nexa.gguf.lib_utils import is_gpu_available -import pdb; -pdb.set_trace() model = NexaTextInference( From 8c6d8bec60e8f6a6615a07f318ca6d83619fd778 Mon Sep 17 00:00:00 2001 From: Ubuntu Date: Thu, 12 Dec 2024 03:17:05 +0000 Subject: [PATCH 20/20] llama cpp to relase branch --- .gitmodules | 2 +- dependency/llama.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.gitmodules b/.gitmodules index e3daca82..2a64b27c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -5,7 +5,7 @@ [submodule "dependency/llama.cpp"] path = dependency/llama.cpp url = https://github.com/NexaAI/llama.cpp.git - branch = master + branch = release [submodule "nexa/eval/benchmark_tasks"] path = nexa/eval/benchmark_tasks url = https://github.com/NexaAI/benchmark-tasks.git diff --git a/dependency/llama.cpp b/dependency/llama.cpp index 5962b506..b2958b33 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit 5962b506bab3f46821e0fb74bcbe224cb6b10b68 +Subproject commit b2958b33ddd4c8f13c98fb1c1249ac067769df91
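As a usage note for the low-level wrappers introduced in nexa/gguf/llama/_internals_transformers.py above, the sketch below shows one way the LlamaModel/LlamaContext/LlamaBatch classes and the LlamaSampler chain fit together. It is illustrative only and not part of the patch series: the model path is a placeholder, and it assumes the bundled llama shared library is built and loadable.

```python
# Minimal sketch (not part of the patches): drive the ctypes wrappers directly.
from nexa.gguf.llama._internals_transformers import (
    LlamaBatch,
    LlamaContext,
    LlamaModel,
    LlamaSampler,
)

model = LlamaModel(
    path_model="/path/to/model.gguf",  # placeholder path
    params=LlamaModel.default_params(),
)
ctx = LlamaContext(model=model, params=LlamaContext.default_params())
batch = LlamaBatch(n_tokens=512, embd=0, n_seq_max=1)

tokens = model.tokenize(b"Hello", add_bos=True, special=False)
batch.set_batch(tokens, n_past=0, logits_all=False)  # logits kept for last token only
ctx.decode(batch)

sampler = LlamaSampler()
sampler.add_top_k(40)
sampler.add_top_p(0.95, min_keep=1)
sampler.add_temp(0.8)
sampler.add_dist(seed=1234)  # terminal sampler in the chain draws the token
next_token = sampler.sample(ctx, idx=len(tokens) - 1)
print(model.detokenize([next_token]).decode("utf-8", errors="replace"))
```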
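The spinner helper added in nexa/gguf/llama/_utils_spinner.py is a context manager; a small illustrative driver follows, where the sleep stands in for real work such as model loading or downloading.

```python
import time

from nexa.gguf.llama._utils_spinner import spinning_cursor

# Show the spinner while a long-running step executes; it is cleared on exit.
with spinning_cursor(message="Loading model"):
    time.sleep(3)  # placeholder for the actual work
print("Model loaded")
```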
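For the audio_lm_cpp bindings, the fields of omni_context_params map directly onto the C struct quoted in the comments above. The sketch below is an assumption-heavy illustration: the model, projector, and audio file paths are placeholders, and NexaAudioLMInference remains the supported entry point.

```python
import ctypes

from nexa.gguf.llama import audio_lm_cpp

params = audio_lm_cpp.context_default_params(is_qwen=True)
params.model = b"/path/to/qwen2-audio-llm.gguf"         # placeholder
params.mmproj = b"/path/to/qwen2-audio-projector.gguf"  # placeholder
params.file = b"/path/to/sample.wav"                    # placeholder
params.prompt = b"Transcribe this audio clip."
params.n_gpu_layers = -1

ctx = audio_lm_cpp.init_context(ctypes.byref(params), is_qwen=True)
try:
    result = audio_lm_cpp.process_full(ctx, ctypes.byref(params), is_qwen=True)
    print(result.decode("utf-8"))
finally:
    audio_lm_cpp.free(ctx, is_qwen=True)
```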
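The disk-cache helper in nexa/gguf/llama/kv_cache.py is easiest to read alongside a caller. In this sketch the prompts, cache directory, and the Llama constructor arguments are illustrative assumptions; the helper only requires a model object that exposes the Llama cache/state interface (set_cache, save_state, load_state, tokenize, reset).

```python
from nexa.gguf.llama.kv_cache import run_inference_with_disk_cache
from nexa.gguf.llama.llama import Llama

llm = Llama(model_path="/path/to/model.gguf", n_ctx=4096, verbose=False)  # placeholder

system_prompt = "You are a concise assistant.\n"
user_prompt = "Summarize what a KV cache does in one sentence."

# The shared system prompt is decoded once and its state stored on disk;
# later calls with the same cache_prompt reload that state instead of re-decoding.
stream = run_inference_with_disk_cache(
    model=llm,
    cache_prompt=system_prompt,
    total_prompt=system_prompt + user_prompt,
    use_cache=True,
    cache_dir="llama.cache",
    max_tokens=128,
    temperature=0.7,
)
for chunk in stream:
    print(chunk["choices"][0]["text"], end="", flush=True)
```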
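Similarly, a minimal sketch for the omni_vlm_cpp bindings is shown below. The GGUF paths, the image path, and the "vlm-81-instruct" version tag are assumptions; NexaOmniVlmInference is the supported wrapper around these calls.

```python
from nexa.gguf.llama import omni_vlm_cpp

# The bindings take C strings, so arguments are passed as bytes.
omni_vlm_cpp.omnivlm_init(
    b"/path/to/omni-vlm-llm.gguf",        # placeholder
    b"/path/to/omni-vlm-projector.gguf",  # placeholder
    b"vlm-81-instruct",                   # assumed version tag
)
try:
    reply = omni_vlm_cpp.omnivlm_inference(b"Describe this image.", b"/path/to/test.png")
    print(reply.decode("utf-8"))
finally:
    omni_vlm_cpp.omnivlm_free()
```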