NexaAI · zhiyuan8 · Aug 25, 2024 · Aug 25, 2024 · Aug 25, 2024
diff --git a/nexa/cli/entry.py b/nexa/cli/entry.py
@@ -102,7 +102,6 @@ def main():
     image_group = run_parser.add_argument_group('Image generation options')
     image_group.add_argument("-i2i", "--img2img", action="store_true", help="Whether to run image-to-image generation")
     image_group.add_argument("-ns", "--num_inference_steps", type=int, help="Number of inference steps")
-    image_group.add_argument("-np", "--num_images_per_prompt", type=int, default=1, help="Number of images to generate per prompt")
     image_group.add_argument("-H", "--height", type=int, help="Height of the output image")
     image_group.add_argument("-W", "--width", type=int, help="Width of the output image")
     image_group.add_argument("-g", "--guidance_scale", type=float, help="Guidance scale for diffusion")

diff --git a/nexa/constants.py b/nexa/constants.py
@@ -181,7 +181,6 @@
 
 DEFAULT_IMG_GEN_PARAMS = {
     "num_inference_steps": 20,
-    "num_images_per_prompt": 1,
     "height": 512,
     "width": 512,
     "guidance_scale": 7.5,
@@ -191,7 +190,6 @@
 
 DEFAULT_IMG_GEN_PARAMS_LCM = {
     "num_inference_steps": 4,
-    "num_images_per_prompt": 1,
     "height": 512,
     "width": 512,
     "guidance_scale": 1.0,
@@ -201,7 +199,6 @@
 
 DEFAULT_IMG_GEN_PARAMS_TURBO = {
     "num_inference_steps": 5,
-    "num_images_per_prompt": 1,
     "height": 512,
     "width": 512,
     "guidance_scale": 5.0,

diff --git a/nexa/gguf/lib_utils.py b/nexa/gguf/lib_utils.py
@@ -18,7 +18,6 @@ def is_gpu_available():
 def load_library(lib_base_name: str):
     # Construct the paths to the possible shared library names
     _base_path = pathlib.Path(os.path.abspath(os.path.dirname(__file__))) / "lib"
-    logging.debug(f"Base path for libraries: {_base_path}")
     # Searching for the library in the current directory under the name "libllama" (default name
     # for llamacpp) and "llama" (default name for this repo)
     _lib_paths: List[pathlib.Path] = []
@@ -29,18 +28,16 @@ def load_library(lib_base_name: str):
         ]
     elif sys.platform == "darwin":
         _lib_paths += [
-            _base_path / f"lib{lib_base_name}.dylib",
             _base_path / f"lib{lib_base_name}.so",
+            _base_path / f"lib{lib_base_name}.dylib",
         ]
     elif sys.platform == "win32":
         _lib_paths += [
             _base_path / f"{lib_base_name}.dll",
             _base_path / f"lib{lib_base_name}.dll",
         ]
-        _add_windows_dll_directories(_base_path)
     else:
         raise RuntimeError("Unsupported platform")
-    logging.debug(f"Possible shared library paths: {_lib_paths}")
 
     if "LLAMA_CPP_LIB" in os.environ:
         lib_base_name = os.environ["LLAMA_CPP_LIB"]
@@ -50,19 +47,31 @@ def load_library(lib_base_name: str):
 
     cdll_args = dict()  # type: ignore
 
+    # Add the library directory to the DLL search path on Windows (if needed)
+    if sys.platform == "win32":
+        os.add_dll_directory(str(_base_path))
+        os.environ["PATH"] = str(_base_path) + os.pathsep + os.environ["PATH"]
+
+    if sys.platform == "win32" and sys.version_info >= (3, 8):
+        os.add_dll_directory(str(_base_path))
+        if "CUDA_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["CUDA_PATH"], "lib"))
+        if "HIP_PATH" in os.environ:
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "bin"))
+            os.add_dll_directory(os.path.join(os.environ["HIP_PATH"], "lib"))
+        cdll_args["winmode"] = ctypes.RTLD_GLOBAL
+
     # Try to load the shared library, handling potential errors
     for _lib_path in _lib_paths:
-        logging.debug(f"Trying to load shared library from: {_lib_path}")
         if _lib_path.exists():
             try:
-                loaded_lib = ctypes.CDLL(str(_lib_path), **cdll_args)  # type: ignore
-                logging.debug(f"Successfully loaded shared library: {_lib_path}")
-                return loaded_lib
+                return ctypes.CDLL(str(_lib_path), **cdll_args)  # type: ignore
             except Exception as e:
                 raise RuntimeError(f"Failed to load shared library '{_lib_path}': {e}")
 
     raise FileNotFoundError(
-        f"Shared library with base name '{lib_base_name}' not found in paths: {_lib_paths}"
+        f"Shared library with base name '{lib_base_name}' not found"
     )
 
 

diff --git a/nexa/gguf/llama/_internals_transformers.py b/nexa/gguf/llama/_internals_transformers.py
@@ -179,11 +179,11 @@ def token_eot(self) -> int:
         assert self.model is not None
         return llama_cpp.llama_token_eot(self.model)
 
-    def add_bos_token(self) -> int:
+    def add_bos_token(self) -> bool:
         assert self.model is not None
         return llama_cpp.llama_add_bos_token(self.model)
 
-    def add_eos_token(self) -> int:
+    def add_eos_token(self) -> bool:
         assert self.model is not None
         return llama_cpp.llama_add_eos_token(self.model)
 
@@ -343,14 +343,6 @@ def get_state_size(self) -> int:
         assert self.ctx is not None
         return llama_cpp.llama_get_state_size(self.ctx)
 
-    # TODO: copy_state_data
-
-    # TODO: set_state_data
-
-    # TODO: llama_load_session_file
-
-    # TODO: llama_save_session_file
-
     def decode(self, batch: "_LlamaBatch"):
         assert self.ctx is not None
         assert batch.batch is not None
@@ -511,7 +503,7 @@ def sample_token(self, candidates: "_LlamaTokenDataArray") -> int:
     def grammar_accept_token(self, grammar: LlamaGrammar, token: int):
         assert self.ctx is not None
         assert grammar.grammar is not None
-        llama_cpp.llama_grammar_accept_token(self.ctx, grammar.grammar, token)
+        llama_cpp.llama_grammar_accept_token(grammar.grammar, self.ctx, token)
 
     def reset_timings(self):
         assert self.ctx is not None
@@ -691,8 +683,8 @@ def _detokenize_bpe(model: _LlamaModel, tokens: List[int]) -> str:
 def _should_add_bos(model: _LlamaModel) -> bool:
     assert model.model is not None
     add_bos = llama_cpp.llama_add_bos_token(model.model)
-    if add_bos != -1:
-        return add_bos != 0
+    if add_bos:
+        return add_bos
     else:
         return llama_cpp.llama_vocab_type(model.model) == llama_cpp.LLAMA_VOCAB_TYPE_SPM
 

diff --git a/nexa/gguf/llama/_utils_transformers.py b/nexa/gguf/llama/_utils_transformers.py
@@ -17,7 +17,7 @@ class suppress_stdout_stderr(object):
     sys = sys
     os = os
 
-    def __init__(self, disable: bool = False):
+    def __init__(self, disable: bool = True):
         self.disable = disable
 
     # Oddly enough this works better than the contextlib version

diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py
@@ -1,27 +1,33 @@
 from __future__ import annotations
 
-import contextlib
-import ctypes
-import multiprocessing
 import os
 import sys
+import uuid
 import time
+import json
+import ctypes
 import typing
-import uuid
+import fnmatch
 import warnings
-from collections import deque
+import contextlib
+import multiprocessing
+
 from typing import (
     Any,
-    Callable,
-    Deque,
-    Dict,
-    Generator,
-    Iterator,
     List,
+    Literal,
     Optional,
-    Sequence,
     Union,
+    Generator,
+    Sequence,
+    Iterator,
+    Deque,
+    Callable,
+    Dict,
 )
+from collections import deque
+from pathlib import Path
+
 
 import numpy as np
 import numpy.typing as npt
@@ -37,10 +43,9 @@
 from nexa.gguf.llama._internals_transformers import _normalize_embedding  # type: ignore
 from nexa.gguf.llama._logger_transformers import set_verbose
 from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
-
-# from nexa.gguf.llama.llama_cache import LlamaCache  # type: ignore
-# from nexa.gguf.llama.llama_cache import LlamaDiskCache  # type: ignore
-# from nexa.gguf.llama.llama_cache import LlamaRAMCache  # type: ignore
+from nexa.gguf.llama.llama_cache import LlamaCache  # type: ignore
+from nexa.gguf.llama.llama_cache import LlamaDiskCache  # type: ignore
+from nexa.gguf.llama.llama_cache import LlamaRAMCache  # type: ignore
 from nexa.gguf.llama.llama_cache import BaseLlamaCache
 from nexa.gguf.llama.llama_grammar import LlamaGrammar
 from nexa.gguf.llama.llama_speculative import LlamaDraftModel
@@ -187,6 +192,7 @@ def __init__(
             A Llama instance.
         """
         self.verbose = verbose
+        self._stack = contextlib.ExitStack()
 
         set_verbose(verbose)
 
@@ -251,28 +257,28 @@ def __init__(
             for i, (k, v) in enumerate(kv_overrides.items()):
                 self._kv_overrides_array[i].key = k.encode("utf-8")
                 if isinstance(v, bool):
-                    self._kv_overrides_array[
-                        i
-                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
+                    self._kv_overrides_array[i].tag = (
+                        llama_cpp.LLAMA_KV_OVERRIDE_TYPE_BOOL
+                    )
                     self._kv_overrides_array[i].value.val_bool = v
                 elif isinstance(v, int):
-                    self._kv_overrides_array[
-                        i
-                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
+                    self._kv_overrides_array[i].tag = (
+                        llama_cpp.LLAMA_KV_OVERRIDE_TYPE_INT
+                    )
                     self._kv_overrides_array[i].value.val_i64 = v
                 elif isinstance(v, float):
-                    self._kv_overrides_array[
-                        i
-                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
+                    self._kv_overrides_array[i].tag = (
+                        llama_cpp.LLAMA_KV_OVERRIDE_TYPE_FLOAT
+                    )
                     self._kv_overrides_array[i].value.val_f64 = v
                 elif isinstance(v, str):  # type: ignore
                     v_bytes = v.encode("utf-8")
                     if len(v_bytes) > 128:  # TODO: Make this a constant
                         raise ValueError(f"Value for {k} is too long: {v}")
                     v_bytes = v_bytes.ljust(128, b"\0")
-                    self._kv_overrides_array[
-                        i
-                    ].tag = llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
+                    self._kv_overrides_array[i].tag = (
+                        llama_cpp.LLAMA_KV_OVERRIDE_TYPE_STR
+                    )
                     # copy min(v_bytes, 128) to str_value
                     address = typing.cast(
                         int,
@@ -288,9 +294,9 @@ def __init__(
                 else:
                     raise ValueError(f"Unknown value type for {k}: {v}")
 
-            self._kv_overrides_array[
-                -1
-            ].key = b"\0"  # ensure sentinel element is zeroed
+            self._kv_overrides_array[-1].key = (
+                b"\0"  # ensure sentinel element is zeroed
+            )
             self.model_params.kv_overrides = self._kv_overrides_array
 
         self.n_batch = min(n_ctx, n_batch)  # ???
@@ -354,8 +360,6 @@ def __init__(
         if not os.path.exists(model_path):
             raise ValueError(f"Model path does not exist: {model_path}")
 
-        self._stack = contextlib.ExitStack()
-
         self._model = self._stack.enter_context(
             contextlib.closing(
                 _LlamaModel(
@@ -409,6 +413,15 @@ def __init__(
                 raise RuntimeError(
                     f"Failed to initialize LoRA adapter from lora path: {self.lora_path}"
                 )
+
+            def free_lora_adapter():
+                if self._lora_adapter is None:
+                    return
+                llama_cpp.llama_lora_adapter_free(self._lora_adapter)
+                self._lora_adapter = None
+
+            self._stack.callback(free_lora_adapter)
+
             assert self._ctx.ctx is not None
             if llama_cpp.llama_lora_adapter_set(
                 self._ctx.ctx, self._lora_adapter, self.lora_scale
@@ -422,9 +435,9 @@ def __init__(
 
         self.chat_format = chat_format
         self.chat_handler = chat_handler
-        self._chat_handlers: Dict[
-            str, llama_chat_format.LlamaChatCompletionHandler
-        ] = {}
+        self._chat_handlers: Dict[str, llama_chat_format.LlamaChatCompletionHandler] = (
+            {}
+        )
 
         self.draft_model = draft_model
 
@@ -766,11 +779,12 @@ def generate(
                 else:
                     break
             if longest_prefix > 0:
-                if self.verbose:
-                    print("Llama.generate: prefix-match hit", file=sys.stderr)
                 reset = False
                 tokens = tokens[longest_prefix:]
                 self.n_tokens = longest_prefix
+                if self.verbose:
+                    print(f"Llama.generate: {longest_prefix} prefix-match hit, "
+                          f"remaining {len(tokens)} prompt tokens to eval", file=sys.stderr)                    
 
         # Reset the model state
         if reset:
@@ -1046,13 +1060,13 @@ def _create_completion(
 
         if (
             (isinstance(prompt, list) and suffix is None)
-            or self._model.add_bos_token() == 0
+            or not self._model.add_bos_token()
             or bos_tokens[:1] == [-1]
         ):
             bos_tokens = []
 
         if (isinstance(prompt, list) and suffix is None) or (
-            self._model.add_eos_token() != 1 and sep_token_id == -1
+            not self._model.add_eos_token() and sep_token_id == -1
         ):
             eos_tokens = []
 
@@ -1511,7 +1525,8 @@ def logit_bias_processor(
                 if self.verbose:
                     print("Llama._create_completion: cache save", file=sys.stderr)
                 self.cache[prompt_tokens + completion_tokens] = self.save_state()
-                print("Llama._create_completion: cache saved", file=sys.stderr)
+                if self.verbose:
+                    print("Llama._create_completion: cache saved", file=sys.stderr)
             return
 
         if self.cache:
@@ -1930,10 +1945,7 @@ def create_chat_completion_openai_v1(
             stream = kwargs.get("stream", False)  # type: ignore
             assert isinstance(stream, bool)
             if stream:
-                return (
-                    ChatCompletionChunk(**chunk)
-                    for chunk in self.create_chat_completion(*args, **kwargs)
-                )  # type: ignore
+                return (ChatCompletionChunk(**chunk) for chunk in self.create_chat_completion(*args, **kwargs))  # type: ignore
             else:
                 return ChatCompletion(**self.create_chat_completion(*args, **kwargs))  # type: ignore
         except ImportError:
@@ -2078,8 +2090,6 @@ def close(self) -> None:
         self._stack.close()
 
     def __del__(self) -> None:
-        if self._lora_adapter is not None:
-            llama_cpp.llama_lora_adapter_free(self._lora_adapter)
         self.close()
 
     @staticmethod
@@ -2164,4 +2174,4 @@ def __call__(
             self.prompt_tokens = len(input_ids)
         if len(input_ids) - self.prompt_tokens < self.min_tokens:
             scores[self.token_eos] = -np.inf
-        return scores
+        return scores
diff --git a/nexa/gguf/llama/llama_cache.py b/nexa/gguf/llama/llama_cache.py
@@ -152,4 +152,4 @@ def __setitem__(self, key: Sequence[int], value: "llama_cpp.llama.LlamaState"):
         while self.cache_size > self.capacity_bytes and len(self.cache) > 0:
             key_to_remove = next(iter(self.cache))
             del self.cache[key_to_remove]
-        print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)
+        print("LlamaDiskCache.__setitem__: trim", file=sys.stderr)
diff --git a/nexa/gguf/llama/llama_chat_format.py b/nexa/gguf/llama/llama_chat_format.py
@@ -3776,4 +3776,4 @@ def chatml_function_calling(
             },
         }
 
-    raise ValueError("Automatic streaming tool choice is not supported")
+    raise ValueError("Automatic streaming tool choice is not supported")
@@ -3776,4 +3776,4 @@ def chatml_function_calling(
                 },
             }
-        raise ValueError("Automatic streaming tool choice is not supported")
+        raise ValueError("Automatic streaming tool choice is not supported")