From 0f68529c1de9f9ef3a436407eec86df4ef16a3b5 Mon Sep 17 00:00:00 2001
From: zack Zhiyuan Li
Date: Thu, 21 Nov 2024 00:53:50 +0000
Subject: [PATCH 1/3] update readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 924b76ed..eb427b93 100644
--- a/README.md
+++ b/README.md
@@ -212,9 +212,11 @@ Below is our differentiation from other similar tools:
 | **Text Generation** | ✅ | ✅ | ✅ | ✅ |
 | **Image Generation** | ✅ | ❌ | ❌ | ❌ |
 | **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ |
+| **Audio-Language Models** | ✅ | ❌ | ❌ | ❌ |
 | **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ |
 | **Server Capability** | ✅ | ✅ | ✅ | ✅ |
 | **User Interface** | ✅ | ❌ | ❌ | ✅ |
+| **Executable Installation** | ✅ | ✅ | ❌ | ✅ |
 
 ## Supported Models & Model Hub
 

From e1fab21986adb65c0d0df821820d04e933f11a47 Mon Sep 17 00:00:00 2001
From: zack Zhiyuan Li
Date: Thu, 21 Nov 2024 01:04:48 +0000
Subject: [PATCH 2/3] support kv cache

---
 nexa/gguf/llama/kv_cache.py | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 nexa/gguf/llama/kv_cache.py

diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py
new file mode 100644
index 00000000..0e795ec1
--- /dev/null
+++ b/nexa/gguf/llama/kv_cache.py
@@ -0,0 +1,66 @@
+from nexa.gguf.llama.llama_cache import LlamaDiskCache
+
+def run_inference_with_disk_cache(
+    model, cache_prompt, total_prompt, use_cache=True, cache_dir="llama.cache", **kwargs
+):
+    temperature = kwargs.get('temperature', 0.7)
+    max_tokens = kwargs.get('max_tokens', 2048)
+    top_p = kwargs.get('top_p', 1.0)
+    top_k = kwargs.get('top_k', 50)
+    repeat_penalty = kwargs.get('repeat_penalty', 1.0)
+
+    if use_cache:
+        # Initialize disk cache with specified directory
+        cache_context = LlamaDiskCache(cache_dir=cache_dir)
+        model.set_cache(cache_context)
+        # Convert prompt to tokens for cache key
+        prompt_tokens = model.tokenize(cache_prompt.encode("utf-8"))
+
+        try:
+            # Try to load existing cache
+            cached_state = cache_context[prompt_tokens]
+            model.load_state(cached_state)
+
+            output = model(
+                total_prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                stream=True,
+            )
+        except KeyError:
+            # If cache doesn't exist, create it
+            model.reset()
+            # Run initial inference to populate cache
+            _ = model(
+                cache_prompt,
+                max_tokens=1, # Minimal tokens for cache creation
+                temperature=temperature,
+                echo=False,
+            )
+            # Save the state to cache
+            cache_context[prompt_tokens] = model.save_state()
+
+            # Generate output after creating cache
+            output = model(
+                total_prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repeat_penalty=repeat_penalty,
+                stream=True,
+            )
+    else:
+        model.reset()
+        model.set_cache(None)
+
+        output = model(
+            total_prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repeat_penalty=repeat_penalty,
+            stream=True,
+        )
+    return output
\ No newline at end of file

From 47f22321aa2ba3cc2afdc42d05407010528ab575 Mon Sep 17 00:00:00 2001
From: zack Zhiyuan Li
Date: Thu, 21 Nov 2024 01:06:48 +0000
Subject: [PATCH 3/3] improve kv cache docs

---
 nexa/gguf/llama/kv_cache.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py
index 0e795ec1..aa01630d 100644
--- a/nexa/gguf/llama/kv_cache.py
+++ b/nexa/gguf/llama/kv_cache.py
@@ -1,11 +1,31 @@
 from nexa.gguf.llama.llama_cache import LlamaDiskCache
+from typing import Any, Dict
 
 def run_inference_with_disk_cache(
-    model, cache_prompt, total_prompt, use_cache=True, cache_dir="llama.cache", **kwargs
-):
+    model: Any,
+    cache_prompt: str,
+    total_prompt: str,
+    use_cache: bool = True,
+    cache_dir: str = "llama.cache",
+    **kwargs: Dict[str, Any]
+) -> Any:
+    """
+    Runs inference using a disk cache to store and retrieve model states.
+
+    Parameters:
+    - model: The model object that supports caching and inference.
+    - cache_prompt: The prompt used to generate a cache key.
+    - total_prompt: The full prompt for generating output.
+    - use_cache: Flag to determine if caching should be used.
+    - cache_dir: Directory where cache files are stored.
+    - kwargs: Additional parameters for model inference.
+
+    Returns:
+    - The output generated by the model.
+    """
     temperature = kwargs.get('temperature', 0.7)
     max_tokens = kwargs.get('max_tokens', 2048)
-    top_p = kwargs.get('top_p', 1.0)
+    top_p = kwargs.get('top_p', 0.8)
     top_k = kwargs.get('top_k', 50)
     repeat_penalty = kwargs.get('repeat_penalty', 1.0)
 
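
For context on how the new helper is meant to be called, here is a minimal usage sketch. It assumes a llama-cpp-python-style `Llama` class importable from `nexa.gguf.llama.llama`; the import path, model file name, constructor arguments, and streaming chunk format are illustrative assumptions, not details confirmed by these patches.

```python
# Minimal usage sketch for run_inference_with_disk_cache (class, paths, and chunk format are assumptions).
from nexa.gguf.llama.llama import Llama  # assumed llama-cpp-python-style class with set_cache/save_state/load_state
from nexa.gguf.llama.kv_cache import run_inference_with_disk_cache

# Long shared prefix worth caching (system prompt plus reference material).
cache_prompt = "You are a helpful assistant.\nReference material: ...\n"
# Full prompt = cached prefix + the new user turn.
total_prompt = cache_prompt + "User: Summarize the reference material.\nAssistant:"

model = Llama(model_path="model.gguf", n_ctx=4096, verbose=False)  # hypothetical model file

# The first call evaluates cache_prompt once and stores its KV state under llama.cache/;
# later calls with the same cache_prompt reload that state instead of re-evaluating the prefix.
stream = run_inference_with_disk_cache(
    model,
    cache_prompt=cache_prompt,
    total_prompt=total_prompt,
    use_cache=True,
    cache_dir="llama.cache",
    temperature=0.7,
    max_tokens=256,
)

# With stream=True the helper returns an iterator of chunks (llama-cpp-python-style dicts assumed).
for chunk in stream:
    print(chunk["choices"][0]["text"], end="", flush=True)
```

The split between `cache_prompt` and `total_prompt` is the central design choice: only the shared prefix's state is persisted, so any later request that reuses that prefix skips re-evaluating it and only pays for the newly appended tokens.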