From 0f68529c1de9f9ef3a436407eec86df4ef16a3b5 Mon Sep 17 00:00:00 2001
From: zack Zhiyuan Li
Date: Thu, 21 Nov 2024 00:53:50 +0000
Subject: [PATCH 1/3] update readme

---
 README.md | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/README.md b/README.md
index 924b76ed..eb427b93 100644
--- a/README.md
+++ b/README.md
@@ -212,9 +212,11 @@ Below is our differentiation from other similar tools:
 | **Text Generation** | ✅ | ✅ | ✅ | ✅ |
 | **Image Generation** | ✅ | ❌ | ❌ | ❌ |
 | **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ |
+| **Audio-Language Models** | ✅ | ❌ | ❌ | ❌ |
 | **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ |
 | **Server Capability** | ✅ | ✅ | ✅ | ✅ |
 | **User Interface** | ✅ | ❌ | ❌ | ✅ |
+| **Executable Installation** | ✅ | ✅ | ❌ | ✅ |
 
 ## Supported Models & Model Hub
 

From e1fab21986adb65c0d0df821820d04e933f11a47 Mon Sep 17 00:00:00 2001
From: zack Zhiyuan Li
Date: Thu, 21 Nov 2024 01:04:48 +0000
Subject: [PATCH 2/3] support kv cache

---
 nexa/gguf/llama/kv_cache.py | 66 +++++++++++++++++++++++++++++++++++++
 1 file changed, 66 insertions(+)
 create mode 100644 nexa/gguf/llama/kv_cache.py

diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py
new file mode 100644
index 00000000..0e795ec1
--- /dev/null
+++ b/nexa/gguf/llama/kv_cache.py
@@ -0,0 +1,66 @@
+from nexa.gguf.llama.llama_cache import LlamaDiskCache
+
+def run_inference_with_disk_cache(
+    model, cache_prompt, total_prompt, use_cache=True, cache_dir="llama.cache", **kwargs
+):
+    temperature = kwargs.get('temperature', 0.7)
+    max_tokens = kwargs.get('max_tokens', 2048)
+    top_p = kwargs.get('top_p', 1.0)
+    top_k = kwargs.get('top_k', 50)
+    repeat_penalty = kwargs.get('repeat_penalty', 1.0)
+
+    if use_cache:
+        # Initialize disk cache with specified directory
+        cache_context = LlamaDiskCache(cache_dir=cache_dir)
+        model.set_cache(cache_context)
+        # Convert prompt to tokens for cache key
+        prompt_tokens = model.tokenize(cache_prompt.encode("utf-8"))
+
+        try:
+            # Try to load existing cache
+            cached_state = cache_context[prompt_tokens]
+            model.load_state(cached_state)
+
+            output = model(
+                total_prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                stream=True,
+            )
+        except KeyError:
+            # If cache doesn't exist, create it
+            model.reset()
+            # Run initial inference to populate cache
+            _ = model(
+                cache_prompt,
+                max_tokens=1, # Minimal tokens for cache creation
+                temperature=temperature,
+                echo=False,
+            )
+            # Save the state to cache
+            cache_context[prompt_tokens] = model.save_state()
+
+            # Generate output after creating cache
+            output = model(
+                total_prompt,
+                max_tokens=max_tokens,
+                temperature=temperature,
+                top_p=top_p,
+                top_k=top_k,
+                repeat_penalty=repeat_penalty,
+                stream=True,
+            )
+    else:
+        model.reset()
+        model.set_cache(None)
+
+        output = model(
+            total_prompt,
+            max_tokens=max_tokens,
+            temperature=temperature,
+            top_p=top_p,
+            top_k=top_k,
+            repeat_penalty=repeat_penalty,
+            stream=True,
+        )
+    return output
\ No newline at end of file

From 47f22321aa2ba3cc2afdc42d05407010528ab575 Mon Sep 17 00:00:00 2001
From: zack Zhiyuan Li
Date: Thu, 21 Nov 2024 01:06:48 +0000
Subject: [PATCH 3/3] improve kv cache docs

---
 nexa/gguf/llama/kv_cache.py | 26 +++++++++++++++++++++++---
 1 file changed, 23 insertions(+), 3 deletions(-)

diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py
index 0e795ec1..aa01630d 100644
--- a/nexa/gguf/llama/kv_cache.py
+++ b/nexa/gguf/llama/kv_cache.py
@@ -1,11 +1,31 @@
 from nexa.gguf.llama.llama_cache import LlamaDiskCache
+from typing import Any, Dict
 
 def run_inference_with_disk_cache(
-    model, cache_prompt, total_prompt, use_cache=True, cache_dir="llama.cache", **kwargs
-):
+    model: Any,
+    cache_prompt: str,
+    total_prompt: str,
+    use_cache: bool = True,
+    cache_dir: str = "llama.cache",
+    **kwargs: Dict[str, Any]
+) -> Any:
+    """
+    Runs inference using a disk cache to store and retrieve model states.
+
+    Parameters:
+    - model: The model object that supports caching and inference.
+    - cache_prompt: The prompt used to generate a cache key.
+    - total_prompt: The full prompt for generating output.
+    - use_cache: Flag to determine if caching should be used.
+    - cache_dir: Directory where cache files are stored.
+    - kwargs: Additional parameters for model inference.
+
+    Returns:
+    - The output generated by the model.
+    """
     temperature = kwargs.get('temperature', 0.7)
     max_tokens = kwargs.get('max_tokens', 2048)
-    top_p = kwargs.get('top_p', 1.0)
+    top_p = kwargs.get('top_p', 0.8)
     top_k = kwargs.get('top_k', 50)
     repeat_penalty = kwargs.get('repeat_penalty', 1.0)
 
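
For context on how the new helper is meant to be called, here is a minimal usage sketch. It assumes a llama-cpp-python-style `Llama` class importable from `nexa.gguf.llama.llama`; the import path, model file name, constructor arguments, and streaming chunk format are illustrative assumptions, not details confirmed by these patches.

```python
# Minimal usage sketch for run_inference_with_disk_cache (class, paths, and chunk format are assumptions).
from nexa.gguf.llama.llama import Llama  # assumed llama-cpp-python-style class with set_cache/save_state/load_state
from nexa.gguf.llama.kv_cache import run_inference_with_disk_cache

# Long shared prefix worth caching (system prompt plus reference material).
cache_prompt = "You are a helpful assistant.\nReference material: ...\n"
# Full prompt = cached prefix + the new user turn.
total_prompt = cache_prompt + "User: Summarize the reference material.\nAssistant:"

model = Llama(model_path="model.gguf", n_ctx=4096, verbose=False)  # hypothetical model file

# The first call evaluates cache_prompt once and stores its KV state under llama.cache/;
# later calls with the same cache_prompt reload that state instead of re-evaluating the prefix.
stream = run_inference_with_disk_cache(
    model,
    cache_prompt=cache_prompt,
    total_prompt=total_prompt,
    use_cache=True,
    cache_dir="llama.cache",
    temperature=0.7,
    max_tokens=256,
)

# With stream=True the helper returns an iterator of chunks (llama-cpp-python-style dicts assumed).
for chunk in stream:
    print(chunk["choices"][0]["text"], end="", flush=True)
```

The split between `cache_prompt` and `total_prompt` is the central design choice: only the shared prefix's state is persisted, so any later request that reuses that prefix skips re-evaluating it and only pays for the newly appended tokens.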