diff --git a/.github/workflows/build-wheels-vulkan-win.yaml b/.github/workflows/build-wheels-vulkan-win.yaml index 4683f8f8..ca68208d 100644 --- a/.github/workflows/build-wheels-vulkan-win.yaml +++ b/.github/workflows/build-wheels-vulkan-win.yaml @@ -88,6 +88,6 @@ jobs: uses: softprops/action-gh-release@v2 with: files: dist/* - tag_name: ${{ github.ref_name }}-vulkan${{ env.VULKAN_VERSION }} + tag_name: ${{ github.ref_name }}-vulkan env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index 11741cab..b993847a 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -3,7 +3,7 @@ name: Wheels Index on: # Trigger on new release workflow_run: - workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)", "Build Wheels (ROCm)", "Build Wheels (Vulkan)"] + workflows: ["Build Wheels (CPU)", "Build Wheels (CUDA)", "Build Wheels (Metal)", "Build Wheels (ROCm)", "Build Wheels (Vulkan)"] types: - completed diff --git a/CMakeLists.txt b/CMakeLists.txt index e1f28a3d..41738eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,7 +188,8 @@ if(LLAMA_BUILD) endif() # bark_cpp project -option(BARK_BUILD "Build bark.cpp" ON) +# Temporarily disabled since version v0.0.9.3 +option(BARK_BUILD "Build bark.cpp" OFF) if(BARK_BUILD) # Filter out HIPBLAS and Vulkan options for bark.cpp since it doesn't support them set(BARK_CMAKE_OPTIONS ${USER_DEFINED_OPTIONS}) diff --git a/README.md b/README.md index 924b76ed..0e6b50db 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,9 @@ ## Latest News 🔥 -- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B): `nexa run omniaudio` +- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B parameters): `nexa run omniaudio` - Support audio language model: `nexa run qwen2audio`, **we are the first open-source toolkit to support audio language model with GGML tensor library.** +- Support iOS Swift binding for local inference on **iOS mobile** devices. - Support embedding model: `nexa embed ` - Support pull and run supported Computer Vision models in GGUF format from HuggingFace or ModelScope: `nexa run -hf -mt COMPUTER_VISION` or `nexa run -ms -mt COMPUTER_VISION` - Support pull and run NLP models in GGUF format from HuggingFace or ModelScope: `nexa run -hf -mt NLP` or `nexa run -ms -mt NLP` @@ -32,13 +33,13 @@ Welcome to submit your requests through [issues](https://github.com/NexaAI/nexa- ## Install Option 1: Executable Installer

- + macOS Installer

- + Windows Installer

@@ -64,12 +65,12 @@ nexa-exe ## Install Option 2: Python Package -We have released pre-built wheels for various Python versions, platforms, and backends for convenient installation on our [index page](https://nexaai.github.io/nexa-sdk/whl/). +We have released pre-built wheels for various Python versions, platforms, and backends for convenient installation on our [index page](https://github.nexa.ai/whl/).
CPU ```bash -pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple --no-cache-dir +pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cpu --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -79,7 +80,7 @@ pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk For the GPU version supporting **Metal (macOS)**: ```bash -CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -92,7 +93,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge bash Miniforge3-MacOSX-arm64.sh conda create -n nexasdk python=3.10 conda activate nexasdk -CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -105,25 +106,25 @@ To install with CUDA support, make sure you have [CUDA Toolkit 12.0 or later](ht For **Linux**: ```bash -CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows PowerShell**: ```bash -$env:CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON"; pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +$env:CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON"; pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Command Prompt**: ```bash -set CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" & pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +set CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" & pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Git Bash**: ```bash -CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -150,7 +151,7 @@ To install with ROCm support, make sure you have [ROCm 6.2.1 or later](https://r For **Linux**: ```bash -CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/rocm621 --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/rocm621 --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -162,19 +163,19 @@ To install with Vulkan support, make sure you have [Vulkan SDK 1.3.261.1 or late For **Windows PowerShell**: ```bash -$env:CMAKE_ARGS="-DGGML_VULKAN=on"; pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir +$env:CMAKE_ARGS="-DGGML_VULKAN=on"; pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Command Prompt**: ```bash -set CMAKE_ARGS="-DGGML_VULKAN=on" & pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir +set CMAKE_ARGS="-DGGML_VULKAN=on" & pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Git Bash**: ```bash -CMAKE_ARGS="-DGGML_VULKAN=on" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_VULKAN=on" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir ``` @@ -205,16 +206,18 @@ pip install -e . Below is our differentiation from other similar tools: -| **Feature** | **[Nexa SDK](https://github.com/NexaAI/nexa-sdk)** | **[ollama](https://github.com/ollama/ollama)** | **[Optimum](https://github.com/huggingface/optimum)** | **[LM Studio](https://github.com/lmstudio-ai)** | -| -------------------------- | :------------------------------------------------: | :--------------------------------------------: | :---------------------------------------------------: | :---------------------------------------------: | -| **GGML Support** | ✅ | ✅ | ❌ | ✅ | -| **ONNX Support** | ✅ | ❌ | ✅ | ❌ | -| **Text Generation** | ✅ | ✅ | ✅ | ✅ | -| **Image Generation** | ✅ | ❌ | ❌ | ❌ | -| **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ | -| **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ | -| **Server Capability** | ✅ | ✅ | ✅ | ✅ | -| **User Interface** | ✅ | ❌ | ❌ | ✅ | +| **Feature** | **[Nexa SDK](https://github.com/NexaAI/nexa-sdk)** | **[ollama](https://github.com/ollama/ollama)** | **[Optimum](https://github.com/huggingface/optimum)** | **[LM Studio](https://github.com/lmstudio-ai)** | +| --------------------------- | :------------------------------------------------: | :--------------------------------------------: | :---------------------------------------------------: | :---------------------------------------------: | +| **GGML Support** | ✅ | ✅ | ❌ | ✅ | +| **ONNX Support** | ✅ | ❌ | ✅ | ❌ | +| **Text Generation** | ✅ | ✅ | ✅ | ✅ | +| **Image Generation** | ✅ | ❌ | ❌ | ❌ | +| **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ | +| **Audio-Language Models** | ✅ | ❌ | ❌ | ❌ | +| **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ | +| **Server Capability** | ✅ | ✅ | ✅ | ✅ | +| **User Interface** | ✅ | ❌ | ❌ | ✅ | +| **Executable Installation** | ✅ | ✅ | ❌ | ✅ | ## Supported Models & Model Hub @@ -223,11 +226,11 @@ Our on-device model hub offers all types of quantized models (text, image, audio Supported model examples (full list at [Model Hub](https://nexa.ai/models)): | Model | Type | Format | Command | | ------------------------------------------------------------------------------------------------------- | --------------- | --------- | -------------------------------------- | -| 
[omniaudio](https://nexa.ai/NexaAI/Octo-omni-audio/gguf-q4_0/readme) | AudioLM | GGUF | `nexa run omniaudio` | +| [omniaudio](https://nexa.ai/NexaAI/omniaudio/gguf-q4_0/readme) | AudioLM | GGUF | `nexa run omniaudio` | | [qwen2audio](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | AudioLM | GGUF | `nexa run qwen2audio` | | [octopus-v2](https://www.nexaai.com/NexaAI/Octopus-v2/gguf-q4_0/readme) | Function Call | GGUF | `nexa run octopus-v2` | | [octo-net](https://www.nexaai.com/NexaAI/Octo-net/gguf-q4_0/readme) | Text | GGUF | `nexa run octo-net` | -| [omnivision](https://nexa.ai/NexaAI/Octo-omni-vision/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omnivision` | +| [omnivision](https://nexa.ai/NexaAI/omnivision/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omnivision` | | [nanollava](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | Multimodal | GGUF | `nexa run nanollava` | | [llava-phi3](https://www.nexaai.com/xtuner/llava-phi-3-mini/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-phi3` | | [llava-llama3](https://www.nexaai.com/xtuner/llava-llama-3-8b-v1.1/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-llama3` | @@ -255,25 +258,37 @@ Supported model examples (full list at [Model Hub](https://nexa.ai/models)): | [bark-small](https://nexa.ai/suno/bark-small/gguf-fp16/readme) | Text-to-Speech | GGUF | `nexa run bark-small:fp16` | ## Run Models from 🤗 HuggingFace or 🤖 ModelScope + You can pull, convert (to .gguf), quantize and run [llama.cpp supported](https://github.com/ggerganov/llama.cpp#description) text generation models from HF or MS with Nexa SDK. + ### Run .gguf File + Use `nexa run -hf ` or `nexa run -ms ` to run models with provided .gguf files: + ```bash nexa run -hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF ``` + ```bash nexa run -ms Qwen/Qwen2.5-Coder-7B-Instruct-GGUF ``` + > **Note:** You will be prompted to select a single .gguf file. If your desired quantization version has multiple split files (like fp16-00001-of-00004), please use Nexa's conversion tool (see below) to convert and quantize the model locally. + ### Convert .safetensors Files + Install [Nexa Python package](https://github.com/NexaAI/nexa-sdk?tab=readme-ov-file#install-option-2-python-package), and install Nexa conversion tool with `pip install "nexaai[convert]"`, then convert models from huggingface with `nexa convert `: + ```bash nexa convert HuggingFaceTB/SmolLM2-135M-Instruct ``` + Or you can convert models from ModelScope with `nexa convert -ms `: + ```bash nexa convert -ms Qwen/Qwen2.5-7B-Instruct ``` + > **Note:** Check our [leaderboard](https://nexa.ai/leaderboard) for performance benchmarks of different quantized versions of mainstream language models and [HuggingFace docs](https://huggingface.co/docs/optimum/en/concept_guides/quantization) to learn about quantization options. 
📋 You can view downloaded and converted models with `nexa list` diff --git a/dependency/llama.cpp b/dependency/llama.cpp index bb33473f..ed459776 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit bb33473f08db604e1f30334366032f0904e2a722 +Subproject commit ed459776811d0928ce55a001e9e5a6bc3bf22ca4 diff --git a/docs/README.md b/docs/README.md index c6481515..252116f7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -61,6 +61,7 @@ twine upload dist/* ``` git tag git tag -d +git push origin --delete git tag git push origin ``` diff --git a/nexa/__init__.py b/nexa/__init__.py index b53e36d3..488bc026 100644 --- a/nexa/__init__.py +++ b/nexa/__init__.py @@ -1 +1 @@ -__version__ = "0.0.9.2" +__version__ = "0.0.9.4" diff --git a/nexa/cli/entry.py b/nexa/cli/entry.py index b4ea222d..ece27ca7 100644 --- a/nexa/cli/entry.py +++ b/nexa/cli/entry.py @@ -120,8 +120,10 @@ def run_ggml_inference(args): from nexa.gguf.nexa_inference_voice import NexaVoiceInference inference = NexaVoiceInference(model_path=model_path, local_path=local_path, **kwargs) elif run_type == "TTS": - from nexa.gguf.nexa_inference_tts import NexaTTSInference - inference = NexaTTSInference(model_path=model_path, local_path=local_path, **kwargs) + # # Temporarily disabled since version v0.0.9.3 + raise NotImplementedError("TTS model is not supported in CLI mode.") + # from nexa.gguf.nexa_inference_tts import NexaTTSInference + # inference = NexaTTSInference(model_path=model_path, local_path=local_path, **kwargs) elif run_type == "AudioLM": from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference inference = NexaAudioLMInference(model_path=model_path, local_path=local_path, **kwargs) @@ -330,7 +332,7 @@ def _select_quantization_type(): except ValueError: print("Please enter a valid number.") -def _store_in_nexa_list(converted_path, model_type): +def _store_in_nexa_list(converted_path, model_type, input_name, output_ftype): """Helper function to store converted model in nexa list.""" import shutil from nexa.general import add_model_to_list @@ -344,7 +346,8 @@ def _store_in_nexa_list(converted_path, model_type): shutil.copy2(converted_path, nexa_list_path) # Add the new path to the model list - add_model_to_list(os.path.basename(converted_path), nexa_list_path, "gguf", model_type) + nexa_model_name = f"{input_name}:{output_ftype}" + add_model_to_list(nexa_model_name, nexa_list_path, "gguf", model_type) def _run_converted_model(converted_path, model_type): """Helper function to run the converted model.""" @@ -391,7 +394,7 @@ def run_convert(args): try: from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf - converted_path = convert_hf_to_quantized_gguf( + converted_path, input_name, output_ftype = convert_hf_to_quantized_gguf( input_path, output_file=args.output_file, ftype=ftype, @@ -404,7 +407,7 @@ def run_convert(args): # Ask if user wants to store in nexa list store_choice = input("\nWould you like to store this model in nexa list so you can run it with `nexa run ` anywhere and anytime? (y/N): ").strip().lower() if store_choice == 'y': - _store_in_nexa_list(converted_path, model_type) + _store_in_nexa_list(converted_path, model_type, input_name, output_ftype) # Ask if user wants to run the model run_choice = input("\nWould you like to run the converted model? 
(y/N): ").strip().lower() @@ -414,7 +417,8 @@ def run_convert(args): print("Exiting without running the model.") print(f"\nConverted model stored at {converted_path}") - running_command = f"nexa run {converted_path.split('/')[-1]}"\ + nexa_model_name = f"{input_name}:{output_ftype}" + running_command = f"nexa run {nexa_model_name}"\ if store_choice == 'y' else f"nexa run {converted_path} -lp -mt {model_type}" print(f"\nYou can run the converted model with command: {running_command}") else: diff --git a/nexa/general.py b/nexa/general.py index 3190c49d..aa88e710 100644 --- a/nexa/general.py +++ b/nexa/general.py @@ -594,6 +594,15 @@ def is_model_exists(model_name): with open(NEXA_MODEL_LIST_PATH, "r") as f: model_list = json.load(f) + + # For AudioLM and Multimodal models, should check the file location instead of model name + if ":" in model_name: + model_path_with_slash = model_name.replace(":", "/") + + # Check if model_prefix/model_suffix exists in any location path + for model_key, model_info in model_list.items(): + if model_path_with_slash in model_info["location"]: + return model_key return model_name in model_list @@ -606,6 +615,13 @@ def add_model_to_list(model_name, model_location, model_type, run_type): model_list = json.load(f) else: model_list = {} + + # For AudioLM and Multimodal models, should remove the "model-" prefix from the tag name + if run_type == "AudioLM" or run_type == "Multimodal": + tag_name = model_name.split(":")[1] + if tag_name.startswith("model-"): + tag_name = tag_name[6:] + model_name = f"{model_name.split(':')[0]}:{tag_name}" model_list[model_name] = { "type": model_type, @@ -624,11 +640,21 @@ def get_model_info(model_name): with open(NEXA_MODEL_LIST_PATH, "r") as f: model_list = json.load(f) + # First try direct lookup model_data = model_list.get(model_name, {}) - location = model_data.get("location") - run_type = model_data.get("run_type") + if model_data: + return model_data.get("location"), model_data.get("run_type") + + # If not found and model_name contains ":", try path-based lookup + if ":" in model_name: + model_path_with_slash = model_name.replace(":", "/") + + # Check if model_prefix/model_suffix exists in any location path + for model_key, model_info in model_list.items(): + if model_path_with_slash in model_info["location"]: + return model_info["location"], model_info["run_type"] - return location, run_type + return None, None def list_models(): @@ -642,7 +668,7 @@ def list_models(): filtered_list = { model_name: model_info for model_name, model_info in model_list.items() - if not model_name.split(':')[1].startswith('projector') + if ':' not in model_name or not model_name.split(':')[1].startswith('projector') } table = [ diff --git a/nexa/gguf/__init__.py b/nexa/gguf/__init__.py index 9ba70de9..d2afff06 100644 --- a/nexa/gguf/__init__.py +++ b/nexa/gguf/__init__.py @@ -2,13 +2,15 @@ from .nexa_inference_text import NexaTextInference from .nexa_inference_vlm import NexaVLMInference from .nexa_inference_voice import NexaVoiceInference -from .nexa_inference_tts import NexaTTSInference + +# Temporarily disabled since version v0.0.9.3 +# from .nexa_inference_tts import NexaTTSInference __all__ = [ "NexaImageInference", "NexaTextInference", "NexaVLMInference", "NexaVoiceInference", - "NexaTTSInference", + #"NexaTTSInference", "NexaAudioLMInference" ] \ No newline at end of file diff --git a/nexa/gguf/converter/nexa_convert.py b/nexa/gguf/converter/nexa_convert.py index 5e13c16a..7c24771b 100644 --- a/nexa/gguf/converter/nexa_convert.py +++ 
b/nexa/gguf/converter/nexa_convert.py @@ -110,7 +110,7 @@ def convert_hf_to_quantized_gguf( ftype: str = "q4_0", convert_type: str = "f16", **kwargs -) -> Optional[str]: +) -> Optional[tuple[str, str, str]]: """ Convert a model in safetensors format to a quantized GGUF file. @@ -118,14 +118,14 @@ def convert_hf_to_quantized_gguf( It can process both directories containing .safetensors files and existing .gguf files. Args: - input_path (str): Path to the input Hugging Face model directory or GGUF file. + input_path (str): Path in the local file system to the input Hugging Face model directory or GGUF file. output_file (str, optional): Path to the output quantized GGUF file. If None, a default path will be used. ftype (str, optional): Quantization type (default: "q4_0"). convert_type (str, optional): Conversion type for safetensors to GGUF (default: "f16"). **kwargs: Additional keyword arguments for the conversion and quantization process. Returns: - Optional[str]: Path to the output quantized GGUF file if successful, None otherwise. + Optional[tuple[str, str, str]]: Tuple of (output_file_path, input_name, ftype) if successful, None otherwise. Raises: FileNotFoundError: If the input directory or file does not exist. @@ -139,11 +139,13 @@ def convert_hf_to_quantized_gguf( # Convert input path to absolute path input_path = os.path.abspath(input_path) + # Get input name early + input_name = os.path.basename(input_path) + if input_path.endswith('.gguf'): + input_name = os.path.splitext(input_name)[0] # Remove .gguf extension + # Set default output file if not provided if not output_file: - input_name = os.path.basename(input_path) - if input_path.endswith('.gguf'): - input_name = os.path.splitext(input_name)[0] # Remove .gguf extension output_file = os.path.abspath(f"./{input_name}-{ftype}.gguf") else: output_file = os.path.abspath(output_file) @@ -168,7 +170,7 @@ def convert_hf_to_quantized_gguf( # Quantize GGUF model quantize_model(str(tmp_file_path.absolute()), output_file, ftype, **kwargs) - return output_file + return output_file, input_name, ftype finally: # Delete the temporary file if tmp_file_path.exists(): @@ -179,7 +181,7 @@ def convert_hf_to_quantized_gguf( elif input_path.endswith('.gguf'): # Directly call quantize_model with input_path quantize_model(input_file=input_path, output_file=output_file, ftype=ftype, **kwargs) - return output_file + return output_file, input_name, ftype else: logger.error(f"Invalid input path: {input_path}. Must be a directory with .safetensors files or a .gguf file.") return None diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py new file mode 100644 index 00000000..aa01630d --- /dev/null +++ b/nexa/gguf/llama/kv_cache.py @@ -0,0 +1,86 @@ +from nexa.gguf.llama.llama_cache import LlamaDiskCache +from typing import Any, Dict + +def run_inference_with_disk_cache( + model: Any, + cache_prompt: str, + total_prompt: str, + use_cache: bool = True, + cache_dir: str = "llama.cache", + **kwargs: Dict[str, Any] +) -> Any: + """ + Runs inference using a disk cache to store and retrieve model states. + + Parameters: + - model: The model object that supports caching and inference. + - cache_prompt: The prompt used to generate a cache key. + - total_prompt: The full prompt for generating output. + - use_cache: Flag to determine if caching should be used. + - cache_dir: Directory where cache files are stored. + - kwargs: Additional parameters for model inference. + + Returns: + - The output generated by the model. 
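+
+    Example (illustrative sketch only; assumes `llama_model` is an already-loaded
+    Llama-style model object from this package and that the prompt variables are
+    plain strings):
+
+        output = run_inference_with_disk_cache(
+            model=llama_model,
+            cache_prompt=system_prompt,
+            total_prompt=system_prompt + user_prompt,
+            use_cache=True,
+            cache_dir="llama.cache",
+            temperature=0.7,
+            max_tokens=512,
+        )
+        # Streaming output: each chunk is a completion-style dict
+        for chunk in output:
+            print(chunk["choices"][0]["text"], end="", flush=True)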
+ """ + temperature = kwargs.get('temperature', 0.7) + max_tokens = kwargs.get('max_tokens', 2048) + top_p = kwargs.get('top_p', 0.8) + top_k = kwargs.get('top_k', 50) + repeat_penalty = kwargs.get('repeat_penalty', 1.0) + + if use_cache: + # Initialize disk cache with specified directory + cache_context = LlamaDiskCache(cache_dir=cache_dir) + model.set_cache(cache_context) + # Convert prompt to tokens for cache key + prompt_tokens = model.tokenize(cache_prompt.encode("utf-8")) + + try: + # Try to load existing cache + cached_state = cache_context[prompt_tokens] + model.load_state(cached_state) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + stream=True, + ) + except KeyError: + # If cache doesn't exist, create it + model.reset() + # Run initial inference to populate cache + _ = model( + cache_prompt, + max_tokens=1, # Minimal tokens for cache creation + temperature=temperature, + echo=False, + ) + # Save the state to cache + cache_context[prompt_tokens] = model.save_state() + + # Generate output after creating cache + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + else: + model.reset() + model.set_cache(None) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + return output \ No newline at end of file diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index d7b241e7..0007b515 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -31,6 +31,7 @@ from nexa.gguf.llama.llama_types import * from nexa.gguf.llama.llama_grammar import LlamaGrammar +from nexa.gguf.llama.llama_cache import BaseLlamaCache from nexa.gguf.llama.llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import nexa.gguf.llama.llama_cpp as llama_cpp import nexa.gguf.llama.llama_chat_format as llama_chat_format @@ -350,6 +351,8 @@ def __init__( # Sampling Params self.last_n_tokens_size = last_n_tokens_size + self.cache: Optional[BaseLlamaCache] = None + self.lora_base = lora_base self.lora_scale = lora_scale self.lora_path = lora_path @@ -596,6 +599,14 @@ def detokenize( The detokenized string. """ return self.tokenizer_.detokenize(tokens, prev_tokens=prev_tokens, special=special) + + def set_cache(self, cache: Optional[BaseLlamaCache]): + """Set the cache. + + Args: + cache: The cache to set. + """ + self.cache = cache def set_seed(self, seed: int): """Set the random seed. 
@@ -1211,6 +1222,23 @@ def logit_bias_processor( raise ValueError( "logprobs is not supported for models created with logits_all=False" ) + + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.input_ids.tolist(), prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self._input_ids.tolist(), prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) if seed is not None: self._ctx.set_rng_seed(seed) @@ -1552,8 +1580,19 @@ def logit_bias_processor( } ], } + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + print("Llama._create_completion: cache saved", file=sys.stderr) return + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + text_str = text.decode("utf-8", errors="ignore") if echo: diff --git a/nexa/transformers/README.md b/nexa/transformers/README.md deleted file mode 100644 index c539b454..00000000 --- a/nexa/transformers/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# transformers support for Nexa AI models - -``` -python run_omnivision.py -``` - -## Acknowledgements -We thank the [Hugging Face Transformers](https://github.com/huggingface/transformers) for their amazing work on the Transformers library. diff --git a/nexa/transformers/__init__.py b/nexa/transformers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nexa/transformers/omnivision/__init__.py b/nexa/transformers/omnivision/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nexa/transformers/omnivision/configuration.py b/nexa/transformers/omnivision/configuration.py deleted file mode 100644 index d356a315..00000000 --- a/nexa/transformers/omnivision/configuration.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging -from typing import Union -from transformers import PretrainedConfig -import os -from transformers.models.auto import CONFIG_MAPPING - -logger = logging.get_logger(__name__) - - -class SigLipVisionConfig(PretrainedConfig): - model_type = "siglip_vision_model" - def __init__( - self, - hidden_size=1152, - image_mean=(0.5, 0.5, 0.5), - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.image_mean = image_mean - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SigLipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - return cls.from_dict(config_dict, **kwargs) - - -""" Nexa AI model configuration""" -class OminiVLMConfig(PretrainedConfig): - model_type = "nano-omini-vlm" - - model_type = "omini_vlm" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vision_config=None, - text_config=None, - hidden_size=4096, - mm_hidden_size=1152, - mm_projector_lr=None, - mm_projector_type="mlp2x_gelu", - image_token_index=151655, - initializer_range=0.02, - **kwargs, - ): - self.hidden_size = hidden_size - self.mm_hidden_size = mm_hidden_size - self.mm_projector_lr = mm_projector_lr - self.mm_projector_type = mm_projector_type - self.image_token_index = image_token_index - self.initializer_range = initializer_range - if isinstance(vision_config, dict): - vision_config = SigLipVisionConfig(**vision_config) - elif vision_config is None: - vision_config = SigLipVisionConfig( - hidden_size=1152, - image_mean=(0.5, 0.5, 0.5), - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = ( - text_config["model_type"] if "model_type" in text_config else "qwen2" - ) - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]() - - self.text_config = text_config - - super().__init__(**kwargs) - \ No newline at end of file diff --git a/nexa/transformers/omnivision/modeling.py b/nexa/transformers/omnivision/modeling.py deleted file mode 100644 index 94bd67b1..00000000 --- a/nexa/transformers/omnivision/modeling.py +++ /dev/null @@ -1,709 +0,0 @@ -# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union -from dataclasses import dataclass -from transformers.activations import ACT2FN -import torch.utils.checkpoint -from torch import nn -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ModelOutput -from transformers import Qwen2ForCausalLM -from .configuration import SigLipVisionConfig, OminiVLMConfig - -# ======================================================================================== # -# vision tower # -# ======================================================================================== # -@dataclass -class SigLipVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. 
- - Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class OminiVLMOutputWithPast(ModelOutput): - """ - Base class for Gemma2Audio causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - attention_mask (`torch.FloatTensor`, *optional*): - Attentions mask, used to update attention mask and position_ids. 
- """ - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - past_key_values: Optional[List[torch.FloatTensor]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - attention_mask: Optional[torch.FloatTensor] = None - - -class SigLipVisionEmbeddings(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -class SigLipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - self.scale = self.head_dim ** -0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): - raise ValueError( - f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights - - -class SigLipMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class SigLipEncoderLayer(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = SigLipAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = SigLipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states 
(`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - attention_mask (`torch.FloatTensor`): - Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class SigLipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SigLipVisionConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - pass - - -class SigLipEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`SigLipEncoderLayer`]. - - Args: - config: SigLipVisionConfig - """ - - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class SigLipMultiheadAttentionPoolingHead(nn.Module): - """Multihead Attention Pooling.""" - - def __init__(self, config: SigLipVisionConfig): - super().__init__() - - self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) - self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = SigLipMLP(config) - - def forward(self, hidden_state): - batch_size = hidden_state.shape[0] - probe = self.probe.repeat(batch_size, 1, 1) - - hidden_state = self.attention(probe, hidden_state, hidden_state)[0] - - residual = hidden_state - hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) - - return hidden_state[:, 0] - - -class SigLipVisionTransformer(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SigLipVisionEmbeddings(config) - self.encoder = SigLipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.head = SigLipMultiheadAttentionPoolingHead(config) - - def get_dtype(self) -> torch.dtype: - return self.encoder.layers[0].mlp.fc2.weight.dtype - - def forward( - self, - pixel_values, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - hidden_states = self.embeddings(pixel_values) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - 
        pooled_output = self.head(last_hidden_state)
-
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPooling(
-            last_hidden_state=last_hidden_state,
-            pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-        )
-
-
-class SigLipVisionModel(SigLipPreTrainedModel):
-    config_class = SigLipVisionConfig
-    main_input_name = "pixel_values"
-    _no_split_modules = ["SigLipEncoderLayer"]
-
-    def __init__(self, config: SigLipVisionConfig):
-        super().__init__(config)
-        self.vision_model = SigLipVisionTransformer(config)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self) -> nn.Module:
-        return self.vision_model.embeddings.patch_embedding
-
-    def forward(
-        self,
-        pixel_values,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPooling]:
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        return self.vision_model(
-            pixel_values=pixel_values,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-
-# ======================================================================================== #
-#                                        Projector                                         #
-# ======================================================================================== #
-
-import re
-def build_vision_projector(config, delay_load=False, **kwargs):
-    projector_type = getattr(config, 'mm_projector_type', 'mlp2x_gelu')
-    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
-    if mlp_gelu_match:
-        mlp_depth = int(mlp_gelu_match.group(1))
-        modules = [nn.Linear(config.mm_hidden_size*9, config.hidden_size)]
-        for _ in range(1, mlp_depth):
-            modules.append(nn.GELU())
-            modules.append(nn.Linear(config.hidden_size, config.text_config.hidden_size))
-        return nn.Sequential(*modules)
-
-
-# ======================================================================================== #
-#                                            LLM                                           #
-# ======================================================================================== #
-class OminiVLMPreTrainedModel(PreTrainedModel):
-    config_class = OminiVLMConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen2DecoderLayer", "SigLipEncoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    _supports_cache_class = True
-    _supports_static_cache = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, (nn.Linear, nn.Conv3d)):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-class OminiVLMForConditionalGeneration(OminiVLMPreTrainedModel):
-    def __init__(self, config: OminiVLMConfig):
-        super().__init__(config)
-        if isinstance(config.vision_config, dict):
-            vision_config = SigLipVisionConfig(**config.vision_config)
-        else:
-            vision_config = config.vision_config
-        self.vision_tower = SigLipVisionModel(vision_config)
-        self.multi_modal_projector = build_vision_projector(config)
-        self.vocab_size = config.text_config.vocab_size
-        self.language_model = Qwen2ForCausalLM(
-            config.text_config,
-        )
-        self.pad_token_id = (
-            self.config.pad_token_id if self.config.pad_token_id is not None else -1
-        )
-        self._padding_side = "right"  # set it to left by default, user can use setter to change padding_sides
-        self.post_init()
-
-    @property
-    def padding_side(self):
-        return self._padding_side
-
-    @padding_side.setter
-    def padding_side(self, padding_side: str):
-        if padding_side not in ["left", "right"]:
-            raise ValueError(f"{padding_side} is not `left` or `right`.")
-        self._padding_side = padding_side
-
-    def get_input_embeddings(self):
-        return self.language_model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.language_model.set_input_embeddings(value)
-
-    def get_output_embeddings(self):
-        return self.language_model.get_output_embeddings()
-
-    def set_output_embeddings(self, new_embeddings):
-        self.language_model.set_output_embeddings(new_embeddings)
-
-    def set_decoder(self, decoder):
-        self.language_model.set_decoder(decoder)
-
-    def get_decoder(self):
-        return self.language_model.get_decoder()
-
-    def tie_weights(self):
-        return self.language_model.tie_weights()
-
-    def resize_token_embeddings(
-        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None
-    ) -> nn.Embedding:
-        model_embeds = self.language_model.resize_token_embeddings(
-            new_num_tokens, pad_to_multiple_of
-        )
-        # update vocab size
-        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.vocab_size = model_embeds.num_embeddings
-        return model_embeds
-
-    def _update_model_kwargs_for_generation(
-        self,
-        outputs: ModelOutput,
-        model_kwargs: Dict[str, Any],
-        is_encoder_decoder: bool = False,
-        num_new_tokens: int = 1,
-    ) -> Dict[str, Any]:
-        model_kwargs = super()._update_model_kwargs_for_generation(
-            outputs=outputs,
-            model_kwargs=model_kwargs,
-            is_encoder_decoder=is_encoder_decoder,
-            num_new_tokens=num_new_tokens,
-        )
-        return model_kwargs
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-    ) -> Union[Tuple, OminiVLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Returns:
-        ```"""
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        target_device = self.vision_tower.device
-
-        if pixel_values is not None:
-            pixel_values = pixel_values.to(target_device)
-
-        if inputs_embeds is None:
-            # 1. Extract the input embeddings
-            inputs_embeds = self.get_input_embeddings()(input_ids)
-
-            # 2. Merge text and vision features
-            if pixel_values is not None:
-                pixel_values = pixel_values.type(self.vision_tower.vision_model.get_dtype())
-                image_embeds = self.vision_tower(pixel_values).last_hidden_state.to(pixel_values.dtype)
-                image_embeds = image_embeds.view(image_embeds.shape[0], 81, -1)
-                image_embeds = self.multi_modal_projector(image_embeds)
-                image_mask = (
-                    (input_ids == self.config.image_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
-                )
-                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-
-                ## This is to intelligently replace the image tokens with the image features
-                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
-
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(inputs_embeds.device)
-
-        outputs = self.language_model(
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        logits = outputs[0]
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = logits[..., :-1, :][
-                    shift_attention_mask.to(logits.device) != 0
-                ].contiguous()
-                shift_labels = labels[..., 1:][
-                    shift_attention_mask.to(labels.device) != 0
-                ].contiguous()
-            else:
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(
-                shift_logits.view(-1, shift_logits.size(-1)),
-                shift_labels.view(-1).to(shift_logits.device),
-            )
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return OminiVLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            attention_mask=attention_mask,
-        )
\ No newline at end of file
diff --git a/nexa/transformers/omnivision/processing.py b/nexa/transformers/omnivision/processing.py
deleted file mode 100644
index 2bc3f008..00000000
--- a/nexa/transformers/omnivision/processing.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Union
-
-try:
-    from typing import Unpack
-except ImportError:
-    from typing_extensions import Unpack
-
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput, VideoInput
-from transformers.processing_utils import (
-    ProcessingKwargs,
-    ProcessorMixin,
-)
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-NUM_IMAGE_TOKENS = 81
-
-class NanoVLMProcessorKwargs(ProcessingKwargs, total=False):
-    _defaults = {
-        "text_kwargs": {
-            "padding": False,
-        },
-    }
-
-
-class NanoVLMProcessor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer"]
-    valid_kwargs = ["chat_template"]
-    image_processor_class = "SiglipImageProcessor"
-    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
-
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
-        if chat_template is None:
-            chat_template = self.default_chat_template
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
-    def __call__(
-        self,
-        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        **kwargs: Unpack[NanoVLMProcessorKwargs],
-    ) -> BatchFeature:
-        """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to Gemma2TokenizerFast's [`~Gemma2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Gemma2VLImageProcessor's [`~Gemma2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
-
-        Args:
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors of a particular framework. Acceptable values are:
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.
-
-        Returns:
-            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
-              `None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- """ - output_kwargs = self._merge_kwargs( - NanoVLMProcessorKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - ) - - # check the number of images is equal to the number of all image_pad tokens - assert len(images) == sum([t.count("<|image_pad|>") for t in text]), "The number of images must be equal to the number of all image_pad tokens in the text." - - if images is not None: - image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - else: - image_inputs = {} - - if not isinstance(text, list): - text = [text] - - if image_inputs is not None: - index = 0 - for i in range(len(text)): - while "<|image_pad|>" in text[i]: - text[i] = text[i].replace( - "<|image_pad|>", "<|placeholder|>" * NUM_IMAGE_TOKENS, 1 - ) - index += 1 - text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>") - - _ = output_kwargs["text_kwargs"].pop("padding_side", None) - text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - return BatchFeature(data={**text_inputs, **image_inputs}) - - def batch_decode(self, *args, **kwargs): - """ - This method forwards all its arguments to Gemma2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please - refer to the docstring of this method for more information. - """ - return self.tokenizer.batch_decode(*args, **kwargs) - - def decode(self, *args, **kwargs): - """ - This method forwards all its arguments to Gemma2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to - the docstring of this method for more information. - """ - return self.tokenizer.decode(*args, **kwargs) - - @property - def model_input_names(self): - tokenizer_input_names = self.tokenizer.model_input_names - image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - - @property - def default_chat_template(self): - return ( - "{%- if tools %}" - "{{- '<|im_start|>system\n' }}" - "{%- if messages[0]['role'] == 'system' %}" - "{{- messages[0]['content'] }}" - "{%- else %}" - "{{- 'You are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.' }}" - "{%- endif %}" - "{{- \"\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\" }}" - "{%- for tool in tools %}" - "{{- \"\n\" }}" - "{{- tool | tojson }}" - "{%- endfor %}" - "{{- \"\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\\\"name\\\": , \\\"arguments\\\": }\n<|im_end|>\n\" }}" - "{%- else %}" - "{%- if messages[0]['role'] == 'system' %}" - "{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}" - "{%- else %}" - "{{- '<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- for message in messages %}" - "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" - "{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}" - "{%- elif message.role == \"assistant\" %}" - "{{- '<|im_start|>' + message.role }}" - "{%- if message.content %}" - "{{- '\n' + message.content }}" - "{%- endif %}" - "{%- for tool_call in message.tool_calls %}" - "{%- if tool_call.function is defined %}" - "{%- set tool_call = tool_call.function %}" - "{%- endif %}" - "{{- '\n\n{\"name\": \"' }}" - "{{- tool_call.name }}" - "{{- '\", \"arguments\": ' }}" - "{{- tool_call.arguments | tojson }}" - "{{- '}\n' }}" - "{%- endfor %}" - "{{- '<|im_end|>\n' }}" - "{%- elif message.role == \"tool\" %}" - "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" - "{{- '<|im_start|>user' }}" - "{%- endif %}" - "{{- '\n\n' }}" - "{{- message.content }}" - "{{- '\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" - "{{- '<|im_end|>\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - "{%- if add_generation_prompt %}" - "{{- '<|im_start|>assistant\n' }}" - "{%- endif %}" - ) \ No newline at end of file diff --git a/nexa/transformers/run_omnivision.py b/nexa/transformers/run_omnivision.py deleted file mode 100644 index f81d1efe..00000000 --- a/nexa/transformers/run_omnivision.py +++ /dev/null @@ -1,92 +0,0 @@ -from nexa.transformers.omnivision.processing import NanoVLMProcessor -from nexa.transformers.omnivision.modeling import OminiVLMForConditionalGeneration -import argparse -import torch - - -model_name = "NexaAIDev/omnivlm-dpo" -image_url = "https://public-storage.nexa4ai.com/public-images/cat.png" - - -def get_device(): - if torch.cuda.is_available(): - return "cuda" - elif torch.backends.mps.is_available(): - return "mps" - return "cpu" - - -def load_model_and_processor(model_path): - device = get_device() - proc_path = "nexa-collaboration/nano-vlm-processor" - processor = NanoVLMProcessor.from_pretrained(proc_path) - processor.tokenizer.pad_token = processor.tokenizer.eos_token - processor.tokenizer.padding_side = "right" - - model_kwargs = {} - # Adjust dtype based on device - dtype = torch.bfloat16 if device == "cuda" else torch.float32 - local_model = OminiVLMForConditionalGeneration.from_pretrained( - model_path, - torch_dtype=dtype, - **model_kwargs - ) - local_model = local_model.to(device) - return local_model, processor - - -def process_single_image(processor, image_path, input_prompt=None): - text = f"<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_prompt}\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>" - # Changed from Image.open() to handle URLs - if image_path.startswith('http'): - from PIL import Image - import requests - from io import BytesIO - response = requests.get(image_path) - image = Image.open(BytesIO(response.content)).convert('RGB') - else: - image = Image.open(image_path).convert('RGB') - inputs = processor( - text=[text], - images=[image], - padding=True, - return_tensors="pt", - ) - return inputs.to(get_device()) - - -def generate_output(model, processor, inputs, max_tokens): - cur_ids = inputs['input_ids'] - cur_attention_mask = inputs['attention_mask'] - input_token_length = cur_ids.shape[-1] - for _ in range(max_tokens): - out = model( - cur_ids, - attention_mask=cur_attention_mask, - pixel_values=inputs['pixel_values'], - use_cache=False - ) - next_token = out.logits[:, -1].argmax() - next_word = processor.decode(next_token) - cur_ids = torch.cat([cur_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1) - cur_attention_mask = torch.cat([cur_attention_mask, torch.ones_like(next_token).unsqueeze(0).unsqueeze(0)], dim=-1) - if next_word in ("<|im_end|>"): - break - return processor.batch_decode(cur_ids[:, input_token_length:])[0] - -def main(args): - model, processor = load_model_and_processor(args.model_path) - inputs = process_single_image(processor, args.image_path, args.input_prompt) - output = generate_output(model, processor, inputs, args.max_tokens) - print("=== Inference Result ===\n", output) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Inference script for Nano-Omni-VLM") - parser.add_argument("--model_path", default=model_name, help="Path to the model checkpoint") - # Add image_path argument - parser.add_argument("--image_path", default=image_url, help="Path to input image or image URL") - parser.add_argument("--input_prompt", type=str, default="Describe this image for me", help="Input prompt for instruct task") - parser.add_argument("--max_tokens", type=int, default=512, help="Maximum number of tokens to generate") - - args = parser.parse_args() - main(args) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7e9c6478..5e3b1b1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,12 +80,6 @@ convert = [ "nexa-gguf", ] -transformers = [ - "transformers", - "torch", - "pillow" -] - [project.urls] Homepage = "https://github.com/NexaAI/nexa-sdk" Issues = "https://github.com/NexaAI/nexa-sdk/issues" diff --git a/tests/test_tts_generation.py b/tests/test_tts_generation.py index 5d55ed4d..2dc9c526 100644 --- a/tests/test_tts_generation.py +++ b/tests/test_tts_generation.py @@ -1,22 +1,24 @@ -from nexa.gguf import NexaTTSInference +# Temporarily disabled since version v0.0.9.3 -def test_tts_generation(): - tts = NexaTTSInference( - model_path="bark-small", - local_path=None, - n_threads=4, - seed=42, - sampling_rate=24000, - verbosity=2 - ) +# from nexa.gguf import NexaTTSInference + +# def test_tts_generation(): +# tts = NexaTTSInference( +# model_path="bark-small", +# local_path=None, +# n_threads=4, +# seed=42, +# sampling_rate=24000, +# verbosity=2 +# ) - # Generate audio from prompt - prompt = "Hello, this is a test of the Bark text to speech system." - audio_data = tts.audio_generation(prompt) +# # Generate audio from prompt +# prompt = "Hello, this is a test of the Bark text to speech system." 
+# audio_data = tts.audio_generation(prompt) - # Save the generated audio - tts._save_audio(audio_data, tts.sampling_rate, "tts_output") - print("TTS generation test completed successfully!") +# # Save the generated audio +# tts._save_audio(audio_data, tts.sampling_rate, "tts_output") +# print("TTS generation test completed successfully!") -if __name__ == "__main__": - test_tts_generation() \ No newline at end of file +# if __name__ == "__main__": +# test_tts_generation() \ No newline at end of file