diff --git a/.github/workflows/build-wheels-vulkan-win.yaml b/.github/workflows/build-wheels-vulkan-win.yaml index 4683f8f8..ca68208d 100644 --- a/.github/workflows/build-wheels-vulkan-win.yaml +++ b/.github/workflows/build-wheels-vulkan-win.yaml @@ -88,6 +88,6 @@ jobs: uses: softprops/action-gh-release@v2 with: files: dist/* - tag_name: ${{ github.ref_name }}-vulkan${{ env.VULKAN_VERSION }} + tag_name: ${{ github.ref_name }}-vulkan env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index 11741cab..b993847a 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -3,7 +3,7 @@ name: Wheels Index on: # Trigger on new release workflow_run: - workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)", "Build Wheels (ROCm)", "Build Wheels (Vulkan)"] + workflows: ["Build Wheels (CPU)", "Build Wheels (CUDA)", "Build Wheels (Metal)", "Build Wheels (ROCm)", "Build Wheels (Vulkan)"] types: - completed diff --git a/CLI.md b/CLI.md index 06ed3a33..5f219047 100644 --- a/CLI.md +++ b/CLI.md @@ -46,11 +46,12 @@ nexa pull MODEL_PATH usage: nexa pull [-h] model_path positional arguments: - model_path Path or identifier for the model in Nexa Model Hub, or Hugging Face repo ID when using -hf flag + model_path Path or identifier for the model in Nexa Model Hub, Hugging Face repo ID when using -hf flag, or ModelScope model ID when using -ms flag options: -h, --help show this help message and exit -hf, --huggingface Pull model from Hugging Face Hub + -ms, --modelscope Pull model from ModelScope Hub -o, --output_path OUTPUT_PATH Custom output path for the pulled model ``` @@ -102,7 +103,7 @@ You can run any model shown in `nexa list` command. 
``` nexa run MODEL_PATH -usage: nexa run [-h] [-t TEMPERATURE] [-m MAX_NEW_TOKENS] [-k TOP_K] [-p TOP_P] [-sw [STOP_WORDS ...]] [-pf] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] model_path +usage: nexa run [-h] [-t TEMPERATURE] [-m MAX_NEW_TOKENS] [-k TOP_K] [-p TOP_P] [-sw [STOP_WORDS ...]] [-pf] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] [-ms] model_path positional arguments: model_path Path or identifier for the model in Nexa Model Hub @@ -112,8 +113,9 @@ options: -pf, --profiling Enable profiling logs for the inference process -st, --streamlit Run the inference in Streamlit UI, can be used with -lp or -hf -lp, --local_path Indicate that the model path provided is the local path - -mt, --model_type Indicate the model running type, must be used with -lp or -hf, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] + -mt, --model_type Indicate the model running type, must be used with -lp or -hf or -ms, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] -hf, --huggingface Load model from Hugging Face Hub + -ms, --modelscope Load model from ModelScope Hub Text generation options: -t, --temperature TEMPERATURE @@ -137,7 +139,7 @@ nexa run llama2 ``` nexa run MODEL_PATH -usage: nexa run [-h] [-i2i] [-ns NUM_INFERENCE_STEPS] [-np NUM_IMAGES_PER_PROMPT] [-H HEIGHT] [-W WIDTH] [-g GUIDANCE_SCALE] [-o OUTPUT] [-s RANDOM_SEED] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] model_path +usage: nexa run [-h] [-i2i] [-ns NUM_INFERENCE_STEPS] [-np NUM_IMAGES_PER_PROMPT] [-H HEIGHT] [-W WIDTH] [-g GUIDANCE_SCALE] [-o OUTPUT] [-s RANDOM_SEED] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] [-ms] model_path positional arguments: model_path Path or identifier for the model in Nexa Model Hub @@ -146,8 +148,9 @@ options: -h, --help show this help message and exit -st, --streamlit Run the inference in Streamlit UI, can be used with -lp or -hf -lp, --local_path Indicate that the model path provided is the local path - -mt, --model_type Indicate the model running type, must be used with -lp or -hf, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] + -mt, --model_type Indicate the model running type, must be used with -lp or -hf or -ms, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] -hf, --huggingface Load model from Hugging Face Hub + -ms, --modelscope Load model from ModelScope Hub Image generation options: -i2i, --img2img Whether to run image-to-image generation @@ -182,7 +185,7 @@ nexa run sd1-4 ``` nexa run MODEL_PATH -usage: nexa run [-h] [-t TEMPERATURE] [-m MAX_NEW_TOKENS] [-k TOP_K] [-p TOP_P] [-sw [STOP_WORDS ...]] [-pf] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] model_path +usage: nexa run [-h] [-t TEMPERATURE] [-m MAX_NEW_TOKENS] [-k TOP_K] [-p TOP_P] [-sw [STOP_WORDS ...]] [-pf] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] [-ms] model_path positional arguments: model_path Path or identifier for the model in Nexa Model Hub @@ -192,8 +195,9 @@ options: -pf, --profiling Enable profiling logs for the inference process -st, --streamlit Run the inference in Streamlit UI, can be used with -lp or -hf -lp, --local_path Indicate that the model path provided is the local path - -mt, --model_type Indicate the model running type, must be used with -lp or -hf, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] + -mt, --model_type Indicate the model running type, must be used with -lp or -hf or -ms, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] -hf, --huggingface 
Load model from Hugging Face Hub + -ms, --modelscope Load model from ModelScope Hub VLM generation options: -t, --temperature TEMPERATURE @@ -217,7 +221,7 @@ nexa run nanollava ``` nexa run MODEL_PATH -usage: nexa run [-h] [-o OUTPUT_DIR] [-b BEAM_SIZE] [-l LANGUAGE] [--task TASK] [-t TEMPERATURE] [-c COMPUTE_TYPE] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] model_path +usage: nexa run [-h] [-o OUTPUT_DIR] [-b BEAM_SIZE] [-l LANGUAGE] [--task TASK] [-t TEMPERATURE] [-c COMPUTE_TYPE] [-st] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] [-ms] model_path positional arguments: model_path Path or identifier for the model in Nexa Model Hub @@ -226,8 +230,9 @@ options: -h, --help show this help message and exit -st, --streamlit Run the inference in Streamlit UI, can be used with -lp or -hf -lp, --local_path Indicate that the model path provided is the local path - -mt, --model_type Indicate the model running type, must be used with -lp or -hf, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] + -mt, --model_type Indicate the model running type, must be used with -lp or -hf or -ms, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] -hf, --huggingface Load model from Hugging Face Hub + -ms, --modelscope Load model from ModelScope Hub Automatic Speech Recognition options: -b, --beam_size BEAM_SIZE @@ -251,7 +256,7 @@ nexa run faster-whisper-tiny ``` nexa embed MODEL_PATH -usage: nexa embed [-h] [-lp] [-hf] [-n] [-nt] model_path prompt +usage: nexa embed [-h] [-lp] [-hf] [-ms] [-n] [-nt] model_path prompt positional arguments: model_path Path or identifier for the model in Nexa Model Hub @@ -261,6 +266,7 @@ options: -h, --help show this help message and exit -lp, --local_path Indicate that the model path provided is the local path -hf, --huggingface Load model from Hugging Face Hub + -ms, --modelscope Load model from ModelScope Hub -n, --normalize Normalize the embeddings -nt, --no_truncate Not truncate the embeddings ``` @@ -318,6 +324,7 @@ options: --only_copy Only copy tensors (ignores ftype, allow_requantize, and quantize_output_tensor) --pure Quantize all tensors to the default type --keep_split Quantize to the same number of shards + -ms --modelscope Load model from ModelScope Hub ``` #### Example @@ -341,7 +348,7 @@ Start a local server using models on your local computer. 
``` nexa server MODEL_PATH -usage: nexa server [-h] [--host HOST] [--port PORT] [--reload] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] model_path +usage: nexa server [-h] [--host HOST] [--port PORT] [--reload] [-lp] [-mt {NLP, COMPUTER_VISION, MULTIMODAL, AUDIO}] [-hf] [-ms] model_path positional arguments: model_path Path or identifier for the model in S3 @@ -349,8 +356,9 @@ positional arguments: options: -h, --help show this help message and exit -lp, --local_path Indicate that the model path provided is the local path - -mt, --model_type Indicate the model running type, must be used with -lp or -hf, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] + -mt, --model_type Indicate the model running type, must be used with -lp or -hf or -ms, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO] -hf, --huggingface Load model from Hugging Face Hub + -ms, --modelscope Load model from ModelScope Hub --host HOST Host to bind the server to --port PORT Port to bind the server to --reload Enable automatic reloading on code changes diff --git a/CMakeLists.txt b/CMakeLists.txt index e1f28a3d..41738eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,7 +188,8 @@ if(LLAMA_BUILD) endif() # bark_cpp project -option(BARK_BUILD "Build bark.cpp" ON) +# Temporarily disabled since version v0.0.9.3 +option(BARK_BUILD "Build bark.cpp" OFF) if(BARK_BUILD) # Filter out HIPBLAS and Vulkan options for bark.cpp since it doesn't support them set(BARK_CMAKE_OPTIONS ${USER_DEFINED_OPTIONS}) diff --git a/README.md b/README.md index fd245fc2..daa71a1e 100644 --- a/README.md +++ b/README.md @@ -20,12 +20,12 @@ - **Local UI:** Streamlit for interactive model deployment and testing ## Latest News 🔥 - -- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B): `nexa run omniaudio` +- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B parameters): `nexa run omniaudio` - Support audio language model: `nexa run qwen2audio`, **we are the first open-source toolkit to support audio language model with GGML tensor library.** +- Support iOS Swift binding for local inference on **iOS mobile** devices. - Support embedding model: `nexa embed ` -- Support pull and run supported Computer Vision models in GGUF format from HuggingFace: `nexa run -hf -mt COMPUTER_VISION` -- Support pull and run NLP models in GGUF format from HuggingFace: `nexa run -hf -mt NLP` +- Support pull and run supported Computer Vision models in GGUF format from HuggingFace or ModelScope: `nexa run -hf -mt COMPUTER_VISION` or `nexa run -ms -mt COMPUTER_VISION` +- Support pull and run NLP models in GGUF format from HuggingFace or ModelScope: `nexa run -hf -mt NLP` or `nexa run -ms -mt NLP` Welcome to submit your requests through [issues](https://github.com/NexaAI/nexa-sdk/issues/new/choose), we ship weekly. 
@@ -212,9 +212,11 @@ Below is our differentiation from other similar tools: | **Text Generation** | ✅ | ✅ | ✅ | ✅ | | **Image Generation** | ✅ | ❌ | ❌ | ❌ | | **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ | +| **Audio-Language Models** | ✅ | ❌ | ❌ | ❌ | | **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ | | **Server Capability** | ✅ | ✅ | ✅ | ✅ | | **User Interface** | ✅ | ❌ | ❌ | ✅ | +| **Executable Installation** | ✅ | ✅ | ❌ | ✅ | ## Supported Models & Model Hub @@ -254,19 +256,26 @@ Supported model examples (full list at [Model Hub](https://nexa.ai/models)): | [all-MiniLM-L12-v2](https://nexa.ai/sentence-transformers/all-MiniLM-L12-v2/gguf-fp16/readme) | Embedding | GGUF | `nexa embed all-MiniLM-L12-v2:fp16` | | [bark-small](https://nexa.ai/suno/bark-small/gguf-fp16/readme) | Text-to-Speech | GGUF | `nexa run bark-small:fp16` | -## Run Models from 🤗 HuggingFace -You can pull, convert (to .gguf), quantize and run [llama.cpp supported](https://github.com/ggerganov/llama.cpp#description) text generation models from HF with Nexa SDK. +## Run Models from 🤗 HuggingFace or 🤖 ModelScope +You can pull, convert (to .gguf), quantize and run [llama.cpp supported](https://github.com/ggerganov/llama.cpp#description) text generation models from HF or MS with Nexa SDK. ### Run .gguf File -Use `nexa run -hf ` to run models with provided .gguf files: +Use `nexa run -hf ` or `nexa run -ms ` to run models with provided .gguf files: ```bash nexa run -hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF ``` +```bash +nexa run -ms Qwen/Qwen2.5-Coder-7B-Instruct-GGUF +``` > **Note:** You will be prompted to select a single .gguf file. If your desired quantization version has multiple split files (like fp16-00001-of-00004), please use Nexa's conversion tool (see below) to convert and quantize the model locally. ### Convert .safetensors Files -Install [Nexa Python package](https://github.com/NexaAI/nexa-sdk?tab=readme-ov-file#install-option-2-python-package), and install Nexa conversion tool with `pip install "nexaai[convert]"`, then convert models with `nexa convert `: +Install [Nexa Python package](https://github.com/NexaAI/nexa-sdk?tab=readme-ov-file#install-option-2-python-package), and install Nexa conversion tool with `pip install "nexaai[convert]"`, then convert models from huggingface with `nexa convert `: ```bash nexa convert HuggingFaceTB/SmolLM2-135M-Instruct ``` +Or you can convert models from ModelScope with `nexa convert -ms `: +```bash +nexa convert -ms Qwen/Qwen2.5-7B-Instruct +``` > **Note:** Check our [leaderboard](https://nexa.ai/leaderboard) for performance benchmarks of different quantized versions of mainstream language models and [HuggingFace docs](https://huggingface.co/docs/optimum/en/concept_guides/quantization) to learn about quantization options. 
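The same pull flow is also exposed programmatically through `pull_model` in `nexa/general.py`, which gains an `ms` flag later in this diff. A minimal sketch, assuming the ModelScope repo ID from the example above; as with the CLI, you are prompted to pick a single .gguf file:

```python
from nexa.general import pull_model

# Pull a GGUF model from ModelScope (ms=True) instead of the Nexa hub or Hugging Face.
# Returns the local file path and the run type; hub pulls default to "NLP".
local_path, run_type = pull_model("Qwen/Qwen2.5-Coder-7B-Instruct-GGUF", ms=True)
print(f"Model stored at {local_path} (run type: {run_type})")
```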
📋 You can view downloaded and converted models with `nexa list`
diff --git a/SERVER.md b/SERVER.md
index 77c9b0b5..b75efa45 100644
--- a/SERVER.md
+++ b/SERVER.md
@@ -9,8 +9,9 @@ usage: nexa server [-h] [--host HOST] [--port PORT] [--reload] model_path
 ### Options:
 
 - `-lp, --local_path`: Indicate that the model path provided is the local path
-- `-mt, --model_type`: Indicate the model running type, must be used with -lp or -hf, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO]
+- `-mt, --model_type`: Indicate the model running type, must be used with -lp or -hf or -ms, choose from [NLP, COMPUTER_VISION, MULTIMODAL, AUDIO]
 - `-hf, --huggingface`: Load model from Hugging Face Hub
+- `-ms, --modelscope`: Load model from ModelScope Hub
 - `--host`: Host to bind the server to
 - `--port`: Port to bind the server to
 - `--reload`: Enable automatic reloading on code changes
diff --git a/nexa/__init__.py b/nexa/__init__.py
index b53e36d3..c2582443 100644
--- a/nexa/__init__.py
+++ b/nexa/__init__.py
@@ -1 +1 @@
-__version__ = "0.0.9.2"
+__version__ = "0.0.9.3"
diff --git a/nexa/cli/entry.py b/nexa/cli/entry.py
index 84325c53..52611a1c 100644
--- a/nexa/cli/entry.py
+++ b/nexa/cli/entry.py
@@ -40,17 +40,18 @@ def run_ggml_inference(args):
     is_local_path = kwargs.pop("local_path", False)
     model_type = kwargs.pop("model_type", None)
     hf = kwargs.pop('huggingface', False)
+    ms = kwargs.pop('modelscope', False)
 
     run_type = None
     if model_type:
         run_type = ModelType[model_type].value
-    elif is_local_path or hf:
+    elif is_local_path or hf or ms:
         run_type = ModelType["NLP"].value
 
     local_path = None
     projector_local_path = None
 
-    if is_local_path or hf:
+    if is_local_path or hf or ms:
         if is_local_path:
             local_path = os.path.abspath(model_path)
             model_path = local_path
@@ -69,13 +70,13 @@ def run_ggml_inference(args):
             if not os.path.isdir(local_path):
                 print("Error: For Audio models with --local_path, the provided path must be a directory containing all related files.")
                 return
-        else: # hf case
+        else: # hf or ms case
             # TODO: remove this after adding support for Multimodal model in CLI
             if run_type == "Multimodal" or run_type == "Audio" or run_type == "TTS":
                 print("Running multimodal model or audio model or TTS model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
                 return
             from nexa.general import pull_model
-            local_path, _ = pull_model(model_path, hf=True, run_type=run_type)
+            local_path, _ = pull_model(model_path, hf=hf, ms=ms, run_type=run_type)
     else: # Model Hub
         from nexa.general import pull_model
         local_path, run_type = pull_model(model_path)
@@ -83,7 +84,7 @@ def run_ggml_inference(args):
     stop_words = kwargs.pop("stop_words", None)
 
     try:
-        if (is_local_path or hf) and not model_type:
+        if (is_local_path or hf or ms) and not model_type:
             print("No model type specified. 
Running with default model type: NLP") print("You can specify a different model type using the -mt flag") @@ -119,8 +120,10 @@ def run_ggml_inference(args): from nexa.gguf.nexa_inference_voice import NexaVoiceInference inference = NexaVoiceInference(model_path=model_path, local_path=local_path, **kwargs) elif run_type == "TTS": - from nexa.gguf.nexa_inference_tts import NexaTTSInference - inference = NexaTTSInference(model_path=model_path, local_path=local_path, **kwargs) + # # Temporarily disabled since version v0.0.9.3 + raise NotImplementedError("TTS model is not supported in CLI mode.") + # from nexa.gguf.nexa_inference_tts import NexaTTSInference + # inference = NexaTTSInference(model_path=model_path, local_path=local_path, **kwargs) elif run_type == "AudioLM": from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference inference = NexaAudioLMInference(model_path=model_path, local_path=local_path, **kwargs) @@ -149,11 +152,12 @@ def run_ggml_server(args): is_local_path = kwargs.pop("local_path", False) model_type = kwargs.pop("model_type", None) hf = kwargs.pop('huggingface', False) + ms = kwargs.pop('modelscope', False) run_type = None if model_type: run_type = ModelType[model_type].value - elif is_local_path or hf: + elif is_local_path or hf or ms: run_type = ModelType["NLP"].value projector_local_path = None @@ -173,7 +177,7 @@ def run_ggml_server(args): print("Error: For Audio models with --local_path, the provided path must be a directory containing all related files.") return - if (is_local_path or hf) and not model_type: + if (is_local_path or hf or ms) and not model_type: print("No model type specified. Running with default model type: NLP") print("You can specify a different model type using the -mt flag") @@ -182,6 +186,7 @@ def run_ggml_server(args): is_local_path_arg=is_local_path, model_type_arg=run_type, huggingface=hf, + modelscope=ms, projector_local_path_arg=projector_local_path, **kwargs ) @@ -269,17 +274,18 @@ def run_embedding_generation(args): prompt = kwargs.pop("prompt") is_local_path = kwargs.pop("local_path", False) hf = kwargs.pop('huggingface', False) + ms = kwargs.pop('modelscope', False) normalize = kwargs.pop('normalize', False) no_truncate = kwargs.pop('no_truncate', False) local_path = None - if is_local_path or hf: + if is_local_path or hf or ms: if is_local_path: local_path = os.path.abspath(model_path) model_path = local_path - else: # hf case + else: # hf or ms case from nexa.general import pull_model - local_path, _ = pull_model(model_path, hf=True, run_type="Text Embedding") + local_path, _ = pull_model(model_path, hf=hf, ms=ms, run_type="Text Embedding") else: # Model Hub from nexa.general import pull_model local_path, _ = pull_model(model_path) @@ -368,9 +374,13 @@ def run_convert(args): # Valid GGUF file, proceed as is pass else: - # Try downloading from HF if path isn't a valid local directory/file - from nexa.general import download_repo_from_hf - success, local_path = download_repo_from_hf(input_path) + # Try downloading from HF or MS if path isn't a valid local directory/file + if args.modelscope: + from nexa.general import download_repo_from_ms + success, local_path = download_repo_from_ms(input_path) + else: + from nexa.general import download_repo_from_hf + success, local_path = download_repo_from_hf(input_path) if success: input_path = local_path @@ -427,6 +437,7 @@ def main(): run_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path") 
run_parser.add_argument("-mt", "--model_type", type=str, choices=[e.name for e in ModelType], help="Indicate the model running type (default: NLP)") run_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub") + run_parser.add_argument("-ms", "--modelscope", action="store_true", help="Load model from ModelScope Hub") # Text generation/vlm arguments text_group = run_parser.add_argument_group('Text generation/VLM options') @@ -505,6 +516,7 @@ def main(): embed_parser.add_argument("prompt", type=str, help="The prompt to generate an embedding for") embed_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path") embed_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub") + embed_parser.add_argument("-ms", "--modelscope", action="store_true", help="Load model from ModelScope Hub") embed_parser.add_argument("-n", "--normalize", action="store_true", help="Normalize the embeddings") embed_parser.add_argument("-nt", "--no_truncate", action="store_true", help="Not truncate the embeddings") @@ -525,6 +537,7 @@ def main(): convert_hf_parser.add_argument("--no_tensor_first_split", action="store_true", help="Disable tensor-first splitting") convert_hf_parser.add_argument("--vocab_only", action="store_true", help="Only process vocabulary") convert_hf_parser.add_argument("--dry_run", action="store_true", help="Perform a dry run without actual conversion") + convert_hf_parser.add_argument("-ms", "--modelscope", action="store_true", help="Download model from ModelScope Hub") quantization_parser = convert_parser.add_argument_group('Quantization options') quantization_parser.add_argument("--nthread", type=int, default=4, help="Number of threads to use (default: 4)") @@ -540,8 +553,9 @@ def main(): server_parser = subparsers.add_parser("server", help="Run the Nexa AI Text Generation Service") server_parser.add_argument("model_path", type=str, nargs='?', help="Path or identifier for the model in Nexa Model Hub") server_parser.add_argument("-lp", "--local_path", action="store_true", help="Indicate that the model path provided is the local path") - server_parser.add_argument("-mt", "--model_type", type=str, choices=[e.name for e in ModelType], help="Indicate the model running type, must be used with -lp or -hf") + server_parser.add_argument("-mt", "--model_type", type=str, choices=[e.name for e in ModelType], help="Indicate the model running type, must be used with -lp, -hf or -ms") server_parser.add_argument("-hf", "--huggingface", action="store_true", help="Load model from Hugging Face Hub") + server_parser.add_argument("-ms", "--modelscope", action="store_true", help="Load model from ModelScope Hub") server_parser.add_argument("--host", type=str, default="localhost", help="Host to bind the server to") server_parser.add_argument("--port", type=int, default=8000, help="Port to bind the server to") server_parser.add_argument("--reload", action="store_true", help="Enable automatic reloading on code changes") @@ -551,6 +565,7 @@ def main(): pull_parser = subparsers.add_parser("pull", help="Pull a model from official or hub.") pull_parser.add_argument("model_path", type=str, help="Path or identifier for the model in Nexa Model Hub") pull_parser.add_argument("-hf", "--huggingface", action="store_true", help="Pull model from Hugging Face Hub") + pull_parser.add_argument("-ms", "--modelscope", action="store_true", help="Pull model from ModelScope Hub") 
pull_parser.add_argument("-o", "--output_path", type=str, help="Custom output path for the pulled model") remove_parser = subparsers.add_parser("remove", help="Remove a model from local machine.") @@ -584,11 +599,23 @@ def main(): if args.local_path and args.huggingface: print("Error: --local_path and --huggingface flags cannot be used together") return + if args.local_path and args.modelscope: + print("Error: --local_path and --modelscope flags cannot be used together") + return + if args.huggingface and args.modelscope: + print("Error: --huggingface and --modelscope flags cannot be used together") + return run_ggml_inference(args) elif args.command == "server": if args.local_path and args.huggingface: print("Error: --local_path and --huggingface flags cannot be used together") return + if args.local_path and args.modelscope: + print("Error: --local_path and --modelscope flags cannot be used together") + return + if args.huggingface and args.modelscope: + print("Error: --huggingface and --modelscope flags cannot be used together") + return run_ggml_server(args) elif args.command == "onnx": run_onnx_inference(args) @@ -601,6 +628,7 @@ def main(): import os hf = getattr(args, 'huggingface', False) + ms = getattr(args, 'modelscope', False) local_download_path = None if args.output_path: @@ -608,8 +636,7 @@ def main(): os.makedirs(args.output_path, exist_ok=True) print(f"Created output directory: {args.output_path}") local_download_path = os.path.abspath(args.output_path) - - pull_model(args.model_path, hf, local_download_path=local_download_path) + pull_model(args.model_path, hf, ms, local_download_path=local_download_path) elif args.command == "convert": run_convert(args) elif args.command == "remove": diff --git a/nexa/constants.py b/nexa/constants.py index b034f73d..24acd195 100644 --- a/nexa/constants.py +++ b/nexa/constants.py @@ -9,6 +9,7 @@ NEXA_MODEL_EVAL_RESULTS_PATH = NEXA_CACHE_ROOT / "eval" NEXA_MODELS_HUB_OFFICIAL_DIR = NEXA_MODELS_HUB_DIR / "official" NEXA_MODELS_HUB_HF_DIR = NEXA_MODELS_HUB_DIR / "huggingface" +NEXA_MODELS_HUB_MS_DIR = NEXA_MODELS_HUB_DIR / "modelscope" NEXA_MODEL_LIST_PATH = NEXA_MODELS_HUB_DIR / "model_list.json" # URLs and buckets diff --git a/nexa/general.py b/nexa/general.py index 5c92f52a..3190c49d 100644 --- a/nexa/general.py +++ b/nexa/general.py @@ -18,6 +18,7 @@ NEXA_MODELS_HUB_DIR, NEXA_MODELS_HUB_OFFICIAL_DIR, NEXA_MODELS_HUB_HF_DIR, + NEXA_MODELS_HUB_MS_DIR, NEXA_OFFICIAL_BUCKET, NEXA_RUN_MODEL_MAP, NEXA_TOKEN_PATH, @@ -107,12 +108,14 @@ def get_user_info(token): return None -def pull_model(model_path, hf = False, **kwargs): +def pull_model(model_path, hf = False, ms = False, **kwargs): model_path = NEXA_RUN_MODEL_MAP.get(model_path, model_path) try: if hf == True: result = pull_model_from_hf(model_path, **kwargs) + elif ms == True: + result = pull_model_from_ms(model_path, **kwargs) else: if is_model_exists(model_path): location, run_type = get_model_info(model_path) @@ -126,11 +129,11 @@ def pull_model(model_path, hf = False, **kwargs): if result["success"]: # Only add to model list if not using custom download path - model_path = model_path if not hf else f"{model_path}:{result['local_path'].split('/')[-1]}" + model_path = model_path if not (hf or ms) else f"{model_path}:{result['local_path'].split('/')[-1]}" if not kwargs.get('local_download_path'): add_model_to_list(model_path, result["local_path"], result["model_type"], result["run_type"]) - if hf: + if hf or ms: print(f"Successfully pulled model {model_path} to {result['local_path']}") 
else: print(f"Successfully pulled model {model_path} to {result['local_path']}, run_type: {result['run_type']}") @@ -232,7 +235,7 @@ def pull_model_from_official(model_path, **kwargs): } def pull_model_from_hf(repo_id, run_type = "NLP", **kwargs): - repo_id, filename = select_gguf_in_hf_repo(repo_id) + repo_id, filename = select_gguf_from_repo(repo_id, 'huggingface') success, model_path = download_gguf_from_hf(repo_id, filename, **kwargs) # For beta version, we only support NLP gguf models @@ -244,6 +247,19 @@ def pull_model_from_hf(repo_id, run_type = "NLP", **kwargs): } +def pull_model_from_ms(repo_id, run_type = "NLP", **kwargs): + repo_id, filename = select_gguf_from_repo(repo_id, 'modelscope') + success, model_path = download_gguf_from_ms(repo_id, filename, **kwargs) + + # For beta version, we only support NLP gguf models + return { + "success": success, + "local_path": model_path, + "model_type": "gguf", + "run_type": run_type + } + + def get_run_type_from_model_path(model_path): model_name, _ = model_path.split(":") return NEXA_OFFICIAL_MODELS_TYPE.get(model_name, ModelType.NLP).value @@ -469,6 +485,32 @@ def download_repo_from_hf(repo_id): print(f"Failed to download the repository: {e}") return False, None +def download_repo_from_ms(repo_id): + try: + from modelscope import snapshot_download + from pathlib import Path + except ImportError: + print("The modelscope package is required. Please install it with `pip install modelscope`.") + return False, None + + # Define the local directory to save the model + local_dir = NEXA_MODELS_HUB_MS_DIR / Path(repo_id) + local_dir.mkdir(parents=True, exist_ok=True) + + try: + # Download the entire repository + repo_path = snapshot_download( + model_id=repo_id, + local_dir=local_dir, + revision="master" + ) + + print(f"Successfully downloaded repository '{repo_id}' to {repo_path}") + return True, repo_path + except Exception as e: + print(f"Failed to download the repository: {e}") + return False, None + def download_gguf_from_hf(repo_id, filename, **kwargs): try: from huggingface_hub import hf_hub_download @@ -508,6 +550,44 @@ def download_gguf_from_hf(repo_id, filename, **kwargs): print(f"Failed to download the model: {e}") return False, None +def download_gguf_from_ms(repo_id, filename, **kwargs): + from pathlib import Path + import shutil + try: + from modelscope.hub.file_download import model_file_download + except ImportError: + print("The modelscope package is required. 
Please install it with `pip install modelscope`.")
+        return False, None
+
+    # Get custom download path from kwargs if present
+    local_download_path = kwargs.get('local_download_path')
+    base_download_dir = Path(local_download_path) if local_download_path else NEXA_MODELS_HUB_MS_DIR
+    local_dir = base_download_dir / Path(repo_id)
+    local_dir.mkdir(parents=True, exist_ok=True)
+
+    # Download the model
+    try:
+        model_path = model_file_download(
+            model_id=repo_id,
+            file_path=filename,
+            local_dir=local_dir,
+            local_files_only=False,
+        )
+        # If using custom download path, move the file and cleanup
+        if local_download_path:
+            model_file = Path(model_path)
+            target_path = base_download_dir / filename
+            shutil.move(str(model_file), str(target_path))
+            # Get the organization directory (first part of repo_id)
+            org_dir = base_download_dir / repo_id.split('/')[0]
+            shutil.rmtree(org_dir)
+            return True, str(target_path)
+
+        return True, model_path
+    except Exception as e:
+        print(f"Failed to download the model: {e}")
+        return False, None
+
 def is_model_exists(model_name):
     if not NEXA_MODEL_LIST_PATH.exists():
         return False
@@ -660,42 +740,60 @@ def clean():
     except Exception as e:
         print(f"An error occurred while cleaning the directory: {e}")
 
-def select_gguf_in_hf_repo(repo_id: str) -> Tuple[str, str]:
+def select_gguf_from_repo(repo_id: str, model_hub: str) -> Tuple[str, str]:
     """
-    Lists all files ending with .gguf in the given Hugging Face repository,
+    Lists all files ending with .gguf in the given (HuggingFace or ModelScope) repository,
     prompts the user to select one, and returns the repo_id and the selected filename.
 
     Args:
-        repo_id (str): The Hugging Face repository ID.
+        repo_id (str): The repository ID.
+        model_hub (str): Either 'huggingface' or 'modelscope'.
 
     Returns:
         Tuple[str, str]: A tuple containing the repo_id and the selected filename.
     """
-    try:
-        from huggingface_hub import HfFileSystem
-        from huggingface_hub.utils import validate_repo_id
-        from pathlib import Path
-    except ImportError:
-        print("The huggingface-hub package is required. Please install it with `pip install huggingface-hub`.")
-        exit(1)
+    if model_hub == 'huggingface':
+        try:
+            from huggingface_hub import HfFileSystem
+            from huggingface_hub.utils import validate_repo_id
+            from pathlib import Path
+        except ImportError:
+            print("The huggingface-hub package is required. Please install it with `pip install huggingface-hub`.")
+            exit(1)
 
-    validate_repo_id(repo_id)
-    hffs = HfFileSystem()
+        validate_repo_id(repo_id)
+        hffs = HfFileSystem()
 
-    try:
-        files = [
-            file["name"] if isinstance(file, dict) else file
-            for file in hffs.ls(repo_id, recursive=True)
-        ]
-    except Exception as e:
-        print(f"Error accessing repository '{repo_id}'. Please make sure you have access to the Hugging Face repository first.")
-        exit(1)
+        try:
+            files = [
+                file["name"] if isinstance(file, dict) else file
+                for file in hffs.ls(repo_id, recursive=True)
+            ]
+        except Exception as e:
+            print(f"Error accessing repository '{repo_id}'. Please make sure you have access to the Hugging Face repository first.")
+            exit(1)
+
+        # Remove the repo prefix from files
+        file_list = []
+        for file in files:
+            rel_path = Path(file).relative_to(repo_id)
+            file_list.append(str(rel_path))
+    elif model_hub == 'modelscope':
+        try:
+            from modelscope.hub.api import HubApi
+        except ImportError:
+            print("The modelscope package is required. Please install it with `pip install modelscope`.")
+            exit(1)
 
-    # Remove the repo prefix from files
-    file_list = []
-    for file in files:
-        rel_path = Path(file).relative_to(repo_id)
-        file_list.append(str(rel_path))
+        try:
+            ms_api = HubApi()
+            infos = ms_api.get_model_files(repo_id, recursive=True)
+            file_list = [info['Path'] for info in infos]
+        except Exception as e:
+            print(f"Error accessing repository '{repo_id}'. Please make sure you have access to the ModelScope repository first.")
+            exit(1)
+    else:
+        raise ValueError("Invalid model hub specified. Supported model hubs are 'huggingface' and 'modelscope'.")
 
     # Filter for files ending with .gguf
     gguf_files = [file for file in file_list if file.endswith('.gguf')]
diff --git a/nexa/gguf/__init__.py b/nexa/gguf/__init__.py
index 9ba70de9..d2afff06 100644
--- a/nexa/gguf/__init__.py
+++ b/nexa/gguf/__init__.py
@@ -2,13 +2,15 @@
 from .nexa_inference_text import NexaTextInference
 from .nexa_inference_vlm import NexaVLMInference
 from .nexa_inference_voice import NexaVoiceInference
-from .nexa_inference_tts import NexaTTSInference
+
+# Temporarily disabled since version v0.0.9.3
+# from .nexa_inference_tts import NexaTTSInference
 
 __all__ = [
     "NexaImageInference",
     "NexaTextInference",
     "NexaVLMInference",
     "NexaVoiceInference",
-    "NexaTTSInference",
+    #"NexaTTSInference",
     "NexaAudioLMInference"
 ]
\ No newline at end of file
diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py
new file mode 100644
index 00000000..aa01630d
--- /dev/null
+++ b/nexa/gguf/llama/kv_cache.py
@@ -0,0 +1,86 @@
+from nexa.gguf.llama.llama_cache import LlamaDiskCache
+from typing import Any, Dict
+
+def run_inference_with_disk_cache(
+    model: Any,
+    cache_prompt: str,
+    total_prompt: str,
+    use_cache: bool = True,
+    cache_dir: str = "llama.cache",
+    **kwargs: Dict[str, Any]
+) -> Any:
+    """
+    Runs inference using a disk cache to store and retrieve model states.
+
+    Parameters:
+    - model: The model object that supports caching and inference.
+    - cache_prompt: The prompt used to generate a cache key.
+    - total_prompt: The full prompt for generating output.
+    - use_cache: Flag to determine if caching should be used.
+    - cache_dir: Directory where cache files are stored.
+    - kwargs: Additional parameters for model inference.
+
+    Returns:
+    - The output generated by the model. 
+ """ + temperature = kwargs.get('temperature', 0.7) + max_tokens = kwargs.get('max_tokens', 2048) + top_p = kwargs.get('top_p', 0.8) + top_k = kwargs.get('top_k', 50) + repeat_penalty = kwargs.get('repeat_penalty', 1.0) + + if use_cache: + # Initialize disk cache with specified directory + cache_context = LlamaDiskCache(cache_dir=cache_dir) + model.set_cache(cache_context) + # Convert prompt to tokens for cache key + prompt_tokens = model.tokenize(cache_prompt.encode("utf-8")) + + try: + # Try to load existing cache + cached_state = cache_context[prompt_tokens] + model.load_state(cached_state) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + stream=True, + ) + except KeyError: + # If cache doesn't exist, create it + model.reset() + # Run initial inference to populate cache + _ = model( + cache_prompt, + max_tokens=1, # Minimal tokens for cache creation + temperature=temperature, + echo=False, + ) + # Save the state to cache + cache_context[prompt_tokens] = model.save_state() + + # Generate output after creating cache + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + else: + model.reset() + model.set_cache(None) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + return output \ No newline at end of file diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index d7b241e7..0007b515 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -31,6 +31,7 @@ from nexa.gguf.llama.llama_types import * from nexa.gguf.llama.llama_grammar import LlamaGrammar +from nexa.gguf.llama.llama_cache import BaseLlamaCache from nexa.gguf.llama.llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import nexa.gguf.llama.llama_cpp as llama_cpp import nexa.gguf.llama.llama_chat_format as llama_chat_format @@ -350,6 +351,8 @@ def __init__( # Sampling Params self.last_n_tokens_size = last_n_tokens_size + self.cache: Optional[BaseLlamaCache] = None + self.lora_base = lora_base self.lora_scale = lora_scale self.lora_path = lora_path @@ -596,6 +599,14 @@ def detokenize( The detokenized string. """ return self.tokenizer_.detokenize(tokens, prev_tokens=prev_tokens, special=special) + + def set_cache(self, cache: Optional[BaseLlamaCache]): + """Set the cache. + + Args: + cache: The cache to set. + """ + self.cache = cache def set_seed(self, seed: int): """Set the random seed. 
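The disk-cache helper added above in `nexa/gguf/llama/kv_cache.py` works together with the new `Llama.set_cache` hook. A minimal usage sketch, assuming a hypothetical local GGUF path and that the vendored `Llama` class keeps the llama-cpp-python constructor and streaming chunk layout:

```python
from nexa.gguf.llama.llama import Llama
from nexa.gguf.llama.kv_cache import run_inference_with_disk_cache

# Hypothetical local model path; any llama.cpp-compatible GGUF file works here.
model = Llama(model_path="./models/llama3.2-1b-instruct-q4_0.gguf", verbose=False)

system_prompt = "You are a concise coding assistant."
full_prompt = system_prompt + "\nUser: Reverse a string in Python.\nAssistant:"

# The first call evaluates system_prompt once and saves the KV state under ./llama.cache;
# later calls with the same cache_prompt reload that state instead of re-evaluating it.
stream = run_inference_with_disk_cache(
    model=model,
    cache_prompt=system_prompt,
    total_prompt=full_prompt,
    use_cache=True,
    cache_dir="llama.cache",
    temperature=0.7,
    max_tokens=128,
)
for chunk in stream:
    # Streaming completion chunks, llama-cpp-python style (assumed).
    print(chunk["choices"][0]["text"], end="", flush=True)
```

Reusing the same `cache_prompt` across calls is what makes the shared prefix cheap: the saved state is loaded from disk rather than recomputed.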
@@ -1211,6 +1222,23 @@ def logit_bias_processor( raise ValueError( "logprobs is not supported for models created with logits_all=False" ) + + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.input_ids.tolist(), prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self._input_ids.tolist(), prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) if seed is not None: self._ctx.set_rng_seed(seed) @@ -1552,8 +1580,19 @@ def logit_bias_processor( } ], } + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + print("Llama._create_completion: cache saved", file=sys.stderr) return + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + text_str = text.decode("utf-8", errors="ignore") if echo: diff --git a/nexa/gguf/server/nexa_service.py b/nexa/gguf/server/nexa_service.py index d0267098..73772e31 100644 --- a/nexa/gguf/server/nexa_service.py +++ b/nexa/gguf/server/nexa_service.py @@ -82,6 +82,7 @@ is_local_path = False model_type = None is_huggingface = False +is_modelscope = False projector_path = None # Request Classes class GenerationRequest(BaseModel): @@ -176,7 +177,7 @@ class EmbeddingRequest(BaseModel): # helper functions async def load_model(): - global model, chat_format, completion_template, model_path, n_ctx, is_local_path, model_type, is_huggingface, projector_path + global model, chat_format, completion_template, model_path, n_ctx, is_local_path, model_type, is_huggingface, is_modelscope, projector_path if is_local_path: if model_type == "Multimodal": if not projector_path: @@ -185,11 +186,11 @@ async def load_model(): projector_downloaded_path = projector_path else: downloaded_path = model_path - elif is_huggingface: + elif is_huggingface or is_modelscope: # TODO: currently Multimodal models and Audio models are not supported for Hugging Face if model_type == "Multimodal" or model_type == "Audio": raise ValueError("Multimodal and Audio models are not supported for Hugging Face") - downloaded_path, _ = pull_model(model_path, hf=True) + downloaded_path, _ = pull_model(model_path, hf=is_huggingface, ms=is_modelscope) else: if model_path in NEXA_RUN_MODEL_MAP_VLM: # for Multimodal models downloaded_path, _ = pull_model(NEXA_RUN_MODEL_MAP_VLM[model_path]) @@ -333,7 +334,7 @@ def nexa_run_text_generation( logprobs_or_none = None if is_chat_completion: - if is_local_path or is_huggingface: # do not add system prompt if local path or huggingface + if is_local_path or is_huggingface or is_modelscope: # do not add system prompt if local path or huggingface or modelscope messages = [{"role": "user", "content": prompt}] else: messages = chat_completion_system_prompt + [{"role": "user", "content": prompt}] @@ -496,14 +497,15 @@ def image_url_to_base64(image_url: str) -> str: return f"data:image/png;base64,{base64.b64encode(buffered.getvalue()).decode()}" -def run_nexa_ai_service(model_path_arg=None, is_local_path_arg=False, model_type_arg=None, huggingface=False, projector_local_path_arg=None, **kwargs): - global model_path, n_ctx, is_local_path, 
model_type, is_huggingface, projector_path +def run_nexa_ai_service(model_path_arg=None, is_local_path_arg=False, model_type_arg=None, huggingface=False, modelscope=False, projector_local_path_arg=None, **kwargs): + global model_path, n_ctx, is_local_path, model_type, is_huggingface, is_modelscope, projector_path is_local_path = is_local_path_arg is_huggingface = huggingface + is_modelscope = modelscope projector_path = projector_local_path_arg - if is_local_path_arg or huggingface: + if is_local_path_arg or huggingface or modelscope: if not model_path_arg: - raise ValueError("model_path must be provided when using --local_path or --huggingface") + raise ValueError("model_path must be provided when using --local_path or --huggingface or --modelscope") if is_local_path_arg and not model_type_arg: raise ValueError("--model_type must be provided when using --local_path") model_path = os.path.abspath(model_path_arg) if is_local_path_arg else model_path_arg @@ -515,6 +517,7 @@ def run_nexa_ai_service(model_path_arg=None, is_local_path_arg=False, model_type os.environ["IS_LOCAL_PATH"] = str(is_local_path_arg) os.environ["MODEL_TYPE"] = model_type if model_type else "" os.environ["HUGGINGFACE"] = str(huggingface) + os.environ["MODELSCOPE"] = str(modelscope) os.environ["PROJECTOR_PATH"] = projector_path if projector_path else "" n_ctx = kwargs.get("nctx", 2048) host = kwargs.get("host", "localhost") @@ -525,11 +528,12 @@ def run_nexa_ai_service(model_path_arg=None, is_local_path_arg=False, model_type # Endpoints @app.on_event("startup") async def startup_event(): - global model_path, is_local_path, model_type, is_huggingface, projector_path + global model_path, is_local_path, model_type, is_huggingface, is_modelscope, projector_path model_path = os.getenv("MODEL_PATH", "gemma") is_local_path = os.getenv("IS_LOCAL_PATH", "False").lower() == "true" model_type = os.getenv("MODEL_TYPE", None) is_huggingface = os.getenv("HUGGINGFACE", "False").lower() == "true" + is_modelscope = os.getenv("MODELSCOPE", "False").lower() == "true" projector_path = os.getenv("PROJECTOR_PATH", None) await load_model() @@ -859,12 +863,18 @@ async def create_embedding(request: EmbeddingRequest): action="store_true", help="Use a Hugging Face model", ) + parser.add_argument( + "--modelscope", + action="store_true", + help="Use a ModelScope model", + ) args = parser.parse_args() run_nexa_ai_service( args.model_path, is_local_path_arg=args.local_path, model_type_arg=args.model_type, huggingface=args.huggingface, + modelscope=args.modelscope, nctx=args.nctx, host=args.host, port=args.port, diff --git a/nexa/transformers/README.md b/nexa/transformers/README.md new file mode 100644 index 00000000..c539b454 --- /dev/null +++ b/nexa/transformers/README.md @@ -0,0 +1,8 @@ +# transformers support for Nexa AI models + +``` +python run_omnivision.py +``` + +## Acknowledgements +We thank the [Hugging Face Transformers](https://github.com/huggingface/transformers) for their amazing work on the Transformers library. 
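The `modelscope` switch threaded through `run_nexa_ai_service` in `nexa_service.py` above can also be driven straight from Python rather than via the CLI. A minimal sketch, assuming the ModelScope repo ID used in the README examples and that the string "NLP" matches `ModelType.NLP.value`:

```python
from nexa.gguf.server.nexa_service import run_nexa_ai_service

# Roughly equivalent to: nexa server Qwen/Qwen2.5-Coder-7B-Instruct-GGUF -ms -mt NLP
run_nexa_ai_service(
    "Qwen/Qwen2.5-Coder-7B-Instruct-GGUF",
    is_local_path_arg=False,
    model_type_arg="NLP",
    huggingface=False,
    modelscope=True,
    nctx=2048,
    host="localhost",
    port=8000,
)
```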
diff --git a/nexa/transformers/__init__.py b/nexa/transformers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nexa/transformers/omnivision/__init__.py b/nexa/transformers/omnivision/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/nexa/transformers/omnivision/configuration.py b/nexa/transformers/omnivision/configuration.py new file mode 100644 index 00000000..d356a315 --- /dev/null +++ b/nexa/transformers/omnivision/configuration.py @@ -0,0 +1,130 @@ +# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +""" Qwen2 model configuration""" + +from transformers.configuration_utils import PretrainedConfig +from transformers.utils import logging +from typing import Union +from transformers import PretrainedConfig +import os +from transformers.models.auto import CONFIG_MAPPING + +logger = logging.get_logger(__name__) + + +class SigLipVisionConfig(PretrainedConfig): + model_type = "siglip_vision_model" + def __init__( + self, + hidden_size=1152, + image_mean=(0.5, 0.5, 0.5), + intermediate_size=4304, + num_hidden_layers=27, + num_attention_heads=16, + num_channels=3, + image_size=384, + patch_size=14, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_size = hidden_size + self.intermediate_size = intermediate_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.num_channels = num_channels + self.patch_size = patch_size + self.image_size = image_size + self.attention_dropout = attention_dropout + self.layer_norm_eps = layer_norm_eps + self.hidden_act = hidden_act + self.image_mean = image_mean + + @classmethod + def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": + cls._set_token_in_kwargs(kwargs) + + config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) + + # get the vision config dict if we are loading from SigLipConfig + if config_dict.get("model_type") == "siglip": + config_dict = config_dict["vision_config"] + + if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: + logger.warning( + f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " + f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
+ ) + return cls.from_dict(config_dict, **kwargs) + + +""" Nexa AI model configuration""" +class OminiVLMConfig(PretrainedConfig): + model_type = "nano-omini-vlm" + + model_type = "omini_vlm" + keys_to_ignore_at_inference = ["past_key_values"] + + def __init__( + self, + vision_config=None, + text_config=None, + hidden_size=4096, + mm_hidden_size=1152, + mm_projector_lr=None, + mm_projector_type="mlp2x_gelu", + image_token_index=151655, + initializer_range=0.02, + **kwargs, + ): + self.hidden_size = hidden_size + self.mm_hidden_size = mm_hidden_size + self.mm_projector_lr = mm_projector_lr + self.mm_projector_type = mm_projector_type + self.image_token_index = image_token_index + self.initializer_range = initializer_range + if isinstance(vision_config, dict): + vision_config = SigLipVisionConfig(**vision_config) + elif vision_config is None: + vision_config = SigLipVisionConfig( + hidden_size=1152, + image_mean=(0.5, 0.5, 0.5), + intermediate_size=4304, + num_hidden_layers=27, + num_attention_heads=16, + num_channels=3, + image_size=384, + patch_size=14, + hidden_act="gelu_pytorch_tanh", + layer_norm_eps=1e-6, + attention_dropout=0.0, + ) + self.vision_config = vision_config + + if isinstance(text_config, dict): + text_config["model_type"] = ( + text_config["model_type"] if "model_type" in text_config else "qwen2" + ) + text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) + elif text_config is None: + text_config = CONFIG_MAPPING["qwen2"]() + + self.text_config = text_config + + super().__init__(**kwargs) + \ No newline at end of file diff --git a/nexa/transformers/omnivision/modeling.py b/nexa/transformers/omnivision/modeling.py new file mode 100644 index 00000000..94bd67b1 --- /dev/null +++ b/nexa/transformers/omnivision/modeling.py @@ -0,0 +1,709 @@ +# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import math +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union +from dataclasses import dataclass +from transformers.activations import ACT2FN +import torch.utils.checkpoint +from torch import nn +from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling +from transformers.modeling_utils import PreTrainedModel +from transformers.utils import ModelOutput +from transformers import Qwen2ForCausalLM +from .configuration import SigLipVisionConfig, OminiVLMConfig + +# ======================================================================================== # +# vision tower # +# ======================================================================================== # +@dataclass +class SigLipVisionModelOutput(ModelOutput): + """ + Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. 
+ + Args: + image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): + The image embeddings obtained by applying the projection layer to the pooler_output. + last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Sequence of hidden-states at the output of the last layer of the model. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + """ + image_embeds: Optional[torch.FloatTensor] = None + last_hidden_state: torch.FloatTensor = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + + +@dataclass +class OminiVLMOutputWithPast(ModelOutput): + """ + Base class for Gemma2Audio causal language model (or autoregressive) outputs. + + Args: + loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): + Language modeling loss (for next-token prediction). + logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): + Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). + past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): + Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape + `(batch_size, num_heads, sequence_length, embed_size_per_head)`) + + Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see + `past_key_values` input) to speed up sequential decoding. + hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): + Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + + one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. + + Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. + attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): + Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, + sequence_length)`. + + Attentions weights after the attention softmax, used to compute the weighted average in the self-attention + heads. + attention_mask (`torch.FloatTensor`, *optional*): + Attentions mask, used to update attention mask and position_ids. 
+ """ + loss: Optional[torch.FloatTensor] = None + logits: torch.FloatTensor = None + past_key_values: Optional[List[torch.FloatTensor]] = None + hidden_states: Optional[Tuple[torch.FloatTensor]] = None + attentions: Optional[Tuple[torch.FloatTensor]] = None + attention_mask: Optional[torch.FloatTensor] = None + + +class SigLipVisionEmbeddings(nn.Module): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.image_size = config.image_size + self.patch_size = config.patch_size + + self.patch_embedding = nn.Conv2d( + in_channels=config.num_channels, + out_channels=self.embed_dim, + kernel_size=self.patch_size, + stride=self.patch_size, + padding="valid", + ) + + self.num_patches = (self.image_size // self.patch_size) ** 2 + self.num_positions = self.num_patches + self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) + self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) + + def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: + patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] + embeddings = patch_embeds.flatten(2).transpose(1, 2) + + embeddings = embeddings + self.position_embedding(self.position_ids) + return embeddings + + +class SigLipAttention(nn.Module): + """Multi-headed attention from 'Attention Is All You Need' paper""" + + def __init__(self, config): + super().__init__() + self.config = config + self.embed_dim = config.hidden_size + self.num_heads = config.num_attention_heads + self.head_dim = self.embed_dim // self.num_heads + if self.head_dim * self.num_heads != self.embed_dim: + raise ValueError( + f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" + f" {self.num_heads})." 
+ ) + self.scale = self.head_dim ** -0.5 + self.dropout = config.attention_dropout + + self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) + self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: + """Input shape: Batch x Time x Channel""" + + batch_size, q_len, _ = hidden_states.size() + + query_states = self.q_proj(hidden_states) + key_states = self.k_proj(hidden_states) + value_states = self.v_proj(hidden_states) + + query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) + + k_v_seq_len = key_states.shape[-2] + attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale + + if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): + raise ValueError( + f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" + f" {attn_weights.size()}" + ) + + if attention_mask is not None: + if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): + raise ValueError( + f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" + ) + attn_weights = attn_weights + attention_mask + + # upcast attention to fp32 + attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) + attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) + attn_output = torch.matmul(attn_weights, value_states) + + if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): + raise ValueError( + f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" + f" {attn_output.size()}" + ) + + attn_output = attn_output.transpose(1, 2).contiguous() + attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) + + attn_output = self.out_proj(attn_output) + + return attn_output, attn_weights + + +class SigLipMLP(nn.Module): + def __init__(self, config): + super().__init__() + self.config = config + self.activation_fn = ACT2FN[config.hidden_act] + self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) + self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) + + def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: + hidden_states = self.fc1(hidden_states) + hidden_states = self.activation_fn(hidden_states) + hidden_states = self.fc2(hidden_states) + return hidden_states + + +class SigLipEncoderLayer(nn.Module): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.embed_dim = config.hidden_size + self.self_attn = SigLipAttention(config) + self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + self.mlp = SigLipMLP(config) + self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) + + def forward( + self, + hidden_states: torch.Tensor, + attention_mask: torch.Tensor, + output_attentions: Optional[bool] = False, + ) -> Tuple[torch.FloatTensor]: + """ + Args: + hidden_states 
(`torch.FloatTensor`): + Input to the layer of shape `(batch, seq_len, embed_dim)`. + attention_mask (`torch.FloatTensor`): + Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. + output_attentions (`bool`, *optional*, defaults to `False`): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + """ + residual = hidden_states + + hidden_states = self.layer_norm1(hidden_states) + hidden_states, attn_weights = self.self_attn( + hidden_states=hidden_states, + attention_mask=attention_mask, + output_attentions=output_attentions, + ) + hidden_states = residual + hidden_states + + residual = hidden_states + hidden_states = self.layer_norm2(hidden_states) + hidden_states = self.mlp(hidden_states) + hidden_states = residual + hidden_states + + outputs = (hidden_states,) + + if output_attentions: + outputs += (attn_weights,) + + return outputs + + +class SigLipPreTrainedModel(PreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = SigLipVisionConfig + base_model_prefix = "siglip" + supports_gradient_checkpointing = True + + def _init_weights(self, module): + """Initialize the weights""" + pass + + +class SigLipEncoder(nn.Module): + """ + Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a + [`SigLipEncoderLayer`]. + + Args: + config: SigLipVisionConfig + """ + + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + self.layers = nn.ModuleList([SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) + self.gradient_checkpointing = False + + def forward( + self, + inputs_embeds, + attention_mask: Optional[torch.Tensor] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutput]: + r""" + Args: + inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): + Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert `input_ids` indices into associated vectors + than the model's internal embedding lookup matrix. + attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): + Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + [What are attention masks?](../glossary#attention-mask) + output_attentions (`bool`, *optional*): + Whether or not to return the attentions tensors of all attention layers. See `attentions` under + returned tensors for more detail. + output_hidden_states (`bool`, *optional*): + Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors + for more detail. + return_dict (`bool`, *optional*): + Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
+ """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + + hidden_states = inputs_embeds + for encoder_layer in self.layers: + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + if self.gradient_checkpointing and self.training: + layer_outputs = self._gradient_checkpointing_func( + encoder_layer.__call__, + hidden_states, + attention_mask, + output_attentions, + ) + else: + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + output_attentions=output_attentions, + ) + + hidden_states = layer_outputs[0] + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + + if not return_dict: + return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) + return BaseModelOutput( + last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions + ) + + +class SigLipMultiheadAttentionPoolingHead(nn.Module): + """Multihead Attention Pooling.""" + + def __init__(self, config: SigLipVisionConfig): + super().__init__() + + self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) + self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) + self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) + self.mlp = SigLipMLP(config) + + def forward(self, hidden_state): + batch_size = hidden_state.shape[0] + probe = self.probe.repeat(batch_size, 1, 1) + + hidden_state = self.attention(probe, hidden_state, hidden_state)[0] + + residual = hidden_state + hidden_state = self.layernorm(hidden_state) + hidden_state = residual + self.mlp(hidden_state) + + return hidden_state[:, 0] + + +class SigLipVisionTransformer(nn.Module): + def __init__(self, config: SigLipVisionConfig): + super().__init__() + self.config = config + embed_dim = config.hidden_size + + self.embeddings = SigLipVisionEmbeddings(config) + self.encoder = SigLipEncoder(config) + self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) + self.head = SigLipMultiheadAttentionPoolingHead(config) + + def get_dtype(self) -> torch.dtype: + return self.encoder.layers[0].mlp.fc2.weight.dtype + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + r""" + Returns: + + """ + output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions + output_hidden_states = ( + output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states + ) + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + hidden_states = self.embeddings(pixel_values) + + encoder_outputs = self.encoder( + inputs_embeds=hidden_states, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + last_hidden_state = encoder_outputs[0] + last_hidden_state = self.post_layernorm(last_hidden_state) + + 
pooled_output = self.head(last_hidden_state) + + if not return_dict: + return (last_hidden_state, pooled_output) + encoder_outputs[1:] + + return BaseModelOutputWithPooling( + last_hidden_state=last_hidden_state, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class SigLipVisionModel(SigLipPreTrainedModel): + config_class = SigLipVisionConfig + main_input_name = "pixel_values" + _no_split_modules = ["SigLipEncoderLayer"] + + def __init__(self, config: SigLipVisionConfig): + super().__init__(config) + self.vision_model = SigLipVisionTransformer(config) + + # Initialize weights and apply final processing + self.post_init() + + def get_input_embeddings(self) -> nn.Module: + return self.vision_model.embeddings.patch_embedding + + def forward( + self, + pixel_values, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + ) -> Union[Tuple, BaseModelOutputWithPooling]: + return_dict = return_dict if return_dict is not None else self.config.use_return_dict + + return self.vision_model( + pixel_values=pixel_values, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + +# ======================================================================================== # +# Projector # +# ======================================================================================== # + +import re +def build_vision_projector(config, delay_load=False, **kwargs): + projector_type = getattr(config, 'mm_projector_type', 'mlp2x_gelu') + mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type) + if mlp_gelu_match: + mlp_depth = int(mlp_gelu_match.group(1)) + modules = [nn.Linear(config.mm_hidden_size*9, config.hidden_size)] + for _ in range(1, mlp_depth): + modules.append(nn.GELU()) + modules.append(nn.Linear(config.hidden_size, config.text_config.hidden_size)) + return nn.Sequential(*modules) + + +# ======================================================================================== # +# LLM # +# ======================================================================================== # +class OminiVLMPreTrainedModel(PreTrainedModel): + config_class = OminiVLMConfig + base_model_prefix = "model" + supports_gradient_checkpointing = True + _no_split_modules = ["Qwen2DecoderLayer", "SigLipEncoderLayer"] + _skip_keys_device_placement = "past_key_values" + _supports_flash_attn_2 = True + _supports_sdpa = True + _supports_cache_class = True + _supports_static_cache = True + + def _init_weights(self, module): + std = self.config.initializer_range + if isinstance(module, (nn.Linear, nn.Conv3d)): + module.weight.data.normal_(mean=0.0, std=std) + if module.bias is not None: + module.bias.data.zero_() + elif isinstance(module, nn.Embedding): + module.weight.data.normal_(mean=0.0, std=std) + if module.padding_idx is not None: + module.weight.data[module.padding_idx].zero_() + + +class OminiVLMForConditionalGeneration(OminiVLMPreTrainedModel): + def __init__(self, config: OminiVLMConfig): + super().__init__(config) + if isinstance(config.vision_config, dict): + vision_config = SigLipVisionConfig(**config.vision_config) + else: + vision_config = config.vision_config + self.vision_tower = SigLipVisionModel(vision_config) + self.multi_modal_projector = build_vision_projector(config) + self.vocab_size = config.text_config.vocab_size + self.language_model = Qwen2ForCausalLM( + config.text_config, + ) + 
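+ # Composition: SigLip vision tower -> multi_modal_projector -> Qwen2 causal LM; the projector's first Linear takes mm_hidden_size*9 features, which appears to be sized for the 81-token reshape of the vision output in forward() below.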
self.pad_token_id = ( + self.config.pad_token_id if self.config.pad_token_id is not None else -1 + ) + self._padding_side = "right" # set it to left by default, user can use setter to change padding_sides + self.post_init() + + @property + def padding_side(self): + return self._padding_side + + @padding_side.setter + def padding_side(self, padding_side: str): + if padding_side not in ["left", "right"]: + raise ValueError(f"{padding_side} is not `left` or `right`.") + self._padding_side = padding_side + + def get_input_embeddings(self): + return self.language_model.get_input_embeddings() + + def set_input_embeddings(self, value): + self.language_model.set_input_embeddings(value) + + def get_output_embeddings(self): + return self.language_model.get_output_embeddings() + + def set_output_embeddings(self, new_embeddings): + self.language_model.set_output_embeddings(new_embeddings) + + def set_decoder(self, decoder): + self.language_model.set_decoder(decoder) + + def get_decoder(self): + return self.language_model.get_decoder() + + def tie_weights(self): + return self.language_model.tie_weights() + + def resize_token_embeddings( + self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None + ) -> nn.Embedding: + model_embeds = self.language_model.resize_token_embeddings( + new_num_tokens, pad_to_multiple_of + ) + # update vocab size + self.config.text_config.vocab_size = model_embeds.num_embeddings + self.vocab_size = model_embeds.num_embeddings + return model_embeds + + def _update_model_kwargs_for_generation( + self, + outputs: ModelOutput, + model_kwargs: Dict[str, Any], + is_encoder_decoder: bool = False, + num_new_tokens: int = 1, + ) -> Dict[str, Any]: + model_kwargs = super()._update_model_kwargs_for_generation( + outputs=outputs, + model_kwargs=model_kwargs, + is_encoder_decoder=is_encoder_decoder, + num_new_tokens=num_new_tokens, + ) + return model_kwargs + + def forward( + self, + input_ids: torch.LongTensor = None, + attention_mask: Optional[torch.Tensor] = None, + position_ids: Optional[torch.LongTensor] = None, + past_key_values: Optional[List[torch.FloatTensor]] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + labels: Optional[torch.LongTensor] = None, + use_cache: Optional[bool] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + pixel_values: Optional[torch.Tensor] = None, + ) -> Union[Tuple, OminiVLMOutputWithPast]: + r""" + Args: + labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*): + Labels for computing the masked language modeling loss. Indices should either be in `[0, ..., + config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored + (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`. + + Returns: + ```""" + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + + target_device = self.vision_tower.device + + if pixel_values is not None: + pixel_values = pixel_values.to(target_device) + + if inputs_embeds is None: + # 1. Extract the input embeddings + inputs_embeds = self.get_input_embeddings()(input_ids) + + # 2. 
Merge text and vision features + if pixel_values is not None: + pixel_values = pixel_values.type(self.vision_tower.vision_model.get_dtype()) + image_embeds = self.vision_tower(pixel_values).last_hidden_state.to(pixel_values.dtype) + image_embeds = image_embeds.view(image_embeds.shape[0], 81, -1) + image_embeds = self.multi_modal_projector(image_embeds) + image_mask = ( + (input_ids == self.config.image_token_index) + .unsqueeze(-1) + .expand_as(inputs_embeds) + .to(inputs_embeds.device) + ) + image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype) + + ## This is to intelligently replace the image tokens with the image features + inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds) + + if attention_mask is not None: + attention_mask = attention_mask.to(inputs_embeds.device) + + outputs = self.language_model( + attention_mask=attention_mask, + position_ids=position_ids, + past_key_values=past_key_values, + inputs_embeds=inputs_embeds, + use_cache=use_cache, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + + logits = outputs[0] + + loss = None + if labels is not None: + # Shift so that tokens < n predict n + if attention_mask is not None: + shift_attention_mask = attention_mask[..., 1:] + shift_logits = logits[..., :-1, :][ + shift_attention_mask.to(logits.device) != 0 + ].contiguous() + shift_labels = labels[..., 1:][ + shift_attention_mask.to(labels.device) != 0 + ].contiguous() + else: + shift_logits = logits[..., :-1, :].contiguous() + shift_labels = labels[..., 1:].contiguous() + # Flatten the tokens + loss_fct = nn.CrossEntropyLoss() + loss = loss_fct( + shift_logits.view(-1, shift_logits.size(-1)), + shift_labels.view(-1).to(shift_logits.device), + ) + + if not return_dict: + output = (logits,) + outputs[1:] + return (loss,) + output if loss is not None else output + + return OminiVLMOutputWithPast( + loss=loss, + logits=logits, + past_key_values=outputs.past_key_values, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + attention_mask=attention_mask, + ) \ No newline at end of file diff --git a/nexa/transformers/omnivision/processing.py b/nexa/transformers/omnivision/processing.py new file mode 100644 index 00000000..2bc3f008 --- /dev/null +++ b/nexa/transformers/omnivision/processing.py @@ -0,0 +1,201 @@ +# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. +# All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
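+ +# NanoVLMProcessor pairs a SiglipImageProcessor with a Qwen2 tokenizer and expands each <|image_pad|> marker in the prompt into NUM_IMAGE_TOKENS (81) copies of <|image_pad|> before tokenization.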
+ +from typing import List, Union + +try: + from typing import Unpack +except ImportError: + from typing_extensions import Unpack + +from transformers.feature_extraction_utils import BatchFeature +from transformers.image_utils import ImageInput, VideoInput +from transformers.processing_utils import ( + ProcessingKwargs, + ProcessorMixin, +) +from transformers.tokenization_utils_base import PreTokenizedInput, TextInput +from transformers.utils import logging + + +logger = logging.get_logger(__name__) +NUM_IMAGE_TOKENS = 81 + +class NanoVLMProcessorKwargs(ProcessingKwargs, total=False): + _defaults = { + "text_kwargs": { + "padding": False, + }, + } + + +class NanoVLMProcessor(ProcessorMixin): + attributes = ["image_processor", "tokenizer"] + valid_kwargs = ["chat_template"] + image_processor_class = "SiglipImageProcessor" + tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast") + + def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): + if chat_template is None: + chat_template = self.default_chat_template + super().__init__(image_processor, tokenizer, chat_template=chat_template) + + def __call__( + self, + images: ImageInput = None, + text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None, + **kwargs: Unpack[NanoVLMProcessorKwargs], + ) -> BatchFeature: + """ + Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text` + and `kwargs` arguments to Qwen2TokenizerFast's [`~Qwen2TokenizerFast.__call__`] if `text` is not `None` to encode + the text. To prepare the vision inputs, this method forwards the `images` and `kwargs` arguments to + SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. + + Args: + images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`): + The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch + tensor. Both channels-first and channels-last formats are supported. + text (`str`, `List[str]`, `List[List[str]]`): + The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings + (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set + `is_split_into_words=True` (to lift the ambiguity with a batch of sequences). + return_tensors (`str` or [`~utils.TensorType`], *optional*): + If set, will return tensors of a particular framework. Acceptable values are: + - `'tf'`: Return TensorFlow `tf.constant` objects. + - `'pt'`: Return PyTorch `torch.Tensor` objects. + - `'np'`: Return NumPy `np.ndarray` objects. + - `'jax'`: Return JAX `jnp.ndarray` objects. + + Returns: + [`BatchFeature`]: A [`BatchFeature`] with the following fields: + + - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. + - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when + `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not + `None`). + - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`. 
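+
+ Example (an illustrative sketch; the processor repo ID below is the one used by `run_omnivision.py`,
+ and `cat.png` is a hypothetical local image path):
+
+ ```python
+ >>> from PIL import Image
+ >>> from nexa.transformers.omnivision.processing import NanoVLMProcessor
+ >>> processor = NanoVLMProcessor.from_pretrained("nexa-collaboration/nano-vlm-processor")
+ >>> text = "Describe this image <|vision_start|><|image_pad|><|vision_end|>"
+ >>> image = Image.open("cat.png").convert("RGB")
+ >>> inputs = processor(text=[text], images=[image], return_tensors="pt")
+ >>> # `inputs` now holds input_ids, attention_mask and pixel_values
+ ```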
+ """ + output_kwargs = self._merge_kwargs( + NanoVLMProcessorKwargs, + tokenizer_init_kwargs=self.tokenizer.init_kwargs, + **kwargs, + ) + + # check the number of images is equal to the number of all image_pad tokens + assert len(images) == sum([t.count("<|image_pad|>") for t in text]), "The number of images must be equal to the number of all image_pad tokens in the text." + + if images is not None: + image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) + else: + image_inputs = {} + + if not isinstance(text, list): + text = [text] + + if image_inputs is not None: + index = 0 + for i in range(len(text)): + while "<|image_pad|>" in text[i]: + text[i] = text[i].replace( + "<|image_pad|>", "<|placeholder|>" * NUM_IMAGE_TOKENS, 1 + ) + index += 1 + text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>") + + _ = output_kwargs["text_kwargs"].pop("padding_side", None) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + + return BatchFeature(data={**text_inputs, **image_inputs}) + + def batch_decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Gemma2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please + refer to the docstring of this method for more information. + """ + return self.tokenizer.batch_decode(*args, **kwargs) + + def decode(self, *args, **kwargs): + """ + This method forwards all its arguments to Gemma2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to + the docstring of this method for more information. + """ + return self.tokenizer.decode(*args, **kwargs) + + @property + def model_input_names(self): + tokenizer_input_names = self.tokenizer.model_input_names + image_processor_input_names = self.image_processor.model_input_names + return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) + + + @property + def default_chat_template(self): + return ( + "{%- if tools %}" + "{{- '<|im_start|>system\n' }}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- messages[0]['content'] }}" + "{%- else %}" + "{{- 'You are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.' }}" + "{%- endif %}" + "{{- \"\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\" }}" + "{%- for tool in tools %}" + "{{- \"\n\" }}" + "{{- tool | tojson }}" + "{%- endfor %}" + "{{- \"\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\\\"name\\\": , \\\"arguments\\\": }\n<|im_end|>\n\" }}" + "{%- else %}" + "{%- if messages[0]['role'] == 'system' %}" + "{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}" + "{%- else %}" + "{{- '<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- for message in messages %}" + "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" + "{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}" + "{%- elif message.role == \"assistant\" %}" + "{{- '<|im_start|>' + message.role }}" + "{%- if message.content %}" + "{{- '\n' + message.content }}" + "{%- endif %}" + "{%- for tool_call in message.tool_calls %}" + "{%- if tool_call.function is defined %}" + "{%- set tool_call = tool_call.function %}" + "{%- endif %}" + "{{- '\n\n{\"name\": \"' }}" + "{{- tool_call.name }}" + "{{- '\", \"arguments\": ' }}" + "{{- tool_call.arguments | tojson }}" + "{{- '}\n' }}" + "{%- endfor %}" + "{{- '<|im_end|>\n' }}" + "{%- elif message.role == \"tool\" %}" + "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" + "{{- '<|im_start|>user' }}" + "{%- endif %}" + "{{- '\n\n' }}" + "{{- message.content }}" + "{{- '\n' }}" + "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" + "{{- '<|im_end|>\n' }}" + "{%- endif %}" + "{%- endif %}" + "{%- endfor %}" + "{%- if add_generation_prompt %}" + "{{- '<|im_start|>assistant\n' }}" + "{%- endif %}" + ) \ No newline at end of file diff --git a/nexa/transformers/run_omnivision.py b/nexa/transformers/run_omnivision.py new file mode 100644 index 00000000..f81d1efe --- /dev/null +++ b/nexa/transformers/run_omnivision.py @@ -0,0 +1,92 @@ +from nexa.transformers.omnivision.processing import NanoVLMProcessor +from nexa.transformers.omnivision.modeling import OminiVLMForConditionalGeneration +import argparse +import torch + + +model_name = "NexaAIDev/omnivlm-dpo" +image_url = "https://public-storage.nexa4ai.com/public-images/cat.png" + + +def get_device(): + if torch.cuda.is_available(): + return "cuda" + elif torch.backends.mps.is_available(): + return "mps" + return "cpu" + + +def load_model_and_processor(model_path): + device = get_device() + proc_path = "nexa-collaboration/nano-vlm-processor" + processor = NanoVLMProcessor.from_pretrained(proc_path) + processor.tokenizer.pad_token = processor.tokenizer.eos_token + processor.tokenizer.padding_side = "right" + + model_kwargs = {} + # Adjust dtype based on device + dtype = torch.bfloat16 if device == "cuda" else torch.float32 + local_model = OminiVLMForConditionalGeneration.from_pretrained( + model_path, + torch_dtype=dtype, + **model_kwargs + ) + local_model = local_model.to(device) + return local_model, processor + + +def process_single_image(processor, image_path, input_prompt=None): + text = f"<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_prompt}\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>" + # Load the image from a URL or a local path (PIL is needed in both branches) + from PIL import Image + if image_path.startswith('http'): + import requests + from io import BytesIO + response = requests.get(image_path) + image = Image.open(BytesIO(response.content)).convert('RGB') + else: + image = Image.open(image_path).convert('RGB') + inputs = processor( + text=[text], + images=[image], + padding=True, + return_tensors="pt", + ) + return inputs.to(get_device()) + + +def generate_output(model, processor, inputs, max_tokens): + cur_ids = inputs['input_ids'] + cur_attention_mask = inputs['attention_mask'] + input_token_length = cur_ids.shape[-1] + for _ in range(max_tokens): + out = model( + cur_ids, + attention_mask=cur_attention_mask, + pixel_values=inputs['pixel_values'], + use_cache=False + ) + next_token = out.logits[:, -1].argmax() + next_word = processor.decode(next_token) + cur_ids = torch.cat([cur_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1) + cur_attention_mask = torch.cat([cur_attention_mask, torch.ones_like(next_token).unsqueeze(0).unsqueeze(0)], dim=-1) + if next_word == "<|im_end|>": + break + return processor.batch_decode(cur_ids[:, input_token_length:])[0] + +def main(args): + model, processor = load_model_and_processor(args.model_path) + inputs = process_single_image(processor, args.image_path, args.input_prompt) + output = generate_output(model, processor, inputs, args.max_tokens) + print("=== Inference Result ===\n", output) + +if __name__ == "__main__": + parser = argparse.ArgumentParser(description="Inference script for Nano-Omni-VLM") + parser.add_argument("--model_path", default=model_name, help="Path to the model checkpoint") + parser.add_argument("--image_path", default=image_url, help="Path to input image or image URL") + parser.add_argument("--input_prompt", type=str, default="Describe this image for me", help="Input prompt for instruct task") + parser.add_argument("--max_tokens", type=int, default=512, help="Maximum number of tokens to generate") + + args = parser.parse_args() + main(args) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index e80e093e..7e9c6478 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,6 +80,12 @@ convert = [ "nexa-gguf", ] +transformers = [ + "transformers", + "torch", + "pillow" +] + [project.urls] Homepage = "https://github.com/NexaAI/nexa-sdk" Issues = "https://github.com/NexaAI/nexa-sdk/issues" @@ -105,6 +111,7 @@ wheel.packages = [ "nexa.onnx.streamlit", "nexa.onnx.server", "nexa.eval", + "nexa.transformers", ] sdist.include = [ "CMakeLists.txt", diff --git a/requirements.txt b/requirements.txt index 978b8c1d..6e732a0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,6 +19,7 @@ pydantic pillow python-multipart huggingface_hub +modelscope # For onnx optimum[onnxruntime] # for CPU version diff --git a/tests/test_tts_generation.py b/tests/test_tts_generation.py index 5d55ed4d..2dc9c526 100644 --- a/tests/test_tts_generation.py +++ b/tests/test_tts_generation.py @@ -1,22 +1,24 @@ -from nexa.gguf import NexaTTSInference +# Temporarily disabled since version v0.0.9.3 -def test_tts_generation(): - tts = NexaTTSInference( - model_path="bark-small", - local_path=None, - n_threads=4, - seed=42, - sampling_rate=24000, - verbosity=2 - ) +# from nexa.gguf import NexaTTSInference + +# def test_tts_generation(): +# tts = NexaTTSInference( +# model_path="bark-small", +# 
local_path=None, +# n_threads=4, +# seed=42, +# sampling_rate=24000, +# verbosity=2 +# ) - # Generate audio from prompt - prompt = "Hello, this is a test of the Bark text to speech system." - audio_data = tts.audio_generation(prompt) +# # Generate audio from prompt +# prompt = "Hello, this is a test of the Bark text to speech system." +# audio_data = tts.audio_generation(prompt) - # Save the generated audio - tts._save_audio(audio_data, tts.sampling_rate, "tts_output") - print("TTS generation test completed successfully!") +# # Save the generated audio +# tts._save_audio(audio_data, tts.sampling_rate, "tts_output") +# print("TTS generation test completed successfully!") -if __name__ == "__main__": - test_tts_generation() \ No newline at end of file +# if __name__ == "__main__": +# test_tts_generation() \ No newline at end of file
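A minimal usage sketch for the pieces added above, reusing the helpers defined in `nexa/transformers/run_omnivision.py`; the model ID and sample image URL are the defaults hard-coded in that script, so substitute your own checkpoint or image as needed:

```python
# Minimal sketch: one image-description round trip with the new OmniVLM helpers.
# The model ID and image URL are the defaults from nexa/transformers/run_omnivision.py.
from nexa.transformers.run_omnivision import (
    load_model_and_processor,
    process_single_image,
    generate_output,
)

model, processor = load_model_and_processor("NexaAIDev/omnivlm-dpo")
inputs = process_single_image(
    processor,
    "https://public-storage.nexa4ai.com/public-images/cat.png",
    "Describe this image for me",
)
print(generate_output(model, processor, inputs, max_tokens=128))
```

Running the script directly with its argparse defaults should produce the same result, since its `main()` follows exactly this sequence.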