diff --git a/.github/workflows/build-wheels-cpu-macos.yaml b/.github/workflows/build-wheels-cpu-macos.yaml
new file mode 100644
index 00000000..a97c69f6
--- /dev/null
+++ b/.github/workflows/build-wheels-cpu-macos.yaml
@@ -0,0 +1,68 @@
+name: Build Wheels (CPU) (MacOS)
+
+on: workflow_dispatch
+
+permissions:
+  contents: write
+
+jobs:
+
+  build_wheels_macos:
+    name: Build wheels on  macos-${{ matrix.os }}
+    runs-on: macos-${{ matrix.os }}
+    strategy:
+      matrix:
+        os: [12, 13, 14]
+
+    steps:
+      - uses: actions/checkout@v4
+        with:
+          submodules: "recursive"
+
+      # Used to host cibuildwheel
+      - uses: actions/setup-python@v5
+        with:
+          python-version: "3.8"
+          cache: "pip"
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          # python -m pip install -e .
+          python -m pip install build wheel
+
+      - name: Build wheels
+        uses: pypa/cibuildwheel@v2.20.0
+        env:
+          # disable repair
+          CIBW_REPAIR_WHEEL_COMMAND: ""
+          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64 "
+          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os }}
+          CMAKE_BUILD_PARALLEL_LEVEL: 4
+        with:
+          package-dir: .
+          output-dir: wheelhouse
+
+      - uses: actions/upload-artifact@v4
+        with:
+          name: wheels-macos-${{ matrix.os }}
+          path: ./wheelhouse/*.whl
+
+  release:
+    name: Release
+    needs: [build_wheels_macos]
+    runs-on: ubuntu-latest
+
+    steps:
+      - uses: actions/download-artifact@v4
+        with:
+          merge-multiple: true
+          path: dist
+
+      - uses: softprops/action-gh-release@v2
+        with:
+          files: dist/*
+          tag_name: ${{ github.ref_name }}
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
diff --git a/.github/workflows/build-wheels-cpu.yaml b/.github/workflows/build-wheels-cpu.yaml
index 7a13cef5..e18e66b5 100644
--- a/.github/workflows/build-wheels-cpu.yaml
+++ b/.github/workflows/build-wheels-cpu.yaml
@@ -88,48 +88,6 @@ jobs:
           name: wheels-${{ matrix.os }}
           path: ./wheelhouse/*.whl
 
-  build_wheels_macos:
-    name: Build wheels on  macos-${{ matrix.os }}
-    runs-on: macos-${{ matrix.os }}
-    strategy:
-      matrix:
-        os: [12, 13, 14]
-
-    steps:
-      - uses: actions/checkout@v4
-        with:
-          submodules: "recursive"
-
-      # Used to host cibuildwheel
-      - uses: actions/setup-python@v5
-        with:
-          python-version: "3.8"
-          cache: "pip"
-
-      - name: Install dependencies
-        run: |
-          python -m pip install --upgrade pip
-          # python -m pip install -e .
-          python -m pip install build wheel
-
-      - name: Build wheels
-        uses: pypa/cibuildwheel@v2.20.0
-        env:
-          # disable repair
-          CIBW_REPAIR_WHEEL_COMMAND: ""
-          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64 "
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
-          MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os }}
-          CMAKE_BUILD_PARALLEL_LEVEL: 4
-        with:
-          package-dir: .
-          output-dir: wheelhouse
-
-      - uses: actions/upload-artifact@v4
-        with:
-          name: wheels-macos-${{ matrix.os }}
-          path: ./wheelhouse/*.whl
-
   build_wheels_arm64:
     name: Build arm64 wheels
     runs-on: ubuntu-20.04
@@ -185,7 +143,7 @@ jobs:
 
   release:
     name: Release
-    needs: [build_wheels, build_wheels_macos, build_wheels_arm64, build_sdist]
+    needs: [build_wheels_linux, build_wheels_win, build_wheels_arm64, build_sdist]
     runs-on: ubuntu-latest
 
     steps:
diff --git a/.github/workflows/build-wheels-cuda-linux.yaml b/.github/workflows/build-wheels-cuda-linux.yaml
index a4cd4b89..b2a001c5 100644
--- a/.github/workflows/build-wheels-cuda-linux.yaml
+++ b/.github/workflows/build-wheels-cuda-linux.yaml
@@ -115,7 +115,8 @@ jobs:
           $env:VERBOSE = '1'
 
           $env:CMAKE_BUILD_PARALLEL_LEVEL = $(nproc)
-          cp tomls/pyproject_cuda.toml pyproject.toml
+          $env:CMAKE_ARGS = '-DSD_CUBLAS=ON -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all'
+          $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_CUDA_FORCE_MMQ=ON -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF'
 
           python -m build --wheel
           # write the build tag to the output
diff --git a/.github/workflows/build-wheels-cuda-win.yaml b/.github/workflows/build-wheels-cuda-win.yaml
index b1e33402..1eeb1381 100644
--- a/.github/workflows/build-wheels-cuda-win.yaml
+++ b/.github/workflows/build-wheels-cuda-win.yaml
@@ -115,7 +115,8 @@ jobs:
           }
           $env:VERBOSE = '1'
 
-          cp tomls/pyproject_cuda.toml pyproject.toml
+          $env:CMAKE_ARGS = '-DSD_CUBLAS=ON -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES=all'
+          $env:CMAKE_ARGS = $env:CMAKE_ARGS + ' -DGGML_CUDA_FORCE_MMQ=ON -DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF'
           $env:CMAKE_BUILD_PARALLEL_LEVEL = $(nproc)
 
           python -m build --wheel
diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml
index dc5b1cca..b423c92c 100644
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -30,10 +30,6 @@ jobs:
           # python -m pip install -e .
           python -m pip install build wheel
 
-      - name: Copy pyproject.toml
-        run: |
-          cp tomls/pyproject_metal.toml pyproject.toml
-
       - name: Build wheels
         uses: pypa/cibuildwheel@v2.20.0
         env:
@@ -41,6 +37,7 @@ jobs:
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_ARCHS: "arm64"
           CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=ON -DSD_METAL=ON"
           MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os }}
           CMAKE_BUILD_PARALLEL_LEVEL: 4
         with:
diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 3bdcd138..85c19f7f 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -10,28 +10,37 @@ on:
 
 jobs:
   build:
-    runs-on: ubuntu-latest
+    runs-on: ${{ matrix.os }}
+
+    strategy:
+      matrix:
+        os: [macos-latest, windows-latest]
+        python-version: ["3.10"]  # quoted: a bare 3.10 is parsed as the float 3.1
 
     steps:
     - name: Checkout code
       uses: actions/checkout@v3
       with:
         submodules: recursive  # This will clone the repository with all its submodules
-        fetch-depth: 0    # This fetches all history so you can access any version of the submodules
-
+        fetch-depth: 0  # This fetches all history so you can access any version of the submodules
 
     - name: Set up Python
       uses: actions/setup-python@v4
       with:
-        python-version: '3.10'  # Specify the Python version you want
+        python-version: ${{ matrix.python-version }}
 
     - name: Install dependencies
       run: |
         python -m pip install --upgrade pip
         python -m pip install build pytest
+      shell: bash
+
     - name: Build DLL
       run: |
         python -m pip install -e .
+      shell: bash
+
     - name: Run tests
       run: |
-        python -m pytest tests
\ No newline at end of file
+        python -m pytest tests
+      shell: bash
diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml
index 662cb6f6..bb096098 100644
--- a/.github/workflows/generate-index-from-release.yaml
+++ b/.github/workflows/generate-index-from-release.yaml
@@ -32,14 +32,19 @@ jobs:
     steps:
       - name: Checkout
         uses: actions/checkout@v4
+
       - name: Setup Pages
         uses: actions/configure-pages@v5
+
+      - name: Set execute permissions for script
+        run: chmod +x ./scripts/releases-to-pep-503.sh
+
       - name: Build
         run: |
           ./scripts/releases-to-pep-503.sh index/whl/cpu '^[v]?[0-9]+\.[0-9]+\.[0-9]+$'
-          ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
-          ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
-          ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
+          # ./scripts/releases-to-pep-503.sh index/whl/cu121 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu121$'
+          # ./scripts/releases-to-pep-503.sh index/whl/cu122 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu122$'
+          # ./scripts/releases-to-pep-503.sh index/whl/cu123 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu123$'
           ./scripts/releases-to-pep-503.sh index/whl/cu124 '^[v]?[0-9]+\.[0-9]+\.[0-9]+-cu124$'
           ./scripts/releases-to-pep-503.sh index/whl/metal '^[v]?[0-9]+\.[0-9]+\.[0-9]+-metal$'
       - name: Upload artifact
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 8b4264fa..0b30e6e4 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,13 +1,21 @@
 cmake_minimum_required(VERSION 3.16)
 
 if (GGML_CUDA OR GGML_METAL)
-    set(EMPTY_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib/empty_file.txt")
+    set(SOURCE_EMPTY_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib/empty_file.txt")
     add_custom_command(
-        OUTPUT ${EMPTY_FILE_PATH}
-        COMMAND ${CMAKE_COMMAND} -E touch ${EMPTY_FILE_PATH}
-        COMMENT "Creating an empty file because MY_FEATURE is ON"
+        OUTPUT ${SOURCE_EMPTY_FILE_PATH}
+        COMMAND ${CMAKE_COMMAND} -E touch ${SOURCE_EMPTY_FILE_PATH}
+        COMMENT "Creating an empty file to source folder because gpu option is ON"
+    )
+    set(WHEEL_EMPTY_FILE_PATH "${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib/empty_file.txt")
+    # `cmake -E touch` does not create parent directories, so create the wheel lib dir first.
+    add_custom_command(
+        OUTPUT ${WHEEL_EMPTY_FILE_PATH}
+        COMMAND ${CMAKE_COMMAND} -E make_directory "${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib"
+        COMMAND ${CMAKE_COMMAND} -E touch ${WHEEL_EMPTY_FILE_PATH}
+        COMMENT "Creating an empty file to lib folder because gpu option is ON"
     )    
-    add_custom_target(create_empty_file ALL DEPENDS ${EMPTY_FILE_PATH})
+    add_custom_target(create_empty_file ALL DEPENDS ${SOURCE_EMPTY_FILE_PATH} ${WHEEL_EMPTY_FILE_PATH})
 endif()
 
 # Project: stable_diffusion_cpp
diff --git a/README.md b/README.md
index c0e2b1e2..a10d1d64 100644
--- a/README.md
+++ b/README.md
@@ -97,7 +97,7 @@ Example:
 `docker run -v /home/ubuntu/.cache/nexa/hub/official:/model -it nexa4ai/sdk:latest nexa gen-text /model/Phi-3-mini-128k-instruct/q4_0.gguf`
 
 will create an interactive session with text generation
-```
+
 
 ## Nexa CLI commands
 
diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py
index 819e3dbf..b0086d61 100644
--- a/nexa/gguf/nexa_inference_image.py
+++ b/nexa/gguf/nexa_inference_image.py
@@ -13,7 +13,9 @@
     DEFAULT_IMG_GEN_PARAMS_LCM,
     DEFAULT_IMG_GEN_PARAMS_TURBO,
 )
-from nexa.utils import SpinningCursorAnimation, nexa_prompt, suppress_stdout_stderr
+from nexa.utils import SpinningCursorAnimation, nexa_prompt
+from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
+
 from streamlit.web import cli as stcli
 from nexa.general import pull_model
 
diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py
index 9027e50a..9f0afd60 100644
--- a/nexa/gguf/nexa_inference_text.py
+++ b/nexa/gguf/nexa_inference_text.py
@@ -13,8 +13,10 @@
     NEXA_STOP_WORDS_MAP,
 )
 from nexa.gguf.lib_utils import is_gpu_available
-from nexa.utils import SpinningCursorAnimation, nexa_prompt, suppress_stdout_stderr
 from nexa.general import pull_model
+from nexa.utils import SpinningCursorAnimation, nexa_prompt
+from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
+
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py
index ac886ccf..da8103f5 100644
--- a/nexa/gguf/nexa_inference_vlm.py
+++ b/nexa/gguf/nexa_inference_vlm.py
@@ -24,7 +24,8 @@
     Llava16ChatHandler,
     NanoLlavaChatHandler,
 )
-from nexa.utils import SpinningCursorAnimation, nexa_prompt, suppress_stdout_stderr
+from nexa.utils import SpinningCursorAnimation, nexa_prompt
+from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
 
 logging.basicConfig(
     level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s"
diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py
index 0725f704..ef0a653b 100644
--- a/nexa/gguf/nexa_inference_voice.py
+++ b/nexa/gguf/nexa_inference_voice.py
@@ -10,9 +10,11 @@
     EXIT_REMINDER,
     NEXA_RUN_MODEL_MAP_VOICE,
 )
-from nexa.utils import nexa_prompt
-from nexa.utils import nexa_prompt, SpinningCursorAnimation, suppress_stdout_stderr
 from nexa.general import pull_model
+from nexa.utils import nexa_prompt, SpinningCursorAnimation
+from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
+
+
 logging.basicConfig(level=logging.INFO)
 
 
@@ -69,6 +71,7 @@ def _load_model(self):
 
         logging.debug(f"Loading model from: {self.downloaded_path}")
         with suppress_stdout_stderr():
+            os.environ["KMP_DUPLICATE_LIB_OK"] = "TRUE"
             self.model = WhisperModel(
                 self.downloaded_path,
                 device="cpu",
diff --git a/nexa/onnx/nexa_inference_image.py b/nexa/onnx/nexa_inference_image.py
index 9e10f7bb..6566cbb2 100644
--- a/nexa/onnx/nexa_inference_image.py
+++ b/nexa/onnx/nexa_inference_image.py
@@ -14,7 +14,7 @@
 )
 from nexa.general import pull_model
 from nexa.constants import EXIT_REMINDER, NEXA_RUN_MODEL_MAP_ONNX
-from nexa.utils import nexa_prompt
+from nexa.utils import nexa_prompt, SpinningCursorAnimation
 
 logging.basicConfig(level=logging.INFO)
 
@@ -75,6 +75,7 @@ def run(self):
         self._load_model(self.download_onnx_folder)
         self._dialogue_mode()
 
+    @SpinningCursorAnimation()
     def _load_model(self, model_path):
         """
         Load the model from the given model path using the appropriate pipeline.
@@ -149,7 +150,7 @@ def generate_images(self, prompt, negative_prompt):
         images = self.pipeline(**pipeline_kwargs).images
         return images
 
-            
+
 
     def _save_images(self, images):
         """
diff --git a/nexa/onnx/nexa_inference_text.py b/nexa/onnx/nexa_inference_text.py
index 794efca9..765d26b7 100644
--- a/nexa/onnx/nexa_inference_text.py
+++ b/nexa/onnx/nexa_inference_text.py
@@ -9,7 +9,7 @@
 from transformers import AutoTokenizer, TextStreamer
 from nexa.general import pull_model
 from nexa.constants import NEXA_RUN_MODEL_MAP_ONNX
-from nexa.utils import nexa_prompt
+from nexa.utils import nexa_prompt, SpinningCursorAnimation
 
 logging.basicConfig(level=logging.INFO)
 
@@ -51,6 +51,7 @@ def __init__(self, model_path, local_path=None, **kwargs):
         self.timings = kwargs.get("timings", False)
         self.device = "cpu"
 
+    @SpinningCursorAnimation()
     def _load_model_and_tokenizer(self) -> Tuple[Any, Any, Any, bool]:
         logging.debug(f"Loading model from {self.downloaded_onnx_folder}")
         start_time = time.time()
diff --git a/nexa/utils.py b/nexa/utils.py
index 2483582f..71e9ca7e 100644
--- a/nexa/utils.py
+++ b/nexa/utils.py
@@ -10,9 +10,6 @@
 from prompt_toolkit.styles import Style
 
 from nexa.constants import EXIT_COMMANDS, EXIT_REMINDER
-from nexa.gguf.llama._utils_transformers import (
-    suppress_stdout_stderr,
-)  # re-import, don't comment out
 
 
 def is_package_installed(package_name: str) -> bool:
diff --git a/pyproject.toml b/pyproject.toml
index 74007e06..a51549a7 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build"
 
 [project]
 name = "nexaai"
-version = "0.0.2.dev"
+version = "0.0.5"
 description = "Nexa AI SDK"
 readme = "README.md"
 license = { text = "MIT" }
@@ -36,7 +36,7 @@ classifiers = [
 [project.optional-dependencies]
 onnx = [
     "librosa",
-    "optimum[onnxruntime]>=1.7.3",  # for CPU version
+    "optimum[onnxruntime]",  # for CPU version
     "diffusers",  # required for image generation
     "optuna",
     "pydantic",
@@ -78,7 +78,7 @@ sdist.exclude = [".github", "build", "dist", "nexa.egg-info", "dependency/llama.
 build.verbose = true
 cmake.build-type = "Release"
 cmake.version = ">=3.16"
-cmake.args = ["-DCMAKE_CXX_FLAGS=-fopenmp"]
+# cmake.args = ["-DCMAKE_CXX_FLAGS=-fopenmp"]
 
 [tool.pytest.ini_options]
 testpaths = ["tests"]
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index a05f5892..5dfbadee 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,6 @@ fastapi
 uvicorn
 pydantic
 pillow
-prompt_toolkit
 
 # For onnx
 optimum[onnxruntime]  # for CPU version
@@ -31,6 +30,7 @@ transformers
 ttstokenizer
 
 # Shared dependencies
+prompt_toolkit
 tqdm
 tabulate
 streamlit
diff --git a/scripts/releases-to-pep-503.sh b/scripts/releases-to-pep-503.sh
index 05195868..9d2dc312 100644
--- a/scripts/releases-to-pep-503.sh
+++ b/scripts/releases-to-pep-503.sh
@@ -1,7 +1,7 @@
 #!/bin/bash
 
 package_name="nexaai"
-repo_name="nexaai-sdk-cpp"
+repo_name="nexa-sdk"
 
 # Get output directory or default to index/whl/cpu
 output_dir=${1:-"index/whl/cpu"}