diff --git a/.github/workflows/build-wheels-cpu-macos.yaml b/.github/workflows/build-wheels-cpu-macos.yaml
index f1e1fc18..30e508e5 100644
--- a/.github/workflows/build-wheels-cpu-macos.yaml
+++ b/.github/workflows/build-wheels-cpu-macos.yaml
@@ -8,7 +8,7 @@ permissions:
 jobs:
 
   build_wheels_macos:
-    name: Build wheels on  macos-${{ matrix.os }}
+    name: Build wheels on macos-${{ matrix.os }}
     runs-on: macos-${{ matrix.os }}
     strategy:
       matrix:
@@ -28,7 +28,6 @@ jobs:
       - name: Install dependencies
         run: |
           python -m pip install --upgrade pip
-          # python -m pip install -e .
           python -m pip install build wheel
 
       - name: Build wheels
@@ -37,8 +36,8 @@ jobs:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_BUILD_FRONTEND: "build"
-          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64 "
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64;x86_64"
+          CIBW_BUILD: "cp37-* cp38-* cp39-* cp310-* cp311-* cp312-*"
           MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os }}
           CMAKE_BUILD_PARALLEL_LEVEL: 4
         with:
diff --git a/.github/workflows/build-wheels-cpu.yaml b/.github/workflows/build-wheels-cpu.yaml
index f86ec878..4dc36265 100644
--- a/.github/workflows/build-wheels-cpu.yaml
+++ b/.github/workflows/build-wheels-cpu.yaml
@@ -37,7 +37,7 @@ jobs:
   #         CIBW_REPAIR_WHEEL_COMMAND: ""
   #         CIBW_BUILD_FRONTEND: "build"
   #         CIBW_SKIP: "*musllinux*"
-  #         CIBW_BUILD: "cp310-* cp311-* cp312-*"
+  #         CIBW_BUILD: "cp37-* cp38-* cp39-* cp310-* cp311-* cp312-*"
   #         CMAKE_BUILD_PARALLEL_LEVEL: 4
   #       with:
   #         package-dir: .
@@ -79,7 +79,7 @@ jobs:
           # disable repair
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_BUILD_FRONTEND: "build"
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD: "cp37-* cp38-* cp39-* cp310-* cp311-* cp312-*"
           CMAKE_BUILD_PARALLEL_LEVEL: 16
         with:
           package-dir: .
@@ -110,7 +110,7 @@ jobs:
           CIBW_BUILD_FRONTEND: "build"
           CIBW_SKIP: "*musllinux* pp*"
           CIBW_ARCHS: "aarch64"
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD: "cp37-* cp38-* cp39-* cp310-* cp311-* cp312-*"
           CMAKE_BUILD_PARALLEL_LEVEL: $(nproc)
         with:
           output-dir: wheelhouse
diff --git a/.github/workflows/build-wheels-metal.yaml b/.github/workflows/build-wheels-metal.yaml
index 930aad72..bea2fd49 100644
--- a/.github/workflows/build-wheels-metal.yaml
+++ b/.github/workflows/build-wheels-metal.yaml
@@ -37,7 +37,7 @@ jobs:
           CIBW_REPAIR_WHEEL_COMMAND: ""
           CIBW_BUILD_FRONTEND: "build"
           CIBW_ARCHS: "arm64"
-          CIBW_BUILD: "cp310-* cp311-* cp312-*"
+          CIBW_BUILD: "cp37-* cp38-* cp39-* cp310-* cp311-* cp312-*"
           CIBW_ENVIRONMENT: CMAKE_ARGS="-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64 -DGGML_METAL=ON -DSD_METAL=ON"
           MACOSX_DEPLOYMENT_TARGET: ${{ matrix.os }}
           CMAKE_BUILD_PARALLEL_LEVEL: 4
diff --git a/.gitignore b/.gitignore
index 047b7b5e..00d554d9 100644
--- a/.gitignore
+++ b/.gitignore
@@ -15,6 +15,7 @@ build*.sh
 *.dll
 *.dylib
 *.a
+generated_images/
 
 # Python
 __pycache__/
diff --git a/README.md b/README.md
index 6491a99b..48b4a3b1 100644
--- a/README.md
+++ b/README.md
@@ -65,7 +65,7 @@ We have released pre-built wheels for various Python versions, platforms, and ba
 #### CPU
 
 ```bash
-pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple
+pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple --no-cache-dir
 ```
 
 #### GPU (Metal)
@@ -73,52 +73,31 @@ pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra
 For the GPU version supporting **Metal (macOS)**:
 
 ```bash
-CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple
+CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir
 ```
 
-#### GPU (CUDA)
-
-For the GPU version supporting **CUDA (Linux/Windows)**:
-
-```bash
-CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple
+<details>
+<summary><strong>FAQ: cannot use Metal/GPU on M1</strong></summary>
+Try the following commands:
 ```
-
-> [!NOTE]
-> The CUDA wheels are built with CUDA 12.4, but should be compatible with all CUDA 12.X
-
-
-#### GPU (Metal)
-
-For the GPU version supporting Metal (macOS):
-
-```bash
-CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai
+wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-MacOSX-arm64.sh
+bash Miniforge3-MacOSX-arm64.sh
+conda create -n llama python=3.10
+conda activate llama
+CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir
 ```
+</details>
 
 #### GPU (CUDA)
 
-For the GPU version supporting CUDA (Linux/Windows), run the following command:
+For the GPU version supporting **CUDA (Linux/Windows)**:
 
 ```bash
-CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai
+CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir
 ```
 
-> [!TIP]
-> You can accelerate the building process via parallel cmake by appending the following to the commands above:
->
-> ```bash
-> CMAKE_BUILD_PARALLEL_LEVEL=$(nproc)
-> ```
->
-> For example:
->
-> ```bash
-> CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL
-> ```
-
-> [!TIP]
-> For Windows users, we recommend running the installation command in Git Bash to avoid unexpected behavior.
+> [!NOTE]
+> The CUDA wheels are built with CUDA 12.4, but should be compatible with all CUDA 12.X
 
 
 <details>
diff --git a/docs/.media/error.jpeg b/docs/.media/error.jpeg
new file mode 100644
index 00000000..1e374e19
Binary files /dev/null and b/docs/.media/error.jpeg differ
diff --git a/docs/cmd_macos.sh b/docs/cmd_macos.sh
deleted file mode 100644
index 0fbdae18..00000000
--- a/docs/cmd_macos.sh
+++ /dev/null
@@ -1,24 +0,0 @@
-#!/bin/bash
-
-cd "$(dirname "${BASH_SOURCE[0]}")"
-
-if [[ "$(pwd)" =~ " " ]]; then echo This script relies on Miniconda which can not be silently installed under a path with spaces. && exit; fi
-
-# deactivate existing conda envs as needed to avoid conflicts
-{ conda deactivate && conda deactivate && conda deactivate; } 2> /dev/null
-
-# config
-CONDA_ROOT_PREFIX="$(pwd)/installer_files/conda"
-INSTALL_ENV_DIR="$(pwd)/installer_files/env"
-
-# environment isolation
-export PYTHONNOUSERSITE=1
-unset PYTHONPATH
-unset PYTHONHOME
-export CUDA_PATH="$INSTALL_ENV_DIR"
-export CUDA_HOME="$CUDA_PATH"
-
-# activate env
-source $CONDA_ROOT_PREFIX/etc/profile.d/conda.sh
-conda activate $INSTALL_ENV_DIR
-exec bash --norc
\ No newline at end of file
diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py
index f12054f1..a2352607 100644
--- a/nexa/gguf/nexa_inference_text.py
+++ b/nexa/gguf/nexa_inference_text.py
@@ -96,7 +96,7 @@ def create_embedding(
 
     @SpinningCursorAnimation()
     def _load_model(self):
-        logging.debug(f"Loading model from {self.downloaded_path}")
+        logging.debug(f"Loading model from {self.downloaded_path}, use_cuda_or_metal : {is_gpu_available()}")
         start_time = time.time()
         with suppress_stdout_stderr():
             try:
diff --git a/pyproject.toml b/pyproject.toml
index c4f8f9df..f54a8e27 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -32,6 +32,8 @@ dependencies = [
 ]
 classifiers = [
     "Programming Language :: Python :: 3",
+    "Programming Language :: Python :: 3.7",
+    "Programming Language :: Python :: 3.8",
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",