From 2b30a915d2491476cad7aea6cf43bf4fb11e832b Mon Sep 17 00:00:00 2001
From: Yu xing
Date: Tue, 20 Aug 2024 23:21:06 -0700
Subject: [PATCH 01/31] update ci

---
 .github/workflows/ci.yaml     | 37 +++++++++++++++++++++++++++++++++++
 tests/test_text_generation.py |  4 ++--
 tests/test_vlm.py             |  3 ++-
 3 files changed, 41 insertions(+), 3 deletions(-)
 create mode 100644 .github/workflows/ci.yaml

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
new file mode 100644
index 00000000..35568ec0
--- /dev/null
+++ b/.github/workflows/ci.yaml
@@ -0,0 +1,37 @@
+name: Python CI
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ubuntu-latest
+
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v3
+        with:
+          submodules: recursive # This will clone the repository with all its submodules
+          fetch-depth: 0 # This fetches all history so you can access any version of the submodules
+
+
+      - name: Set up Python
+        uses: actions/setup-python@v4
+        with:
+          python-version: '3.10' # Specify the Python version you want
+
+      - name: Install dependencies
+        run: |
+          python -m pip install --upgrade pip
+          pip install build pytest
+      - name: Build DLL
+        run: |
+          pip install -e .
+      - name: Run tests
+        run: |
+          python -m pytest tests
\ No newline at end of file
diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py
index f37a4781..04782a21 100644
--- a/tests/test_text_generation.py
+++ b/tests/test_text_generation.py
@@ -1,7 +1,7 @@
 import os
 from nexa.gguf.llama import llama
 from tests.utils import download_model
-
+from nexa.gguf.lib_utils import is_gpu_available
 # Constants
 TINY_LLAMA_URL = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_0.gguf"
 OUTPUT_DIR = os.getcwd()
@@ -12,7 +12,7 @@ def init_llama_model(verbose=False, n_gpu_layers=-1, chat_format=None, embedding
     return llama.Llama(
         model_path=MODEL_PATH,
         verbose=verbose,
-        n_gpu_layers=n_gpu_layers,
+        n_gpu_layers=n_gpu_layers if is_gpu_available() else 0,
         chat_format=chat_format,
         embedding=embedding,
     )
diff --git a/tests/test_vlm.py b/tests/test_vlm.py
index b70389be..25d81d56 100644
--- a/tests/test_vlm.py
+++ b/tests/test_vlm.py
@@ -4,6 +4,7 @@
 from nexa.gguf.llama import llama
 from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler
 from tests.utils import download_model
+from nexa.gguf.lib_utils import is_gpu_available
 
 def image_to_base64_data_uri(file_path):
     """
@@ -31,7 +32,7 @@ def test_image_generation():
         model_path=model_path,
         chat_handler=chat_handler,
         n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
-        n_gpu_layers=-1, # Uncomment to use GPU acceleration
+        n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
         verbose=False,
     )
     output = llm.create_chat_completion(

From 220dbc07bcb47332df974e5a7b5e42d38b427536 Mon Sep 17 00:00:00 2001
From: Yu xing
Date: Tue, 20 Aug 2024 23:54:40 -0700
Subject: [PATCH 02/31] use tempfile and try to fix ci

---
 tests/test_vlm.py | 85 +++++++++++++++++++++++------------------------
 1 file changed, 42 insertions(+), 43 deletions(-)

diff --git a/tests/test_vlm.py b/tests/test_vlm.py
index 25d81d56..17400bd4 100644
--- a/tests/test_vlm.py
+++ b/tests/test_vlm.py
@@ -5,6 +5,7 @@
 from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler
 from tests.utils import download_model
 from nexa.gguf.lib_utils import is_gpu_available
+import tempfile
 
 def image_to_base64_data_uri(file_path):
     """
@@ -15,53 +16,51 @@ def image_to_base64_data_uri(file_path):
         base64_data = base64.b64encode(img_file.read()).decode("utf-8")
         return f"data:image/png;base64,{base64_data}"
 
-model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf"
-mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf"
-# Download paths
-output_dir = os.getcwd()
-model_path = download_model(model_url, output_dir)
-mmproj_path = download_model(mmproj_url, output_dir)
-print("Model downloaded to:", model_path)
-print("MMProj downloaded to:", mmproj_path)
+def test_image_generation():
+    with tempfile.TemporaryDirectory() as temp_dir:
+        temp_dir = os.path.dirname(os.path.abspath(__file__))
+        model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf"
+        mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf"
 
-chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path)
+        model_path = download_model(model_url, temp_dir)
+        mmproj_path = download_model(mmproj_url, temp_dir)
+        chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path)
 
-def test_image_generation():
-    llm = llama.Llama(
-        model_path=model_path,
-        chat_handler=chat_handler,
-        n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
-        n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
-        verbose=False,
-    )
-    output = llm.create_chat_completion(
-        messages=[
-            {
-                "role": "system",
-                "content": "You are an assistant who perfectly describes images.",
-            },
-            {
-                "role": "user",
-                "content": [
-                    {"type": "text", "text": "What's in this image?"},
-                    {
-                        "type": "image_url",
-                        "image_url": {
-                            "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-                        },
-                    },
-                ],
-            },
-        ],
-        stream=True,
-    )
-    for chunk in output:
-        delta = chunk["choices"][0]["delta"]
-        if "role" in delta:
-            print(delta["role"], end=": ")
-        elif "content" in delta:
-            print(delta["content"], end="")
+        llm = llama.Llama(
+            model_path=model_path,
+            chat_handler=chat_handler,
+            n_ctx=2048, # n_ctx should be increased to accommodate the image embedding
+            n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration
+            verbose=False,
+        )
+        output = llm.create_chat_completion(
+            messages=[
+                {
+                    "role": "system",
+                    "content": "You are an assistant who perfectly describes images.",
+                },
+                {
+                    "role": "user",
+                    "content": [
+                        {"type": "text", "text": "What's in this image?"},
+                        {
+                            "type": "image_url",
+                            "image_url": {
+                                "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
+                            },
+                        },
+                    ],
+                },
+            ],
+            stream=True,
+        )
+        for chunk in output:
+            delta = chunk["choices"][0]["delta"]
+            if "role" in delta:
+                print(delta["role"], end=": ")
+            elif "content" in delta:
+                print(delta["content"], end="")
 
 
 # if __name__ == "__main__":

From 7047e1f91848c38f0091b138a1e631ada644da7c Mon Sep 17 00:00:00 2001
From: Yu xing
Date: Wed, 21 Aug 2024 00:38:27 -0700
Subject: [PATCH 03/31] update ci

---
 .github/workflows/ci.yaml | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml
index 35568ec0..bde784ed 100644
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -28,10 +28,11 @@ jobs:
       - name: 
Install dependencies run: | python -m pip install --upgrade pip - pip install build pytest + python -m pip install numpy --upgrade + python -m pip install build pytest - name: Build DLL run: | - pip install -e . + python -m pip install -e . - name: Run tests run: | python -m pytest tests \ No newline at end of file From 45751159c3b3c43776adaf286a73a372d7636ec2 Mon Sep 17 00:00:00 2001 From: Zack Li Date: Wed, 21 Aug 2024 17:33:31 +0000 Subject: [PATCH 04/31] wip --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 12c9232e..d290b249 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "nexaai" -version = "0.0.1.dev" +version = "0.0.2.dev" description = "Nexa AI SDK" readme = "README.md" license = { text = "MIT" } From 663f20269fee0e5f763421450f7047e62642f0d5 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 12:57:37 -0700 Subject: [PATCH 05/31] update ci --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bde784ed..0e17e44d 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,7 +10,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: macos-latest steps: - name: Checkout code From dd49f40a98805ff3092f008a2e563dc997bdb7ba Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 13:26:48 -0700 Subject: [PATCH 06/31] expose gguf interface --- nexa/gguf/nexa_inference_image.py | 124 +++++++++++++++++------------- nexa/gguf/nexa_inference_text.py | 65 +++++++++++++--- nexa/gguf/nexa_inference_vlm.py | 25 ++++++ nexa/gguf/nexa_inference_voice.py | 101 +++++++++++++++++++++--- 4 files changed, 238 insertions(+), 77 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 21714fad..473b9f2f 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -29,21 +29,22 @@ class NexaImageInference: A class used for loading image models and running image generation. Methods: - run_txt2img: Run the text-to-image generation loop. - run_img2img: Run the image-to-image generation loop. - run_streamlit: Run the Streamlit UI. + run_txt2img: Run the text-to-image generation loop. + run_img2img: Run the image-to-image generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - num_inference_steps (int): Number of inference steps. - width (int): Width of the output image. - height (int): Height of the output image. - guidance_scale (float): Guidance scale for diffusion. - output_path (str): Output path for the generated image. - random_seed (int): Random seed for image generation. - streamlit (bool): Run the inference in Streamlit UI. + model_path (str): Path or identifier for the model in Nexa Model Hub. + num_inference_steps (int): Number of inference steps. + width (int): Width of the output image. + height (int): Height of the output image. + guidance_scale (float): Guidance scale for diffusion. + output_path (str): Output path for the generated image. + random_seed (int): Random seed for image generation. + streamlit (bool): Run the inference in Streamlit UI. 
""" + from nexa.gguf.sd.stable_diffusion import StableDiffusion def __init__(self, model_path, **kwargs): self.model_path = None @@ -107,63 +108,75 @@ def _save_images(self, images): file_path = os.path.join(output_dir, file_name) image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") + + def txt2img(self, prompt, negative_prompt): + """ + Used for SDK. Generate images from text. + + Args: + prompt (str): Prompt for the image generation. + negative_prompt (str): Negative prompt for the image generation. - def loop_txt2img(self): + Returns: + list: List of generated images. + """ + images = self.model.txt_to_img( + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else "", + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) + return images + def run_txt2img(self): while True: try: prompt = nexa_prompt("Enter your prompt: ") negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - self._txt2img(prompt, negative_prompt) + try: + images = self.txt2img(prompt, negative_prompt) + self._save_images(images) + except Exception as e: + logging.error(f"Error during text to image generation: {e}") except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def _txt2img(self, prompt: str, negative_prompt: str): + def img2img(self, image_path, prompt, negative_prompt): """ - Generate images based on the given prompt, negative prompt, and parameters. - """ - try: - images = self.model.txt_to_img( - prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), - ) - self._save_images(images) - except Exception as e: - logging.error(f"Error during image generation: {e}") + Used for SDK. Generate images from an image. - def loop_img2img(self): - def _generate_images(image_path, prompt, negative_prompt): - """ - Generate images based on the given prompt, negative prompt, and parameters. - """ - try: - images = self.model.img_to_img( - image=image_path, - prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), - ) - self._save_images(images) - except Exception as e: - logging.error(f"Error during image generation: {e}") + Args: + image_path (str): Path to the input image. + prompt (str): Prompt for the image generation. + negative_prompt (str): Negative prompt for the image generation. + Returns: + list: List of generated images. 
+ """ + images = self.model.img_to_img( + image=image_path, + prompt=prompt, + negative_prompt=negative_prompt if negative_prompt else "", + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) + return images + + def run_img2img(self): while True: try: image_path = nexa_prompt("Enter the path to your image: ") @@ -171,7 +184,8 @@ def _generate_images(image_path, prompt, negative_prompt): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - _generate_images(image_path, prompt, negative_prompt) + images = self.img2img(image_path, prompt, negative_prompt) + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: @@ -257,6 +271,6 @@ def run_streamlit(self, model_path: str): inference.run_streamlit(model_path) else: if args.img2img: - inference.loop_img2img() + inference.run_img2img() else: - inference.loop_txt2img() + inference.run_txt2img() \ No newline at end of file diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 210e0267..4f3e933f 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -14,7 +14,6 @@ ) from nexa.general import pull_model from nexa.gguf.lib_utils import is_gpu_available -from nexa.gguf.llama.llama import Llama from nexa.utils import SpinningCursorAnimation, nexa_prompt, suppress_stdout_stderr logging.basicConfig( @@ -27,19 +26,20 @@ class NexaTextInference: A class used for load text models and run text generation. Methods: - run: Run the text generation loop. - run_streamlit: Run the Streamlit UI. + run: Run the text generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - stop_words (list): List of stop words for early stopping. - profiling (bool): Enable timing measurements for the generation process. - streamlit (bool): Run the inference in Streamlit UI. - temperature (float): Temperature for sampling. - max_new_tokens (int): Maximum number of new tokens to generate. - top_k (int): Top-k sampling parameter. - top_p (float): Top-p sampling parameter + model_path (str): Path or identifier for the model in Nexa Model Hub. + stop_words (list): List of stop words for early stopping. + profiling (bool): Enable timing measurements for the generation process. + streamlit (bool): Run the inference in Streamlit UI. + temperature (float): Temperature for sampling. + max_new_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter """ + from nexa.gguf.llama import Llama def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -132,6 +132,9 @@ def _load_model(self): self.conversation_history = [] if self.chat_format else None def run(self): + """ + CLI interactive session. Not for SDK. + """ while True: generated_text = "" try: @@ -177,6 +180,44 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") + + def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None): + """ + Used for SDK. Generate completion for a chat conversation. 
+ + Args: + messages (list): List of messages in the conversation. + temperature (float): Temperature for sampling. + max_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter. + stream (bool): Stream the output. + stop (list): List of stop words for early stopping. + + Returns: + Iterator: Iterator for the completion. + """ + return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop) + + def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None): + """ + Used for SDK. Generate completion for a given prompt. + + Args: + prompt (str): Prompt for the completion. + temperature (float): Temperature for sampling. + max_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter. + echo (bool): Echo the prompt back in the output. + stream (bool): Stream the output. + stop (list): List of stop words for early stopping. + + Returns: + Iterator: Iterator for the completion. + """ + return self.model.create_completion(prompt=prompt, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, echo=echo, stream=stream, stop=stop) + def _chat(self, user_input: str) -> Iterator: self.conversation_history.append({"role": "user", "content": user_input}) @@ -209,7 +250,7 @@ def _complete(self, user_input: str) -> Iterator: def run_streamlit(self, model_path: str): """ - Run the Streamlit UI. + Used for CLI. Run the Streamlit UI. """ logging.info("Running Streamlit UI...") diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 2d9e39e1..157bf28e 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -79,6 +79,8 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ + from nexa.gguf.llama.llama import Llama + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -216,6 +218,29 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") + + def create_chat_completion(self, messages, temperature, max_tokens, top_k, top_p, stream, stop): + """ + Generate text completion for a given chat prompt. + + Args: + messages (list): List of messages in the chat prompt. + temperature (float): Temperature for sampling. + max_tokens (int): Maximum number of tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter. + stream (bool): Stream the output. + stop (list): List of stop words for early stopping. + """ + return self.model.create_chat_completion( + messages=messages, + temperature=temperature, + max_tokens=max_tokens, + top_k=top_k, + top_p=top_p, + stream=stream, + stop=stop, + ) def _chat(self, user_input: str, image_path: str = None) -> Iterator: data_uri = image_to_base64_data_uri(image_path) if image_path else None diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 7822344d..3ab103d6 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -19,18 +19,18 @@ class NexaVoiceInference: A class used for loading voice models and running voice transcription. Methods: - run: Run the voice transcription loop. 
- run_streamlit: Run the Streamlit UI. + run: Run the voice transcription loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - output_dir (str): Output directory for transcriptions. - beam_size (int): Beam size to use for transcription. - language (str): The language spoken in the audio. - task (str): Task to execute (transcribe or translate). - temperature (float): Temperature for sampling. - compute_type (str): Type to use for computation (e.g., float16, int8, int8_float16). - output_dir (str): Output directory for transcriptions. + model_path (str): Path or identifier for the model in Nexa Model Hub. + output_dir (str): Output directory for transcriptions. + beam_size (int): Beam size to use for transcription. + language (str): The language spoken in the audio. + task (str): Task to execute (transcribe or translate). + temperature (float): Temperature for sampling. + compute_type (str): Type to use for computation (e.g., float16, int8, int8_float16). + output_dir (str): Output directory for transcriptions. """ def __init__(self, model_path, **kwargs): @@ -87,6 +87,87 @@ def run(self): print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) + + def transcribe(self, audio, **kwargs): + """ + Transcribe the audio file. + + Arguments: + audio: Path to the input file (or a file-like object), or the audio waveform. + language: The language spoken in the audio. It should be a language code such + as "en" or "fr". If not set, the language will be detected in the first 30 seconds + of audio. + task: Task to execute (transcribe or translate). + beam_size: Beam size to use for decoding. + best_of: Number of candidates when sampling with non-zero temperature. + patience: Beam search patience factor. + length_penalty: Exponential length penalty constant. + repetition_penalty: Penalty applied to the score of previously generated tokens + (set > 1 to penalize). + no_repeat_ngram_size: Prevent repetitions of ngrams with this size (set 0 to disable). + temperature: Temperature for sampling. It can be a tuple of temperatures, + which will be successively used upon failures according to either + `compression_ratio_threshold` or `log_prob_threshold`. + compression_ratio_threshold: If the gzip compression ratio is above this value, + treat as failed. + log_prob_threshold: If the average log probability over sampled tokens is + below this value, treat as failed. + no_speech_threshold: If the no_speech probability is higher than this value AND + the average log probability over sampled tokens is below `log_prob_threshold`, + consider the segment as silent. + condition_on_previous_text: If True, the previous output of the model is provided + as a prompt for the next window; disabling may make the text inconsistent across + windows, but the model becomes less prone to getting stuck in a failure loop, + such as repetition looping or timestamps going out of sync. + prompt_reset_on_temperature: Resets prompt if temperature is above this value. + Arg has effect only if condition_on_previous_text is True. + initial_prompt: Optional text string or iterable of token ids to provide as a + prompt for the first window. + prefix: Optional text to provide as a prefix for the first window. + suppress_blank: Suppress blank outputs at the beginning of the sampling. + suppress_tokens: List of token IDs to suppress. -1 will suppress a default set + of symbols as defined in the model config.json file. 
+ without_timestamps: Only sample text tokens. + max_initial_timestamp: The initial timestamp cannot be later than this. + word_timestamps: Extract word-level timestamps using the cross-attention pattern + and dynamic time warping, and include the timestamps for each word in each segment. + prepend_punctuations: If word_timestamps is True, merge these punctuation symbols + with the next word + append_punctuations: If word_timestamps is True, merge these punctuation symbols + with the previous word + vad_filter: Enable the voice activity detection (VAD) to filter out parts of the audio + without speech. This step is using the Silero VAD model + https://github.com/snakers4/silero-vad. + vad_parameters: Dictionary of Silero VAD parameters or VadOptions class (see available + parameters and default values in the class `VadOptions`). + max_new_tokens: Maximum number of new tokens to generate per-chunk. If not set, + the maximum will be set by the default max_length. + chunk_length: The length of audio segments. If it is not None, it will overwrite the + default chunk_length of the FeatureExtractor. + clip_timestamps: + Comma-separated list start,end,start,end,... timestamps (in seconds) of clips to + process. The last end timestamp defaults to the end of the file. + vad_filter will be ignored if clip_timestamps is used. + hallucination_silence_threshold: + When word_timestamps is True, skip silent periods longer than this threshold + (in seconds) when a possible hallucination is detected + hotwords: + Hotwords/hint phrases to provide the model with. Has no effect if prefix is not None. + language_detection_threshold: If the maximum probability of the language tokens is higher + than this value, the language is detected. + language_detection_segments: Number of segments to consider for the language detection. + + Returns: + A tuple with: + + - a generator over transcribed segments + - an instance of TranscriptionInfo + """ + return self.model.transcribe( + audio, + **kwargs, + ) + def _transcribe_audio(self, audio_path): logging.debug(f"Transcribing audio from: {audio_path}") From a0b2358de852af578c56c05cf1961ed7bb09425f Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 13:54:01 -0700 Subject: [PATCH 07/31] update onnx interface --- nexa/gguf/nexa_inference_text.py | 2 +- nexa/onnx/nexa_inference_image.py | 54 +++++++++++++++++-------------- nexa/onnx/nexa_inference_text.py | 20 ++++++------ nexa/onnx/nexa_inference_tts.py | 35 +++++++++++++------- 4 files changed, 64 insertions(+), 47 deletions(-) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 4f3e933f..23804abf 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -39,7 +39,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. 
top_p (float): Top-p sampling parameter """ - from nexa.gguf.llama import Llama + from nexa.gguf.llama.llama import Llama def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) diff --git a/nexa/onnx/nexa_inference_image.py b/nexa/onnx/nexa_inference_image.py index 0a6e5f54..cf676255 100644 --- a/nexa/onnx/nexa_inference_image.py +++ b/nexa/onnx/nexa_inference_image.py @@ -105,15 +105,23 @@ def _dialogue_mode(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - self._generate_images(prompt, negative_prompt) + images = self.generate_images(prompt, negative_prompt) + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - def _generate_images(self, prompt, negative_prompt): + def generate_images(self, prompt, negative_prompt): """ - Generate images based on the given prompt, negative prompt, and parameters. + Used for SDK. Generate images based on the given prompt, negative prompt, and parameters. + + Arg: + prompt (str): Prompt for the image generation. + negative_prompt (str): Negative prompt for the image generation. + + Returns: + list: List of generated images. """ if self.pipeline is None: logging.error("Model not loaded. Exiting.") @@ -121,28 +129,26 @@ def _generate_images(self, prompt, negative_prompt): generator = np.random.RandomState(self.params["random_seed"]) - try: - is_lcm_pipeline = isinstance( - self.pipeline, ORTLatentConsistencyModelPipeline - ) + is_lcm_pipeline = isinstance( + self.pipeline, ORTLatentConsistencyModelPipeline + ) - pipeline_kwargs = { - "prompt": prompt, - "num_inference_steps": self.params["num_inference_steps"], - "num_images_per_prompt": self.params["num_images_per_prompt"], - "height": self.params["height"], - "width": self.params["width"], - "generator": generator, - "guidance_scale": self.params["guidance_scale"], - } - if not is_lcm_pipeline and negative_prompt: - pipeline_kwargs["negative_prompt"] = negative_prompt - - images = self.pipeline(**pipeline_kwargs).images - - self._save_images(images) - except Exception as e: - logging.error(f"Error during image generation: {e}") + pipeline_kwargs = { + "prompt": prompt, + "num_inference_steps": self.params["num_inference_steps"], + "num_images_per_prompt": self.params["num_images_per_prompt"], + "height": self.params["height"], + "width": self.params["width"], + "generator": generator, + "guidance_scale": self.params["guidance_scale"], + } + if not is_lcm_pipeline and negative_prompt: + pipeline_kwargs["negative_prompt"] = negative_prompt + + images = self.pipeline(**pipeline_kwargs).images + return images + + def _save_images(self, images): """ diff --git a/nexa/onnx/nexa_inference_text.py b/nexa/onnx/nexa_inference_text.py index f2f94a3c..56f5c09b 100644 --- a/nexa/onnx/nexa_inference_text.py +++ b/nexa/onnx/nexa_inference_text.py @@ -20,18 +20,18 @@ class NexaTextInference: A class used for load text models and run text generation. Methods: - run: Run the text generation loop. - run_streamlit: Run the Streamlit UI. + run: Run the text generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - profiling (bool): Enable timing measurements for the generation process. - streamlit (bool): Run the inference in Streamlit UI. - temperature (float): Temperature for sampling. 
- min_new_tokens (int): Minimum number of new tokens to generate. - max_new_tokens (int): Maximum number of new tokens to generate. - top_k (int): Top-k sampling parameter. - top_p (float): Top-p sampling parameter + model_path (str): Path or identifier for the model in Nexa Model Hub. + profiling (bool): Enable timing measurements for the generation process. + streamlit (bool): Run the inference in Streamlit UI. + temperature (float): Temperature for sampling. + min_new_tokens (int): Minimum number of new tokens to generate. + max_new_tokens (int): Maximum number of new tokens to generate. + top_k (int): Top-k sampling parameter. + top_p (float): Top-p sampling parameter """ def __init__(self, model_path, **kwargs): diff --git a/nexa/onnx/nexa_inference_tts.py b/nexa/onnx/nexa_inference_tts.py index e7167ee6..ff7093d6 100644 --- a/nexa/onnx/nexa_inference_tts.py +++ b/nexa/onnx/nexa_inference_tts.py @@ -23,14 +23,14 @@ class NexaTTSInference: A class used for loading text-to-speech models and running text-to-speech generation. Methods: - run: Run the text-to-speech generation loop. - run_streamlit: Run the Streamlit UI. + run: Run the text-to-speech generation loop. + run_streamlit: Run the Streamlit UI. Args: - model_path (str): Path or identifier for the model in Nexa Model Hub. - output_dir (str): Output directory for tts. - sampling_rate (int): Sampling rate for audio processing. - streamlit (bool): Run the inference in Streamlit UI. + model_path (str): Path or identifier for the model in Nexa Model Hub. + output_dir (str): Output directory for tts. + sampling_rate (int): Sampling rate for audio processing. + streamlit (bool): Run the inference in Streamlit UI. """ def __init__(self, model_path, **kwargs): @@ -71,19 +71,30 @@ def run(self): while True: try: user_input = nexa_prompt("Enter text to generate audio: ") - self._audio_generation(user_input) + outputs = self.audio_generation(user_input) + self._save_audio( + outputs[0], self.params["sampling_rate"], self.params["output_path"] + ) + logging.info(f"Audio saved to {self.params['output_path']}") except KeyboardInterrupt: print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - def _audio_generation(self, user_input): + def audio_generation(self, user_input): + """ + Used for SDK. Generate audio from the user input. + + Args: + user_input (str): User input for audio generation. + + Returns: + np.array: Audio data. 
+ """ inputs = self.tokenizer(user_input) outputs = self.model.run(None, {"text": inputs}) - self._save_audio( - outputs[0], self.params["sampling_rate"], self.params["output_path"] - ) - logging.info(f"Audio saved to {self.params['output_path']}") + return outputs + def _save_audio(self, audio_data, sampling_rate, output_path): os.makedirs(output_path, exist_ok=True) From c9f3d4fd8d232d124818d7f95986588f474a0652 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 13:59:30 -0700 Subject: [PATCH 08/31] move import position --- nexa/gguf/nexa_inference_image.py | 4 ++-- nexa/gguf/nexa_inference_text.py | 3 ++- nexa/gguf/nexa_inference_vlm.py | 4 ++-- nexa/gguf/nexa_inference_voice.py | 3 +-- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 473b9f2f..3e2123ad 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -6,7 +6,6 @@ import time from pathlib import Path -from nexa.gguf.sd.stable_diffusion import StableDiffusion from nexa.general import pull_model from nexa.constants import ( DEFAULT_IMG_GEN_PARAMS, @@ -44,7 +43,7 @@ class NexaImageInference: streamlit (bool): Run the inference in Streamlit UI. """ - from nexa.gguf.sd.stable_diffusion import StableDiffusion + def __init__(self, model_path, **kwargs): self.model_path = None @@ -85,6 +84,7 @@ def __init__(self, model_path, **kwargs): @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): + from nexa.gguf.sd.stable_diffusion import StableDiffusion self.model = StableDiffusion( model_path=self.downloaded_path, lora_model_dir=self.params.get("lora_dir", ""), diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 23804abf..9c9a3c32 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -39,7 +39,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - from nexa.gguf.llama.llama import Llama + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -110,6 +110,7 @@ def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() with suppress_stdout_stderr(): + from nexa.gguf.llama.llama import Llama self.model = Llama( model_path=self.downloaded_path, verbose=self.profiling, diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 157bf28e..1e2ab005 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -19,7 +19,6 @@ ) from nexa.general import pull_model from nexa.gguf.lib_utils import is_gpu_available -from nexa.gguf.llama.llama import Llama from nexa.gguf.llama.llama_chat_format import ( Llava15ChatHandler, Llava16ChatHandler, @@ -79,7 +78,7 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. 
top_p (float): Top-p sampling parameter """ - from nexa.gguf.llama.llama import Llama + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS @@ -151,6 +150,7 @@ def _load_model(self): if self.projector_downloaded_path else None ) + from nexa.gguf.llama.llama import Llama self.model = Llama( model_path=self.downloaded_path, chat_handler=self.projector, diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 3ab103d6..372a72f6 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -7,9 +7,8 @@ from nexa.constants import EXIT_REMINDER, NEXA_RUN_MODEL_MAP_VOICE, DEFAULT_VOICE_GEN_PARAMS from nexa.general import pull_model -from nexa.utils import nexa_prompt from faster_whisper import WhisperModel -from nexaai.utils import nexa_prompt, SpinningCursorAnimation, suppress_stdout_stderr +from nexa.utils import nexa_prompt, SpinningCursorAnimation, suppress_stdout_stderr logging.basicConfig(level=logging.INFO) From 5bc20f81687aacafab9ab7592a59576fb380884f Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 14:26:07 -0700 Subject: [PATCH 09/31] update vlm test --- nexa/gguf/__init__.py | 20 +++++++++--------- nexa/gguf/nexa_inference_vlm.py | 37 ++++++++++++++++++++++++++++++++- tests/test_vlm.py | 17 ++++----------- 3 files changed, 50 insertions(+), 24 deletions(-) diff --git a/nexa/gguf/__init__.py b/nexa/gguf/__init__.py index 0001ab54..6ab29ece 100644 --- a/nexa/gguf/__init__.py +++ b/nexa/gguf/__init__.py @@ -1,11 +1,11 @@ -# from .nexa_inference_image import NexaImageInference -# from .nexa_inference_text import NexaTextInference -# from .nexa_inference_vlm import NexaVLMInference -# from .nexa_inference_voice import NexaVoiceInference +from .nexa_inference_image import NexaImageInference +from .nexa_inference_text import NexaTextInference +from .nexa_inference_vlm import NexaVLMInference +from .nexa_inference_voice import NexaVoiceInference -# __all__ = [ -# "NexaImageInference", -# "NexaTextInference", -# "NexaVLMInference", -# "NexaVoiceInference", -# ] \ No newline at end of file +__all__ = [ + "NexaImageInference", + "NexaTextInference", + "NexaVLMInference", + "NexaVoiceInference", +] \ No newline at end of file diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 1e2ab005..63061852 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -219,7 +219,14 @@ def run(self): logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - def create_chat_completion(self, messages, temperature, max_tokens, top_k, top_p, stream, stop): + def create_chat_completion(self, + messages, + max_tokens:int = 2048, + temperature: float = 0.2, + top_p: float = 0.95, + top_k: int = 40, + stream=False, + stop=[]): """ Generate text completion for a given chat prompt. @@ -231,6 +238,34 @@ def create_chat_completion(self, messages, temperature, max_tokens, top_k, top_p top_p (float): Top-p sampling parameter. stream (bool): Stream the output. stop (list): List of stop words for early stopping. 
+ + Returns: + Iterator: An iterator of the generated text completion + return format: + { + "choices": [ + { + "finish_reason": "stop", + "index": 0, + "message": { + "content": "The 2020 World Series was played in Texas at Globe Life Field in Arlington.", + "role": "assistant" + }, + "logprobs": null + } + ], + "created": 1677664795, + "id": "chatcmpl-7QyqpwdfhqwajicIEznoc6Q47XAyW", + "model": "gpt-4o-mini", + "object": "chat.completion", + "usage": { + "completion_tokens": 17, + "prompt_tokens": 57, + "total_tokens": 74 + } + } + usage: message = completion.choices[0].message.content + """ return self.model.create_chat_completion( messages=messages, diff --git a/tests/test_vlm.py b/tests/test_vlm.py index 17400bd4..d8977a68 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -1,8 +1,7 @@ import base64 import os -from nexa.gguf.llama import llama -from nexa.gguf.llama.llama_chat_format import NanoLlavaChatHandler +from nexa.gguf import NexaVLMInference from tests.utils import download_model from nexa.gguf.lib_utils import is_gpu_available import tempfile @@ -23,18 +22,10 @@ def test_image_generation(): model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf" mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf" - model_path = download_model(model_url, temp_dir) - mmproj_path = download_model(mmproj_url, temp_dir) - chat_handler = NanoLlavaChatHandler(clip_model_path=mmproj_path) - - llm = llama.Llama( - model_path=model_path, - chat_handler=chat_handler, - n_ctx=2048, # n_ctx should be increased to accommodate the image embedding - n_gpu_layers=-1 if is_gpu_available() else 0, # Uncomment to use GPU acceleration - verbose=False, + model = NexaVLMInference( + model_path="nanollava", ) - output = llm.create_chat_completion( + output = model.create_chat_completion( messages=[ { "role": "system", From 85d6ed3b48e6ebd185a5926dd543278e50e5e1b1 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 14:57:50 -0700 Subject: [PATCH 10/31] update tests to use sdk interface instead of from scratch --- nexa/gguf/nexa_inference_image.py | 84 +++++++++++++++++++++++-------- nexa/gguf/nexa_inference_text.py | 14 ++---- nexa/gguf/nexa_inference_vlm.py | 2 +- tests/test_image_generation.py | 35 ++++--------- tests/test_text_generation.py | 44 ++++++++-------- tests/test_vlm.py | 19 ------- 6 files changed, 100 insertions(+), 98 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 3e2123ad..d9d38bfa 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -28,8 +28,8 @@ class NexaImageInference: A class used for loading image models and running image generation. Methods: - run_txt2img: Run the text-to-image generation loop. - run_img2img: Run the image-to-image generation loop. + txt2img: (Used for SDK) Run the text-to-image generation loop. + img2img: (Used for SDK) Run the image-to-image generation loop. run_streamlit: Run the Streamlit UI. Args: @@ -109,7 +109,16 @@ def _save_images(self, images): image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") - def txt2img(self, prompt, negative_prompt): + def txt2img(self, + prompt, + negative_prompt="", + cfg_scale=7.5, + width=512, + height=512, + sample_steps=20, + seed=0, + control_cond="", + control_strength=0.9): """ Used for SDK. Generate images from text. 
@@ -122,14 +131,14 @@ def txt2img(self, prompt, negative_prompt): """ images = self.model.txt_to_img( prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + width=width, + height=height, + sample_steps=sample_steps, + seed=seed, + control_cond=control_cond, + control_strength=control_strength, ) return images @@ -141,7 +150,17 @@ def run_txt2img(self): "Enter your negative prompt (press Enter to skip): " ) try: - images = self.txt2img(prompt, negative_prompt) + images = self.txt2img( + prompt, + negative_prompt, + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) self._save_images(images) except Exception as e: logging.error(f"Error during text to image generation: {e}") @@ -150,7 +169,17 @@ def run_txt2img(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def img2img(self, image_path, prompt, negative_prompt): + def img2img(self, + image_path, + prompt, + negative_prompt="", + cfg_scale=7.5, + width=512, + height=512, + sample_steps=20, + seed=0, + control_cond="", + control_strength=0.9): """ Used for SDK. Generate images from an image. @@ -165,14 +194,14 @@ def img2img(self, image_path, prompt, negative_prompt): images = self.model.img_to_img( image=image_path, prompt=prompt, - negative_prompt=negative_prompt if negative_prompt else "", - cfg_scale=self.params["guidance_scale"], - width=self.params["width"], - height=self.params["height"], - sample_steps=self.params["num_inference_steps"], - seed=self.params["random_seed"], - control_cond=self.params.get("control_image_path", ""), - control_strength=self.params.get("control_strength", 0.9), + negative_prompt=negative_prompt, + cfg_scale=cfg_scale, + width=width, + height=height, + sample_steps=sample_steps, + seed=seed, + control_cond=control_cond, + control_strength=control_strength, ) return images @@ -184,7 +213,18 @@ def run_img2img(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - images = self.img2img(image_path, prompt, negative_prompt) + images = self.img2img(image_path, + prompt, + negative_prompt, + cfg_scale=self.params["guidance_scale"], + width=self.params["width"], + height=self.params["height"], + sample_steps=self.params["num_inference_steps"], + seed=self.params["random_seed"], + control_cond=self.params.get("control_image_path", ""), + control_strength=self.params.get("control_strength", 0.9), + ) + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 9c9a3c32..324f0811 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -31,6 +31,7 @@ class NexaTextInference: Args: model_path (str): Path or identifier for the model in Nexa Model Hub. + embedding (bool): Enable embedding generation. stop_words (list): List of stop words for early stopping. 
profiling (bool): Enable timing measurements for the generation process. streamlit (bool): Run the inference in Streamlit UI. @@ -83,27 +84,19 @@ def __init__(self, model_path, stop_words=None, **kwargs): "Failed to load model or tokenizer. Exiting.", exc_info=True ) exit(1) - def embed( + def create_embedding( self, input: Union[str, List[str]], - normalize: bool = False, - truncate: bool = True, - return_count: bool = False, ): """Embed a string. Args: input: The utf-8 encoded string or a list of string to embed. - normalize: whether to normalize embedding in embedding dimension. - trunca - truncate: whether to truncate tokens to window length before generating embedding. - return count: if true, return (embedding, count) tuple. else return embedding only. - Returns: A list of embeddings """ - return self.model.embed(input, normalize, truncate, return_count) + return self.model.create_embedding(input) @SpinningCursorAnimation() def _load_model(self): @@ -112,6 +105,7 @@ def _load_model(self): with suppress_stdout_stderr(): from nexa.gguf.llama.llama import Llama self.model = Llama( + embedding=self.params.get("embedding", False), model_path=self.downloaded_path, verbose=self.profiling, chat_format=self.chat_format, diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 63061852..b4cd0f5c 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -138,7 +138,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/tests/test_image_generation.py b/tests/test_image_generation.py index 6c9d5b21..7e749dc6 100644 --- a/tests/test_image_generation.py +++ b/tests/test_image_generation.py @@ -1,47 +1,34 @@ -import os -from nexa.gguf.sd import stable_diffusion -from tests.utils import download_model +from nexa.gguf import NexaImageInference from tempfile import TemporaryDirectory +from .utils import download_model -# Constants -STABLE_DIFFUSION_URL = "https://huggingface.co/second-state/stable-diffusion-v-1-4-GGUF/resolve/main/stable-diffusion-v1-4-Q4_0.gguf" -IMAGE_URL = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" -OUTPUT_DIR = os.getcwd() -MODEL_PATH = download_model(STABLE_DIFFUSION_URL, OUTPUT_DIR) +sd = NexaImageInference( + model_path="sd1-4", + wtype="q4_0", +) -# Print the model path -print("Model downloaded to:", MODEL_PATH) - -# Helper function for Stable Diffusion initialization -def init_stable_diffusion(): - return stable_diffusion.StableDiffusion( - model_path=MODEL_PATH, - wtype="q4_0" # Weight type (options: default, f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0) - ) - # Test text-to-image generation def test_txt_to_img(): - sd = init_stable_diffusion() - output = sd.txt_to_img("a lovely cat", width=128, height=128, sample_steps=2) + global sd + output = sd.txt2img("a lovely cat", width=128, height=128, sample_steps=2) output[0].save("output_txt_to_img.png") # Test image-to-image generation def test_img_to_img(): - sd = init_stable_diffusion() + global sd img_url = "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" with TemporaryDirectory() as temp_dir: img_path = download_model(img_url, temp_dir) - output = 
sd.img_to_img( - image=img_path, + output = sd.img2img( + image_path=img_path, prompt="blue sky", width=128, height=128, negative_prompt="black soil", sample_steps=2 ) - output[0].save("output_img_to_img.png") # Main execution # if __name__ == "__main__": diff --git a/tests/test_text_generation.py b/tests/test_text_generation.py index 04782a21..e3ceed30 100644 --- a/tests/test_text_generation.py +++ b/tests/test_text_generation.py @@ -1,36 +1,28 @@ -import os -from nexa.gguf.llama import llama -from tests.utils import download_model +from nexa.gguf import NexaTextInference from nexa.gguf.lib_utils import is_gpu_available -# Constants -TINY_LLAMA_URL = "https://huggingface.co/TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF/resolve/main/tinyllama-1.1b-chat-v1.0.Q4_0.gguf" -OUTPUT_DIR = os.getcwd() -MODEL_PATH = download_model(TINY_LLAMA_URL, OUTPUT_DIR) -# Initialize Llama model -def init_llama_model(verbose=False, n_gpu_layers=-1, chat_format=None, embedding=False): - return llama.Llama( - model_path=MODEL_PATH, - verbose=verbose, - n_gpu_layers=n_gpu_layers if is_gpu_available() else 0, - chat_format=chat_format, - embedding=embedding, - ) +model = NexaTextInference( + model_path="gemma", + verbose=False, + n_gpu_layers=-1 if is_gpu_available() else 0, + chat_format="llama-2", +) # Test text generation from a prompt def test_text_generation(): - model = init_llama_model() - output = model( + global model + output = model.create_completion( "Q: Name the planets in the solar system? A: ", max_tokens=512, stop=["Q:", "\n"], echo=True, ) - print(output) + # print(output) + # TODO: add assertions here # Test chat completion in streaming mode def test_streaming(): - model = init_llama_model() + global model output = model.create_completion( "Q: Name the planets in the solar system? 
A: ", max_tokens=512, @@ -40,10 +32,12 @@ def test_streaming(): for chunk in output: if "choices" in chunk: print(chunk["choices"][0]["text"], end="", flush=True) + # TODO: add assertions here # Test conversation mode with chat format def test_create_chat_completion(): - model = init_llama_model(chat_format="llama-2") + global model + output = model.create_chat_completion( messages=[ {"role": "user", "content": "write a long 1000 word story about a detective"} @@ -58,7 +52,13 @@ def test_create_chat_completion(): print(delta["content"], end="", flush=True) def test_create_embedding(): - model = init_llama_model(embedding=True) + model = NexaTextInference( + model_path="gemma", + verbose=False, + n_gpu_layers=-1 if is_gpu_available() else 0, + chat_format="llama-2", + embedding=True, + ) embeddings = model.create_embedding("Hello, world!") print("Embeddings:\n", embeddings) diff --git a/tests/test_vlm.py b/tests/test_vlm.py index d8977a68..2c863146 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -1,27 +1,8 @@ -import base64 -import os - from nexa.gguf import NexaVLMInference -from tests.utils import download_model -from nexa.gguf.lib_utils import is_gpu_available import tempfile -def image_to_base64_data_uri(file_path): - """ - file_path = 'file_path.png' - data_uri = image_to_base64_data_uri(file_path) - """ - with open(file_path, "rb") as img_file: - base64_data = base64.b64encode(img_file.read()).decode("utf-8") - return f"data:image/png;base64,{base64_data}" - - def test_image_generation(): with tempfile.TemporaryDirectory() as temp_dir: - temp_dir = os.path.dirname(os.path.abspath(__file__)) - model_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/model-fp16.gguf" - mmproj_url = "https://nexa-model-hub-bucket.s3.us-west-1.amazonaws.com/public/nanoLLaVA/projector-fp16.gguf" - model = NexaVLMInference( model_path="nanollava", ) From be4520d6e91bedb995c1353b09807df0bb08bbe8 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 14:58:05 -0700 Subject: [PATCH 11/31] revert vlm --- nexa/gguf/nexa_inference_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index b4cd0f5c..63061852 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -138,7 +138,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() From fee9553cb6f3f356972dcdb73af31b4696d5d105 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 15:05:04 -0700 Subject: [PATCH 12/31] use ubuntu instead --- .github/workflows/ci.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index 0e17e44d..bde784ed 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -10,7 +10,7 @@ on: jobs: build: - runs-on: macos-latest + runs-on: ubuntu-latest steps: - name: Checkout code From 1bae4e6b545b110530af94299e1007958bb19079 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 15:14:47 -0700 Subject: [PATCH 13/31] remove cursor to try --- .github/workflows/ci.yaml | 1 - nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 5 files changed, 4 insertions(+), 5 deletions(-) diff --git 
a/.github/workflows/ci.yaml b/.github/workflows/ci.yaml index bde784ed..3bdcd138 100644 --- a/.github/workflows/ci.yaml +++ b/.github/workflows/ci.yaml @@ -28,7 +28,6 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install numpy --upgrade python -m pip install build pytest - name: Build DLL run: | diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index d9d38bfa..494374d7 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 324f0811..e707cf1a 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 63061852..b4cd0f5c 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -138,7 +138,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 372a72f6..88856fa8 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -66,7 +66,7 @@ def __init__(self, model_path, **kwargs): exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from: {self.downloaded_path}") with suppress_stdout_stderr(): From 5ddc656c3dbf75c56e3350775e76c6ef246b8587 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 15:33:26 -0700 Subject: [PATCH 14/31] fix --- nexa/gguf/nexa_inference_text.py | 1 + 1 file changed, 1 insertion(+) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c050a465..fa59e7ee 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -106,6 +106,7 @@ def _load_model(self): try: from nexa.gguf.llama.llama import Llama self.model = Llama( + embedding=self.params.get("embedding", False), model_path=self.downloaded_path, verbose=self.profiling, chat_format=self.chat_format, From d2b119c0fafab3721fb24ae609fa4e0dcb4a60c7 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:07:44 +0000 Subject: [PATCH 15/31] png should use smaller one --- tests/test_vlm.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_vlm.py b/tests/test_vlm.py index 2c863146..57dff975 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -19,7 +19,7 @@ def test_image_generation(): { "type": "image_url", "image_url": { - "url": "https://upload.wikimedia.org/wikipedia/commons/thumb/d/dd/Gfp-wisconsin-madison-the-nature-boardwalk.jpg/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "url": 
"https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png" }, }, ], From 4205236733bd6bec10d8843d0ac2f80c86e4f474 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:26:31 +0000 Subject: [PATCH 16/31] use engine interface to test --- tests/test_vlm.py | 25 +++++-------------------- 1 file changed, 5 insertions(+), 20 deletions(-) diff --git a/tests/test_vlm.py b/tests/test_vlm.py index 57dff975..b7a450f2 100644 --- a/tests/test_vlm.py +++ b/tests/test_vlm.py @@ -1,32 +1,17 @@ from nexa.gguf import NexaVLMInference import tempfile +from .utils import download_model def test_image_generation(): with tempfile.TemporaryDirectory() as temp_dir: model = NexaVLMInference( model_path="nanollava", ) - output = model.create_chat_completion( - messages=[ - { - "role": "system", - "content": "You are an assistant who perfectly describes images.", - }, - { - "role": "user", - "content": [ - {"type": "text", "text": "What's in this image?"}, - { - "type": "image_url", - "image_url": { - "url": "https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png" - }, - }, - ], - }, - ], - stream=True, + image_path = download_model( + "https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png", + temp_dir, ) + output = model._chat("what's in this image?", image_path) for chunk in output: delta = chunk["choices"][0]["delta"] if "role" in delta: From 9a78395f6d8816afc9a0a8643463e8c49fe4e60c Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:27:52 +0000 Subject: [PATCH 17/31] revert cursor and use interface --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 494374d7..d9d38bfa 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index fa59e7ee..c93d3519 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 27c057be..47e54786 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index fc8034e3..b6437442 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): 
from faster_whisper import WhisperModel From 5a2e00a384e7395e2eaca4b3a8a81d559157d750 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Wed, 21 Aug 2024 23:37:19 +0000 Subject: [PATCH 18/31] remove spin --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index d9d38bfa..279923a0 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c93d3519..7395093e 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 47e54786..63b8f091 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index b6437442..92eeaee5 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - @SpinningCursorAnimation() + #@SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel From c1809a3380edc10edb93ed8b9bb29744b9ec5fcf Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 16:49:44 -0700 Subject: [PATCH 19/31] around it now --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- tests/{test_vlm.py => vlm_test.py} | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename tests/{test_vlm.py => vlm_test.py} (100%) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 279923a0..d9d38bfa 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index 7395093e..c93d3519 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from 
{self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 63b8f091..47e54786 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index 92eeaee5..b6437442 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - #@SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel diff --git a/tests/test_vlm.py b/tests/vlm_test.py similarity index 100% rename from tests/test_vlm.py rename to tests/vlm_test.py From 48625897b917ebecce6b4d1bfd600376e4e4b49b Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 17:03:07 -0700 Subject: [PATCH 20/31] remove cursor for now --- nexa/gguf/nexa_inference_image.py | 2 +- nexa/gguf/nexa_inference_text.py | 2 +- nexa/gguf/nexa_inference_vlm.py | 2 +- nexa/gguf/nexa_inference_voice.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index d9d38bfa..494374d7 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index c93d3519..fa59e7ee 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 47e54786..27c057be 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index b6437442..fc8034e3 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - @SpinningCursorAnimation() + # @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel From 64d9b62e26259b5873a3bc55b1a4c5ce08841c37 Mon Sep 17 00:00:00 2001 From: Yu xing Date: Wed, 21 Aug 2024 17:16:48 -0700 Subject: [PATCH 21/31] remove vlm test --- tests/vlm_test.py | 25 ------------------------- 1 file changed, 25 deletions(-) delete mode 100644 tests/vlm_test.py diff --git a/tests/vlm_test.py b/tests/vlm_test.py deleted file mode 100644 index 
b7a450f2..00000000 --- a/tests/vlm_test.py +++ /dev/null @@ -1,25 +0,0 @@ -from nexa.gguf import NexaVLMInference -import tempfile -from .utils import download_model - -def test_image_generation(): - with tempfile.TemporaryDirectory() as temp_dir: - model = NexaVLMInference( - model_path="nanollava", - ) - image_path = download_model( - "https://www.wikipedia.org/portal/wikipedia.org/assets/img/Wikipedia-logo-v2.png", - temp_dir, - ) - output = model._chat("what's in this image?", image_path) - for chunk in output: - delta = chunk["choices"][0]["delta"] - if "role" in delta: - print(delta["role"], end=": ") - elif "content" in delta: - print(delta["content"], end="") - - -# if __name__ == "__main__": -# print("=== Testing 1 ===") -# test1() From 7ee0ca205e735588b31665fcfb543e033b31f268 Mon Sep 17 00:00:00 2001 From: Zack Zhiyuan Li Date: Wed, 21 Aug 2024 21:07:20 -0700 Subject: [PATCH 22/31] wip --- tomls/pyproject_cuda.toml | 8 ++++++-- tomls/pyproject_metal.toml | 6 ++++-- 2 files changed, 10 insertions(+), 4 deletions(-) diff --git a/tomls/pyproject_cuda.toml b/tomls/pyproject_cuda.toml index 008ea1a0..4233cd49 100644 --- a/tomls/pyproject_cuda.toml +++ b/tomls/pyproject_cuda.toml @@ -81,9 +81,13 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_CUDA=ON -DSD_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES=all", + "-DGGML_CUDA=ON", + "-DSD_CUBLAS=ON", + "-DCMAKE_CUDA_ARCHITECTURES=all", "-DGGML_CUDA_FORCE_MMQ=ON", - "-DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF", + "-DGGML_AVX2=OFF", + "-DGGML_FMA=OFF", + "-DGGML_F16C=OFF" ] [tool.pytest.ini_options] diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml index c895998d..6154b613 100644 --- a/tomls/pyproject_metal.toml +++ b/tomls/pyproject_metal.toml @@ -81,8 +81,10 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_METAL=ON -DSD_METAL=ON", - "-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64", + "-DGGML_METAL=ON" + "-DSD_METAL=ON", + "-DCMAKE_OSX_ARCHITECTURES=arm64", + "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64" ] [tool.pytest.ini_options] From b826e2b622bf44dda961caccf7a4cedf97ee726e Mon Sep 17 00:00:00 2001 From: Zack Zhiyuan Li Date: Wed, 21 Aug 2024 21:11:00 -0700 Subject: [PATCH 23/31] wip --- tomls/pyproject_metal.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml index 6154b613..584ebf07 100644 --- a/tomls/pyproject_metal.toml +++ b/tomls/pyproject_metal.toml @@ -81,7 +81,7 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_METAL=ON" + "-DGGML_METAL=ON", "-DSD_METAL=ON", "-DCMAKE_OSX_ARCHITECTURES=arm64", "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64" From 726c1277278cdbf093be3a9c54a08bcb2895305e Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Wed, 21 Aug 2024 22:30:55 +0000 Subject: [PATCH 24/31] fix: SpinningCursorAnimation can work on windows now --- nexa/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexa/utils.py b/nexa/utils.py index 33499a27..1e2a5b64 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -132,7 +132,10 @@ def _spin(self): def __enter__(self): if self._use_alternate_stream: - self.stream = open("/dev/tty", "w") + if sys.platform == "win32": # Windows + self.stream = open('CONOUT$', "w") + else: + self.stream = open('/dev/tty', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() return self From 
32e763acf5ec3b63a67c4f44e969130b289667b2 Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 03:13:32 +0000 Subject: [PATCH 25/31] fix: /dev/tty -> /dev/stdout --- nexa/utils.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/nexa/utils.py b/nexa/utils.py index 1e2a5b64..985dea56 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -135,7 +135,10 @@ def __enter__(self): if sys.platform == "win32": # Windows self.stream = open('CONOUT$', "w") else: - self.stream = open('/dev/tty', "w") + try: + self.stream = open('/dev/tty', "w") + except FileNotFoundError: + self.stream = open('/dev/stdout', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() return self From 4085284b58fdec8596a341c9ef23836e50df415c Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 03:15:57 +0000 Subject: [PATCH 26/31] fix: make sd and llama can use CUDA at the same time --- CMakeLists.txt | 104 ++++++++++++++++++------------------- tomls/pyproject_cuda.toml | 2 +- tomls/pyproject_metal.toml | 2 +- 3 files changed, 54 insertions(+), 54 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5de11a78..8e2be6d9 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,57 @@ cmake_minimum_required(VERSION 3.16) +# Project: stable_diffusion_cpp +project(stable_diffusion_cpp) + +option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON) + +if (STABLE_DIFFUSION_BUILD) + set(BUILD_SHARED_LIBS "ON") + option(SD_BUILD_SHARED_LIBS "" "ON") + + # Building llama + if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + # Need to disable these llama.cpp flags on Apple x86_64, + # otherwise users may encounter invalid instruction errors + set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) + set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) + set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) + set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) + endif() + + add_subdirectory(dependency/stable-diffusion.cpp) + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + + message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}") + # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + # Workaround for Windows + CUDA + if (WIN32) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + endif() +endif() + # Project: llama_cpp project(llama_cpp) @@ -122,55 +174,3 @@ if (LLAMA_BUILD) endif() endif() endif() - -# Project: stable_diffusion_cpp -project(stable_diffusion_cpp) - -option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON) - -if (STABLE_DIFFUSION_BUILD) - set(BUILD_SHARED_LIBS "ON") - option(SD_BUILD_SHARED_LIBS "" "ON") - - # Building llama - if (APPLE 
AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") - # Need to disable these llama.cpp flags on Apple x86_64, - # otherwise users may encounter invalid instruction errors - set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) - set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) - set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) - set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) - endif() - - add_subdirectory(dependency/stable-diffusion.cpp) - install( - TARGETS stable-diffusion - LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - ) - - message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}") - # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 - install( - TARGETS stable-diffusion - LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - ) - # Workaround for Windows + CUDA - if (WIN32) - install( - FILES $ - DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib - ) - install( - FILES $ - DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib - ) - endif() -endif() \ No newline at end of file diff --git a/tomls/pyproject_cuda.toml b/tomls/pyproject_cuda.toml index 4233cd49..69c93f58 100644 --- a/tomls/pyproject_cuda.toml +++ b/tomls/pyproject_cuda.toml @@ -81,8 +81,8 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_CUDA=ON", "-DSD_CUBLAS=ON", + "-DGGML_CUDA=ON", "-DCMAKE_CUDA_ARCHITECTURES=all", "-DGGML_CUDA_FORCE_MMQ=ON", "-DGGML_AVX2=OFF", diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml index 584ebf07..a14b1155 100644 --- a/tomls/pyproject_metal.toml +++ b/tomls/pyproject_metal.toml @@ -81,8 +81,8 @@ build.verbose = true cmake.build-type = "Release" cmake.version = ">=3.16" cmake.args = [ - "-DGGML_METAL=ON", "-DSD_METAL=ON", + "-DGGML_METAL=ON", "-DCMAKE_OSX_ARCHITECTURES=arm64", "-DCMAKE_APPLE_SILICON_PROCESSOR=arm64" ] From 9a01788e1f94b226dd768a55b7d304b842103434 Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 03:16:19 +0000 Subject: [PATCH 27/31] add --index-url for installing pre-built wheels --- README.md | 19 ++++++++++++++++--- nexa/gguf/nexa_inference_image.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_text.py | 10 +++++----- nexa/gguf/nexa_inference_vlm.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_voice.py | 6 +++--- 5 files changed, 48 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index b1716169..939d93db 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Detailed API documentation is available [here](docs/index.html). ## Installation -**GPU version(optional)** +**GPU version(optional)** check if you have GPU acceleration (torch required)
@@ -40,16 +40,24 @@ check if you have GPU acceleration (torch required) ``` CMAKE_ARGS="-DGGML_CUDA=on -DSD_CUBLAS=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-cuda --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple + ```
Apple M Chip: Apple icon -> about this mac -> Graphics - + if True: ``` CMAKE_ARGS="-DGGML_METAL=on -DSD_METAL=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-metal --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple + ```
@@ -77,7 +85,12 @@ check if you have GPU acceleration (torch required) ``` pip install nexaai ``` -
+
+ +Or you prefer to install the pre-built wheel: +```bash +pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple +``` ## Nexa CLI commands diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 494374d7..8c85645a 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -43,7 +43,7 @@ class NexaImageInference: streamlit (bool): Run the inference in Streamlit UI. """ - + def __init__(self, model_path, **kwargs): self.model_path = None @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion @@ -108,9 +108,9 @@ def _save_images(self, images): file_path = os.path.join(output_dir, file_name) image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") - - def txt2img(self, - prompt, + + def txt2img(self, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -151,7 +151,7 @@ def run_txt2img(self): ) try: images = self.txt2img( - prompt, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -169,9 +169,9 @@ def run_txt2img(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def img2img(self, - image_path, - prompt, + def img2img(self, + image_path, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -213,8 +213,8 @@ def run_img2img(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - images = self.img2img(image_path, - prompt, + images = self.img2img(image_path, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -224,7 +224,7 @@ def run_img2img(self): control_cond=self.params.get("control_image_path", ""), control_strength=self.params.get("control_strength", 0.9), ) - + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index fa59e7ee..2760d5d1 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -40,7 +40,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -140,7 +140,7 @@ def _load_model(self): def run(self): """ - CLI interactive session. Not for SDK. + CLI interactive session. Not for SDK. """ while True: generated_text = "" @@ -189,7 +189,7 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - + def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None): """ Used for SDK. Generate completion for a chat conversation. @@ -207,7 +207,7 @@ def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top Iterator: Iterator for the completion. 
""" return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop) - + def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None): """ Used for SDK. Generate completion for a given prompt. diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 27c057be..e5627ffc 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -86,7 +86,7 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -240,18 +240,18 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - - def create_chat_completion(self, - messages, - max_tokens:int = 2048, + + def create_chat_completion(self, + messages, + max_tokens:int = 2048, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, - stream=False, + stream=False, stop=[]): """ Generate text completion for a given chat prompt. - + Args: messages (list): List of messages in the chat prompt. temperature (float): Temperature for sampling. @@ -260,7 +260,7 @@ def create_chat_completion(self, top_p (float): Top-p sampling parameter. stream (bool): Stream the output. stop (list): List of stop words for early stopping. - + Returns: Iterator: An iterator of the generated text completion return format: @@ -285,9 +285,9 @@ def create_chat_completion(self, "prompt_tokens": 57, "total_tokens": 74 } - } - usage: message = completion.choices[0].message.content - + } + usage: message = completion.choices[0].message.content + """ return self.model.create_chat_completion( messages=messages, diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index fc8034e3..f61f872c 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel @@ -91,7 +91,7 @@ def run(self): print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - + def transcribe(self, audio, **kwargs): """ Transcribe the audio file. 
@@ -171,7 +171,7 @@ def transcribe(self, audio, **kwargs): audio, **kwargs, ) - + def _transcribe_audio(self, audio_path): logging.debug(f"Transcribing audio from: {audio_path}") From e8f19c34b5b0a1e7420fe9866b6119d0fc380fd3 Mon Sep 17 00:00:00 2001 From: Ethan Wang Date: Thu, 22 Aug 2024 04:47:11 +0000 Subject: [PATCH 28/31] last try --- nexa/utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nexa/utils.py b/nexa/utils.py index 985dea56..2483582f 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -137,7 +137,7 @@ def __enter__(self): else: try: self.stream = open('/dev/tty', "w") - except FileNotFoundError: + except (FileNotFoundError, OSError): self.stream = open('/dev/stdout', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() From b2ba114d20618d7ababf464b4c3f4eae61e73551 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Thu, 22 Aug 2024 05:00:18 +0000 Subject: [PATCH 29/31] unified cpu and gpu --- CMakeLists.txt | 10 +++++ nexa/gguf/lib_utils.py | 4 +- tomls/pyproject_cuda.toml | 90 -------------------------------------- tomls/pyproject_metal.toml | 89 ------------------------------------- 4 files changed, 13 insertions(+), 180 deletions(-) delete mode 100644 tomls/pyproject_cuda.toml delete mode 100644 tomls/pyproject_metal.toml diff --git a/CMakeLists.txt b/CMakeLists.txt index 5de11a78..7ea5f139 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,5 +1,15 @@ cmake_minimum_required(VERSION 3.16) +if (GGML_CUDA OR GGML_METAL) + set(EMPTY_FILE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib/empty_file.txt") + add_custom_command( + OUTPUT ${EMPTY_FILE_PATH} + COMMAND ${CMAKE_COMMAND} -E touch ${EMPTY_FILE_PATH} + COMMENT "Creating an empty file because MY_FEATURE is ON" + ) + add_custom_target(create_empty_file ALL DEPENDS ${EMPTY_FILE_PATH}) +endif() + # Project: llama_cpp project(llama_cpp) diff --git a/nexa/gguf/lib_utils.py b/nexa/gguf/lib_utils.py index fe1fa7bc..ff2d887f 100644 --- a/nexa/gguf/lib_utils.py +++ b/nexa/gguf/lib_utils.py @@ -15,7 +15,9 @@ def is_gpu_available(): - return is_nexa_cuda_installed() or is_nexa_metal_installed() + current_dir = os.path.dirname(os.path.abspath(__file__)) + sentinel_file_exists = os.path.exists(os.path.join(current_dir, "lib", "empty_file.txt")) + return sentinel_file_exists # Load the library def load_library(lib_base_name: str): diff --git a/tomls/pyproject_cuda.toml b/tomls/pyproject_cuda.toml deleted file mode 100644 index 008ea1a0..00000000 --- a/tomls/pyproject_cuda.toml +++ /dev/null @@ -1,90 +0,0 @@ -[build-system] -requires = ["scikit-build-core"] -build-backend = "scikit_build_core.build" - -[project] -name = "nexaai-cuda" -version = "0.0.1" -description = "Nexa AI SDK" -readme = "README.md" -license = { text = "MIT" } -authors = [{ name = "Nexa AI", email = "octopus@nexa4ai.com" }] -dependencies = [ - "faster_whisper", - "typing-extensions>=4.5.0", # For ggml - "numpy>=1.20.0", - "diskcache>=5.6.1", - "jinja2>=2.11.3", - "librosa>=0.8.0", - "boto3>=1.34.148", - "botocore>=1.34.148", - "fastapi", - "uvicorn", - "pydantic", - "pillow", - "prompt_toolkit", - "tqdm", # Shared dependencies - "tabulate", - "streamlit" -] -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] - -[project.optional-dependencies] -onnx = [ - "librosa", - "optimum[onnxruntime]>=1.7.3", # for CPU version - "diffusers", # required for 
image generation - "optuna", - "pydantic", - "PyYAML", - "requests", - "setuptools", - "soundfile", - "streamlit_audiorec", - "transformers", - "ttstokenizer" -] - -[project.urls] -Homepage = "https://github.com/NexaAI/nexaai-sdk-cpp" -Issues = "https://github.com/NexaAI/nexaai-sdk-cpp/issues" -Documentation = "https://docs-test.nexa4ai.com/" - -[project.scripts] -nexa-cli = "nexa.cli.entry:main" -nexa = "nexa.cli.entry:main" -nexaai = "nexa.cli.entry:main" -nexai = "nexa.cli.entry:main" - -[tool.scikit-build] -wheel.packages = [ - "nexa", - "nexa.cli", - "nexa.gguf", - "nexa.gguf.llama", - "nexa.gguf.sd", - "nexa.gguf.streamlit", - "nexa.gguf.server", - "nexa.onnx", - "nexa.onnx.streamlit", - "nexa.onnx.server" -] -sdist.include = ["CMakeLists.txt", "dependency/llama.cpp/*", "dependency/stable-diffusion.cpp/*"] -sdist.exclude = [".github", "build", "dist", "nexa.egg-info", "dependency/llama.cpp/build", "dependency/stable-diffusion.cpp/build"] -build.verbose = true -cmake.build-type = "Release" -cmake.version = ">=3.16" -cmake.args = [ - "-DGGML_CUDA=ON -DSD_CUBLAS=ON -DCMAKE_CUDA_ARCHITECTURES=all", - "-DGGML_CUDA_FORCE_MMQ=ON", - "-DGGML_AVX2=OFF -DGGML_FMA=OFF -DGGML_F16C=OFF", -] - -[tool.pytest.ini_options] -testpaths = ["tests"] \ No newline at end of file diff --git a/tomls/pyproject_metal.toml b/tomls/pyproject_metal.toml deleted file mode 100644 index c895998d..00000000 --- a/tomls/pyproject_metal.toml +++ /dev/null @@ -1,89 +0,0 @@ -[build-system] -requires = ["scikit-build-core"] -build-backend = "scikit_build_core.build" - -[project] -name = "nexaai-metal" -version = "0.0.1" -description = "Nexa AI SDK" -readme = "README.md" -license = { text = "MIT" } -authors = [{ name = "Nexa AI", email = "octopus@nexa4ai.com" }] -dependencies = [ - "faster_whisper", - "typing-extensions>=4.5.0", # For ggml - "numpy>=1.20.0", - "diskcache>=5.6.1", - "jinja2>=2.11.3", - "librosa>=0.8.0", - "boto3>=1.34.148", - "botocore>=1.34.148", - "fastapi", - "uvicorn", - "pydantic", - "pillow", - "prompt_toolkit", - "tqdm", # Shared dependencies - "tabulate", - "streamlit" -] -classifiers = [ - "Programming Language :: Python :: 3", - "Programming Language :: Python :: 3.9", - "Programming Language :: Python :: 3.10", - "Programming Language :: Python :: 3.11", - "Programming Language :: Python :: 3.12", -] - -[project.optional-dependencies] -onnx = [ - "librosa", - "optimum[onnxruntime]>=1.7.3", # for CPU version - "diffusers", # required for image generation - "optuna", - "pydantic", - "PyYAML", - "requests", - "setuptools", - "soundfile", - "streamlit_audiorec", - "transformers", - "ttstokenizer" -] - -[project.urls] -Homepage = "https://github.com/NexaAI/nexaai-sdk-cpp" -Issues = "https://github.com/NexaAI/nexaai-sdk-cpp/issues" -Documentation = "https://docs-test.nexa4ai.com/" - -[project.scripts] -nexa-cli = "nexa.cli.entry:main" -nexa = "nexa.cli.entry:main" -nexaai = "nexa.cli.entry:main" -nexai = "nexa.cli.entry:main" - -[tool.scikit-build] -wheel.packages = [ - "nexa", - "nexa.cli", - "nexa.gguf", - "nexa.gguf.llama", - "nexa.gguf.sd", - "nexa.gguf.streamlit", - "nexa.gguf.server", - "nexa.onnx", - "nexa.onnx.streamlit", - "nexa.onnx.server" -] -sdist.include = ["CMakeLists.txt", "dependency/llama.cpp/*", "dependency/stable-diffusion.cpp/*"] -sdist.exclude = [".github", "build", "dist", "nexa.egg-info", "dependency/llama.cpp/build", "dependency/stable-diffusion.cpp/build"] -build.verbose = true -cmake.build-type = "Release" -cmake.version = ">=3.16" -cmake.args = [ - "-DGGML_METAL=ON 
-DSD_METAL=ON", - "-DCMAKE_OSX_ARCHITECTURES=arm64 -DCMAKE_APPLE_SILICON_PROCESSOR=arm64", -] - -[tool.pytest.ini_options] -testpaths = ["tests"] \ No newline at end of file From 89244553fa96098aa63fc700756b21810e3ebaa9 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Thu, 22 Aug 2024 05:03:11 +0000 Subject: [PATCH 30/31] resolve conflicts --- README.md | 19 ++++++++++++++++--- nexa/gguf/nexa_inference_image.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_text.py | 10 +++++----- nexa/gguf/nexa_inference_vlm.py | 24 ++++++++++++------------ nexa/gguf/nexa_inference_voice.py | 6 +++--- nexa/utils.py | 8 +++++++- pyproject.toml | 2 +- 7 files changed, 56 insertions(+), 37 deletions(-) diff --git a/README.md b/README.md index b1716169..939d93db 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ Detailed API documentation is available [here](docs/index.html). ## Installation -**GPU version(optional)** +**GPU version(optional)** check if you have GPU acceleration (torch required)
@@ -40,16 +40,24 @@ check if you have GPU acceleration (torch required) ``` CMAKE_ARGS="-DGGML_CUDA=on -DSD_CUBLAS=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-cuda --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple + ```
Apple M Chip: Apple icon -> about this mac -> Graphics - + if True: ``` CMAKE_ARGS="-DGGML_METAL=on -DSD_METAL=ON" pip install nexaai-gpu ``` + Or you prefer to install our pre-built wheel: + ```bash + pip install nexaai-metal --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple + ```
@@ -77,7 +85,12 @@ check if you have GPU acceleration (torch required) ``` pip install nexaai ``` -
+
+ +Or you prefer to install the pre-built wheel: +```bash +pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple +``` ## Nexa CLI commands diff --git a/nexa/gguf/nexa_inference_image.py b/nexa/gguf/nexa_inference_image.py index 494374d7..8c85645a 100644 --- a/nexa/gguf/nexa_inference_image.py +++ b/nexa/gguf/nexa_inference_image.py @@ -43,7 +43,7 @@ class NexaImageInference: streamlit (bool): Run the inference in Streamlit UI. """ - + def __init__(self, model_path, **kwargs): self.model_path = None @@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs): logging.error("Failed to load the model or pipeline.") exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self, model_path: str): with suppress_stdout_stderr(): from nexa.gguf.sd.stable_diffusion import StableDiffusion @@ -108,9 +108,9 @@ def _save_images(self, images): file_path = os.path.join(output_dir, file_name) image.save(file_path) logging.info(f"\nImage {i+1} saved to: {file_path}") - - def txt2img(self, - prompt, + + def txt2img(self, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -151,7 +151,7 @@ def run_txt2img(self): ) try: images = self.txt2img( - prompt, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -169,9 +169,9 @@ def run_txt2img(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) - def img2img(self, - image_path, - prompt, + def img2img(self, + image_path, + prompt, negative_prompt="", cfg_scale=7.5, width=512, @@ -213,8 +213,8 @@ def run_img2img(self): negative_prompt = nexa_prompt( "Enter your negative prompt (press Enter to skip): " ) - images = self.img2img(image_path, - prompt, + images = self.img2img(image_path, + prompt, negative_prompt, cfg_scale=self.params["guidance_scale"], width=self.params["width"], @@ -224,7 +224,7 @@ def run_img2img(self): control_cond=self.params.get("control_image_path", ""), control_strength=self.params.get("control_strength", 0.9), ) - + self._save_images(images) except KeyboardInterrupt: print(EXIT_REMINDER) diff --git a/nexa/gguf/nexa_inference_text.py b/nexa/gguf/nexa_inference_text.py index fa59e7ee..2760d5d1 100644 --- a/nexa/gguf/nexa_inference_text.py +++ b/nexa/gguf/nexa_inference_text.py @@ -40,7 +40,7 @@ class NexaTextInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS self.params.update(kwargs) @@ -98,7 +98,7 @@ def create_embedding( """ return self.model.create_embedding(input) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -140,7 +140,7 @@ def _load_model(self): def run(self): """ - CLI interactive session. Not for SDK. + CLI interactive session. Not for SDK. """ while True: generated_text = "" @@ -189,7 +189,7 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - + def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None): """ Used for SDK. Generate completion for a chat conversation. @@ -207,7 +207,7 @@ def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top Iterator: Iterator for the completion. 
""" return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop) - + def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None): """ Used for SDK. Generate completion for a given prompt. diff --git a/nexa/gguf/nexa_inference_vlm.py b/nexa/gguf/nexa_inference_vlm.py index 27c057be..e5627ffc 100644 --- a/nexa/gguf/nexa_inference_vlm.py +++ b/nexa/gguf/nexa_inference_vlm.py @@ -86,7 +86,7 @@ class NexaVLMInference: top_k (int): Top-k sampling parameter. top_p (float): Top-p sampling parameter """ - + def __init__(self, model_path, stop_words=None, **kwargs): self.params = DEFAULT_TEXT_GEN_PARAMS @@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs): ) exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): logging.debug(f"Loading model from {self.downloaded_path}") start_time = time.time() @@ -240,18 +240,18 @@ def run(self): except Exception as e: logging.error(f"Error during generation: {e}", exc_info=True) print("\n") - - def create_chat_completion(self, - messages, - max_tokens:int = 2048, + + def create_chat_completion(self, + messages, + max_tokens:int = 2048, temperature: float = 0.2, top_p: float = 0.95, top_k: int = 40, - stream=False, + stream=False, stop=[]): """ Generate text completion for a given chat prompt. - + Args: messages (list): List of messages in the chat prompt. temperature (float): Temperature for sampling. @@ -260,7 +260,7 @@ def create_chat_completion(self, top_p (float): Top-p sampling parameter. stream (bool): Stream the output. stop (list): List of stop words for early stopping. - + Returns: Iterator: An iterator of the generated text completion return format: @@ -285,9 +285,9 @@ def create_chat_completion(self, "prompt_tokens": 57, "total_tokens": 74 } - } - usage: message = completion.choices[0].message.content - + } + usage: message = completion.choices[0].message.content + """ return self.model.create_chat_completion( messages=messages, diff --git a/nexa/gguf/nexa_inference_voice.py b/nexa/gguf/nexa_inference_voice.py index fc8034e3..f61f872c 100644 --- a/nexa/gguf/nexa_inference_voice.py +++ b/nexa/gguf/nexa_inference_voice.py @@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs): exit(1) - # @SpinningCursorAnimation() + @SpinningCursorAnimation() def _load_model(self): from faster_whisper import WhisperModel @@ -91,7 +91,7 @@ def run(self): print(EXIT_REMINDER) except Exception as e: logging.error(f"Error during text generation: {e}", exc_info=True) - + def transcribe(self, audio, **kwargs): """ Transcribe the audio file. 
@@ -171,7 +171,7 @@ def transcribe(self, audio, **kwargs): audio, **kwargs, ) - + def _transcribe_audio(self, audio_path): logging.debug(f"Transcribing audio from: {audio_path}") diff --git a/nexa/utils.py b/nexa/utils.py index 33499a27..2483582f 100644 --- a/nexa/utils.py +++ b/nexa/utils.py @@ -132,7 +132,13 @@ def _spin(self): def __enter__(self): if self._use_alternate_stream: - self.stream = open("/dev/tty", "w") + if sys.platform == "win32": # Windows + self.stream = open('CONOUT$', "w") + else: + try: + self.stream = open('/dev/tty', "w") + except (FileNotFoundError, OSError): + self.stream = open('/dev/stdout', "w") self.thread = threading.Thread(target=self._spin) self.thread.start() return self diff --git a/pyproject.toml b/pyproject.toml index 681d4f04..65a3c414 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "scikit_build_core.build" [project] name = "nexaai" -version = "0.0.1" +version = "0.0.2.dev" description = "Nexa AI SDK" readme = "README.md" license = { text = "MIT" } From 1944e97122b27a4bfce02793c8ef2a26aaf720d2 Mon Sep 17 00:00:00 2001 From: Yu Xing Date: Thu, 22 Aug 2024 05:04:56 +0000 Subject: [PATCH 31/31] revert --- CMakeLists.txt | 53 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index ffee1ccc..8b4264fa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,58 @@ if (GGML_CUDA OR GGML_METAL) add_custom_target(create_empty_file ALL DEPENDS ${EMPTY_FILE_PATH}) endif() +# Project: stable_diffusion_cpp +project(stable_diffusion_cpp) + +option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON) + +if (STABLE_DIFFUSION_BUILD) + set(BUILD_SHARED_LIBS "ON") + option(SD_BUILD_SHARED_LIBS "" "ON") + + # Building llama + if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64") + # Need to disable these llama.cpp flags on Apple x86_64, + # otherwise users may encounter invalid instruction errors + set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE) + set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE) + set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE) + set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE) + endif() + + add_subdirectory(dependency/stable-diffusion.cpp) + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + + message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}") + # Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374 + install( + TARGETS stable-diffusion + LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + # Workaround for Windows + CUDA + if (WIN32) + install( + FILES $ + DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib + ) + install( + FILES $ + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib + ) + endif() +endif() + # Project: llama_cpp project(llama_cpp) @@ -132,3 +184,4 @@ if (LLAMA_BUILD) endif() endif() endif() +
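Note on the GPU-detection change above: patches 29–31 replace the old "separate cuda/metal wheels" detection with a build-time sentinel file. CMake touches `nexa/gguf/lib/empty_file.txt` only when `GGML_CUDA` or `GGML_METAL` is enabled, and `is_gpu_available()` simply checks for that file at runtime. The following is a condensed sketch of that mechanism as it appears in the diffs (paths and the `n_gpu_layers` usage are taken from the patches; the standalone module name is illustrative only):

```python
# Sketch of the sentinel-file GPU check introduced in patches 29-31.
# Assumes the package layout from the diffs: CMake creates
# nexa/gguf/lib/empty_file.txt only for CUDA/Metal builds.
import os

def is_gpu_available() -> bool:
    # Resolve the lib directory next to this module and test for the sentinel.
    current_dir = os.path.dirname(os.path.abspath(__file__))
    sentinel = os.path.join(current_dir, "lib", "empty_file.txt")
    return os.path.exists(sentinel)

# Callers then choose GPU offload accordingly, as the tests do:
# n_gpu_layers = -1 if is_gpu_available() else 0
```

This keeps a single `nexaai` package for CPU and GPU builds: the wheel built with `-DGGML_CUDA=ON` or `-DGGML_METAL=ON` ships the sentinel alongside the shared libraries, so no separate `nexaai-cuda`/`nexaai-metal` detection logic is needed at import time.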