Ethan/bugfix #27

Merged: 5 commits, Aug 22, 2024

104 changes: 52 additions & 52 deletions CMakeLists.txt
@@ -1,5 +1,57 @@
cmake_minimum_required(VERSION 3.16)

# Project: stable_diffusion_cpp
project(stable_diffusion_cpp)

option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON)

if (STABLE_DIFFUSION_BUILD)
set(BUILD_SHARED_LIBS "ON")
option(SD_BUILD_SHARED_LIBS "" "ON")

# Building stable-diffusion
if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
# Need to disable these ggml CPU flags on Apple x86_64,
# otherwise users may encounter invalid-instruction errors
set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE)
set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE)
set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE)
set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE)
endif()

add_subdirectory(dependency/stable-diffusion.cpp)
install(
TARGETS stable-diffusion
LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
)

message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}")
# Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
install(
TARGETS stable-diffusion
LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
)
# Workaround for Windows + CUDA
if (WIN32)
install(
FILES $<TARGET_RUNTIME_DLLS:stable-diffusion>
DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
)
install(
FILES $<TARGET_RUNTIME_DLLS:stable-diffusion>
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
)
endif()
endif()

# Project: llama_cpp
project(llama_cpp)

@@ -122,55 +174,3 @@ if (LLAMA_BUILD)
endif()
endif()
endif()

# Project: stable_diffusion_cpp
project(stable_diffusion_cpp)

option(STABLE_DIFFUSION_BUILD "Build stable-diffusion.cpp shared library and install alongside python package" ON)

if (STABLE_DIFFUSION_BUILD)
set(BUILD_SHARED_LIBS "ON")
option(SD_BUILD_SHARED_LIBS "" "ON")

# Building stable-diffusion
if (APPLE AND NOT CMAKE_SYSTEM_PROCESSOR MATCHES "arm64")
# Need to disable these ggml CPU flags on Apple x86_64,
# otherwise users may encounter invalid-instruction errors
set(GGML_AVX "Off" CACHE BOOL "ggml: enable AVX" FORCE)
set(GGML_AVX2 "Off" CACHE BOOL "ggml: enable AVX2" FORCE)
set(GGML_FMA "Off" CACHE BOOL "ggml: enable FMA" FORCE)
set(GGML_F16C "Off" CACHE BOOL "ggml: enable F16C" FORCE)
endif()

add_subdirectory(dependency/stable-diffusion.cpp)
install(
TARGETS stable-diffusion
LIBRARY DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
RUNTIME DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
ARCHIVE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
FRAMEWORK DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
RESOURCE DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
)

message(STATUS "SKBUILD_PLATLIB_DIR: ${SKBUILD_PLATLIB_DIR}")
# Temporary fix for https://github.com/scikit-build/scikit-build-core/issues/374
install(
TARGETS stable-diffusion
LIBRARY DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
RUNTIME DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
ARCHIVE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
FRAMEWORK DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
RESOURCE DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
)
# Workaround for Windows + CUDA
if (WIN32)
install(
FILES $<TARGET_RUNTIME_DLLS:stable-diffusion>
DESTINATION ${SKBUILD_PLATLIB_DIR}/nexa/gguf/lib
)
install(
FILES $<TARGET_RUNTIME_DLLS:stable-diffusion>
DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/nexa/gguf/lib
)
endif()
endif()
19 changes: 16 additions & 3 deletions README.md
@@ -24,7 +24,7 @@ Detailed API documentation is available [here](docs/index.html).

## Installation

**GPU version (optional)**

Check if you have GPU acceleration (torch required):
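For example, a minimal probe (assumes torch is already installed; these are standard torch APIs, not part of this SDK):

```python
# Quick GPU-availability check with torch.
import torch

print(torch.cuda.is_available())          # True if a CUDA device is usable
print(torch.backends.mps.is_available())  # True on Apple Silicon (Metal)
```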
<details>
@@ -40,16 +40,24 @@ check if you have GPU acceleration (torch required)
```
CMAKE_ARGS="-DGGML_CUDA=on -DSD_CUBLAS=ON" pip install nexaai-gpu
```
Or, if you prefer, install our pre-built wheel:
```bash
pip install nexaai-cuda --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple
```
</details>
<details>
<summary>Apple M Chip:</summary>
Apple icon -> About This Mac -> Graphics

If so:

```
CMAKE_ARGS="-DGGML_METAL=on -DSD_METAL=ON" pip install nexaai-gpu
```
Or, if you prefer, install our pre-built wheel:
```bash
pip install nexaai-metal --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple
```
</details>

<details>
@@ -77,7 +85,12 @@ check if you have GPU acceleration (torch required)
```
pip install nexaai
```
<details>
</details>

Or, if you prefer, install the pre-built wheel:
```bash
pip install nexaai --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple
```

## Nexa CLI commands

24 changes: 12 additions & 12 deletions nexa/gguf/nexa_inference_image.py
@@ -43,7 +43,7 @@ class NexaImageInference:
streamlit (bool): Run the inference in Streamlit UI.

"""


def __init__(self, model_path, **kwargs):
self.model_path = None
@@ -81,7 +81,7 @@ def __init__(self, model_path, **kwargs):
logging.error("Failed to load the model or pipeline.")
exit(1)

# @SpinningCursorAnimation()
@SpinningCursorAnimation()
def _load_model(self, model_path: str):
with suppress_stdout_stderr():
from nexa.gguf.sd.stable_diffusion import StableDiffusion
@@ -108,9 +108,9 @@ def _save_images(self, images):
file_path = os.path.join(output_dir, file_name)
image.save(file_path)
logging.info(f"\nImage {i+1} saved to: {file_path}")

def txt2img(self,
prompt,
negative_prompt="",
cfg_scale=7.5,
width=512,
@@ -151,7 +151,7 @@ def run_txt2img(self):
)
try:
images = self.txt2img(
prompt,
negative_prompt,
cfg_scale=self.params["guidance_scale"],
width=self.params["width"],
@@ -169,9 +169,9 @@ def run_txt2img(self):
except Exception as e:
logging.error(f"Error during generation: {e}", exc_info=True)

def img2img(self,
image_path,
prompt,
negative_prompt="",
cfg_scale=7.5,
width=512,
@@ -213,8 +213,8 @@ def run_img2img(self):
negative_prompt = nexa_prompt(
"Enter your negative prompt (press Enter to skip): "
)
images = self.img2img(image_path,
prompt,
negative_prompt,
cfg_scale=self.params["guidance_scale"],
width=self.params["width"],
@@ -224,7 +224,7 @@ def run_img2img(self):
control_cond=self.params.get("control_image_path", ""),
control_strength=self.params.get("control_strength", 0.9),
)

self._save_images(images)
except KeyboardInterrupt:
print(EXIT_REMINDER)
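For context, a hypothetical usage sketch of the `txt2img` API touched above — the import path and method names come from the diff, while the model identifier and prompts are made-up placeholders:

```python
# Hypothetical sketch, not part of this PR: generate an image via the SDK API
# shown in the diff. "sd1-4" is an assumed model identifier.
from nexa.gguf.nexa_inference_image import NexaImageInference

sd = NexaImageInference(model_path="sd1-4")
images = sd.txt2img(
    "a lighthouse at dusk",
    negative_prompt="blurry",
    cfg_scale=7.5,
    width=512,
)
sd._save_images(images)  # helper from the diff; writes numbered files to an output dir
```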
10 changes: 5 additions & 5 deletions nexa/gguf/nexa_inference_text.py
@@ -40,7 +40,7 @@ class NexaTextInference:
top_k (int): Top-k sampling parameter.
top_p (float): Top-p sampling parameter
"""

def __init__(self, model_path, stop_words=None, **kwargs):
self.params = DEFAULT_TEXT_GEN_PARAMS
self.params.update(kwargs)
@@ -98,7 +98,7 @@ def create_embedding(
"""
return self.model.create_embedding(input)

# @SpinningCursorAnimation()
@SpinningCursorAnimation()
def _load_model(self):
logging.debug(f"Loading model from {self.downloaded_path}")
start_time = time.time()
@@ -140,7 +140,7 @@ def _load_model(self):

def run(self):
"""
CLI interactive session. Not for SDK.
"""
while True:
generated_text = ""
@@ -189,7 +189,7 @@ def run(self):
except Exception as e:
logging.error(f"Error during generation: {e}", exc_info=True)
print("\n")

def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, stream=False, stop=None):
"""
Used for SDK. Generate completion for a chat conversation.
@@ -207,7 +207,7 @@ def create_chat_completion(self, messages, temperature=0.7, max_tokens=2048, top
Iterator: Iterator for the completion.
"""
return self.model.create_chat_completion(messages=messages, temperature=temperature, max_tokens=max_tokens, top_k=top_k, top_p=top_p, stream=stream, stop=stop)

def create_completion(self, prompt, temperature=0.7, max_tokens=2048, top_k=50, top_p=1.0, echo=False, stream=False, stop=None):
"""
Used for SDK. Generate completion for a given prompt.
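A hypothetical sketch of the two SDK entry points above, `create_chat_completion` and `create_completion` — the signatures mirror the diff, while the model identifier and prompts are assumptions:

```python
# Hypothetical sketch, not part of this PR.
from nexa.gguf.nexa_inference_text import NexaTextInference

llm = NexaTextInference(model_path="gemma")  # assumed model identifier

# Chat-style completion; parameters mirror the signature in the diff.
chat = llm.create_chat_completion(
    messages=[{"role": "user", "content": "Write a haiku about GPUs."}],
    temperature=0.7,
    max_tokens=256,
)

# Plain prompt completion.
text = llm.create_completion("The capital of France is", max_tokens=16)
```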
24 changes: 12 additions & 12 deletions nexa/gguf/nexa_inference_vlm.py
@@ -86,7 +86,7 @@ class NexaVLMInference:
top_k (int): Top-k sampling parameter.
top_p (float): Top-p sampling parameter
"""


def __init__(self, model_path, stop_words=None, **kwargs):
self.params = DEFAULT_TEXT_GEN_PARAMS
@@ -146,7 +146,7 @@ def __init__(self, model_path, stop_words=None, **kwargs):
)
exit(1)

# @SpinningCursorAnimation()
@SpinningCursorAnimation()
def _load_model(self):
logging.debug(f"Loading model from {self.downloaded_path}")
start_time = time.time()
@@ -240,18 +240,18 @@ def run(self):
except Exception as e:
logging.error(f"Error during generation: {e}", exc_info=True)
print("\n")

def create_chat_completion(self,
messages,
max_tokens:int = 2048,
temperature: float = 0.2,
top_p: float = 0.95,
top_k: int = 40,
stream=False,
stop=[]):
"""
Generate text completion for a given chat prompt.

Args:
messages (list): List of messages in the chat prompt.
temperature (float): Temperature for sampling.
@@ -260,7 +260,7 @@ def create_chat_completion(self,
top_p (float): Top-p sampling parameter.
stream (bool): Stream the output.
stop (list): List of stop words for early stopping.

Returns:
Iterator: An iterator of the generated text completion
return format:
@@ -285,9 +285,9 @@ def create_chat_completion(self,
"prompt_tokens": 57,
"total_tokens": 74
}
}
usage: message = completion.choices[0].message.content

"""
return self.model.create_chat_completion(
messages=messages,
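A hypothetical usage sketch of the VLM chat API above; the model identifier and message payload are assumptions, and the result access follows the return format given in the docstring:

```python
# Hypothetical sketch, not part of this PR.
from nexa.gguf.nexa_inference_vlm import NexaVLMInference

vlm = NexaVLMInference(model_path="nanollava")  # assumed model identifier
completion = vlm.create_chat_completion(
    messages=[{"role": "user", "content": "Describe this image."}],
    max_tokens=2048,
    temperature=0.2,
    top_p=0.95,
    top_k=40,
)
# Dict-style access matching the JSON return format in the docstring
# (the docstring also shows attribute-style access).
message = completion["choices"][0]["message"]["content"]
```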
6 changes: 3 additions & 3 deletions nexa/gguf/nexa_inference_voice.py
@@ -69,7 +69,7 @@ def __init__(self, model_path, **kwargs):
exit(1)


# @SpinningCursorAnimation()
@SpinningCursorAnimation()
def _load_model(self):
from faster_whisper import WhisperModel

@@ -91,7 +91,7 @@ def run(self):
print(EXIT_REMINDER)
except Exception as e:
logging.error(f"Error during text generation: {e}", exc_info=True)

def transcribe(self, audio, **kwargs):
"""
Transcribe the audio file.
@@ -171,7 +171,7 @@ def transcribe(self, audio, **kwargs):
audio,
**kwargs,
)


def _transcribe_audio(self, audio_path):
logging.debug(f"Transcribing audio from: {audio_path}")
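A hypothetical sketch of the transcription API above — only `_load_model`, `run`, and `transcribe` appear in the diff, so the class name and model identifier here are assumptions:

```python
# Hypothetical sketch, not part of this PR. kwargs pass through to
# faster_whisper.WhisperModel.transcribe (e.g. beam_size).
from nexa.gguf.nexa_inference_voice import NexaVoiceInference  # assumed class name

stt = NexaVoiceInference(model_path="faster-whisper-tiny")  # assumed identifier
result = stt.transcribe("sample.wav", beam_size=5)
```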
8 changes: 7 additions & 1 deletion nexa/utils.py
@@ -132,7 +132,13 @@ def _spin(self):

def __enter__(self):
if self._use_alternate_stream:
self.stream = open("/dev/tty", "w")
if sys.platform == "win32": # Windows
self.stream = open('CONOUT$', "w")
else:
try:
self.stream = open('/dev/tty', "w")
except (FileNotFoundError, OSError):
self.stream = open('/dev/stdout', "w")
self.thread = threading.Thread(target=self._spin)
self.thread.start()
return self
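This stream fix is what lets the `@SpinningCursorAnimation()` decorator be re-enabled across the inference modules above: the spinner now writes to `CONOUT$` on Windows and `/dev/tty` elsewhere, falling back to `/dev/stdout`. A minimal usage sketch (the import path is an assumption):

```python
# Hypothetical sketch, not part of this PR.
from nexa.utils import SpinningCursorAnimation  # assumed import path
import time

@SpinningCursorAnimation()  # shows a spinner while the wrapped call runs
def load_model():
    time.sleep(3)  # stand-in for a slow model load

load_model()
```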
2 changes: 1 addition & 1 deletion tomls/pyproject_cuda.toml
@@ -81,8 +81,8 @@ build.verbose = true
cmake.build-type = "Release"
cmake.version = ">=3.16"
cmake.args = [
"-DGGML_CUDA=ON",
"-DSD_CUBLAS=ON",
"-DGGML_CUDA=ON",
"-DCMAKE_CUDA_ARCHITECTURES=all",
"-DGGML_CUDA_FORCE_MMQ=ON",
"-DGGML_AVX2=OFF",