Commit: merge from main

zhiyuan8 committed Nov 9, 2024
2 parents 8563977 + 62a3bd4 commit 778c6b9
Showing 13 changed files with 469 additions and 84 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -4,7 +4,9 @@

[![MacOS][MacOS-image]][release-url] [![Linux][Linux-image]][release-url] [![Windows][Windows-image]][release-url]

[![GitHub Release](https://img.shields.io/github/v/release/NexaAI/nexa-sdk)](https://github.com/NexaAI/nexa-sdk/releases/latest) [![Build workflow](https://img.shields.io/github/actions/workflow/status/NexaAI/nexa-sdk/ci.yaml?label=CI&logo=github)](https://github.com/NexaAI/nexa-sdk/actions/workflows/ci.yaml?query=branch%3Amain) ![GitHub License](https://img.shields.io/github/license/NexaAI/nexa-sdk)
[![GitHub Release](https://img.shields.io/github/v/release/NexaAI/nexa-sdk)](https://github.com/NexaAI/nexa-sdk/releases/latest) [![Build workflow](https://img.shields.io/github/actions/workflow/status/NexaAI/nexa-sdk/ci.yaml?label=CI&logo=github)](https://github.com/NexaAI/nexa-sdk/actions/workflows/ci.yaml?query=branch%3Amain) ![GitHub License](https://img.shields.io/github/license/NexaAI/nexa-sdk)

[![](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2FNexaAI%2Fnexa-sdk%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/NexaAI/nexa-sdk) [![](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2FNexaAI%2Fnexa-sdk%2Fbadge%3Ftype%3Dplatforms)](https://swiftpackageindex.com/NexaAI/nexa-sdk)

<!-- ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/nexaai) ![PyPI - Downloads](https://img.shields.io/pypi/dm/nexaai?color=orange) -->

13 changes: 13 additions & 0 deletions examples/swift-test/Shared/ContentView.swift
@@ -3,6 +3,7 @@ import SwiftUI
struct ContentView: View {
@State private var viewModel = ViewModel()
@State private var prompt = ""
@FocusState private var isInputActive: Bool

var body: some View {
VStack {
@@ -21,6 +22,18 @@ struct ContentView: View {
guard !prompt.isEmpty else { return }
viewModel.run(for: prompt)
}
.focused($isInputActive)

Button(action: {
guard !prompt.isEmpty else { return }
viewModel.run(for: prompt)
isInputActive = false
}) {
Text("Send")
.frame(maxWidth: .infinity)
}
.buttonStyle(.borderedProminent)
.padding(.bottom)

ScrollView {
Text(viewModel.result)
2 changes: 1 addition & 1 deletion nexa/__init__.py
@@ -1 +1 @@
__version__ = "0.0.9.0"
__version__ = "0.0.9.1"
16 changes: 9 additions & 7 deletions nexa/cli/entry.py
@@ -46,14 +46,16 @@ def run_ggml_inference(args):
run_type = ModelType[model_type].value

local_path = None
projector_local_path = None

if is_local_path or hf:
if not model_type:
print("Error: --model_type must be provided when using --local_path or --huggingface")
return
if is_local_path:
local_path = os.path.abspath(model_path)
model_path = local_path
if run_type == "Multimodal":
if run_type == "Multimodal" or run_type == "AudioLM":
if not os.path.isdir(local_path):
print("Error: For Multimodal models with --local_path, the provided path must be a directory containing both model and projector ggufs.")
return
@@ -71,7 +73,7 @@
else: # hf case
# TODO: remove this after adding support for Multimodal model in CLI
if run_type == "Multimodal" or run_type == "Audio" or run_type == "TTS":
print("Running multimodal model or audio model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
print("Running multimodal model or audio model or TTS model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
return
from nexa.general import pull_model
local_path, _ = pull_model(model_path, hf=True)
@@ -96,7 +98,6 @@
inference.run_txt2img()
return
elif run_type == "Multimodal":
print("debug run_type", run_type, "is_local_path", is_local_path)
if is_local_path:
if "omni" in local_path:
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference
@@ -105,7 +106,6 @@
from nexa.gguf.nexa_inference_vlm import NexaVLMInference
inference = NexaVLMInference(model_path=model_path, local_path=local_path, projector_local_path=projector_local_path, stop_words=stop_words, **kwargs)
else:
print("debug: local_path", local_path)
if "omni" in local_path:
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference
inference = NexaOmniVlmInference(model_path=model_path, local_path=local_path, **kwargs)
@@ -125,11 +125,12 @@
print(f"Unknown task: {run_type}. Skipping inference.")
return
except Exception as e:
print(f"Error {e}, please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation ")
print(f"Error running ggml inference: {e}")
print(f"Please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation ")
return

if hasattr(args, 'streamlit') and args.streamlit:
if run_type == "Multimodal":
if run_type == "Multimodal" or run_type == "AudioLM":
inference.run_streamlit(model_path, is_local_path = is_local_path, hf = hf, projector_local_path = projector_local_path)
else:
inference.run_streamlit(model_path, is_local_path = is_local_path, hf = hf)
@@ -216,7 +217,8 @@ def run_onnx_inference(args):
print(f"Unknown task: {run_type}. Skipping inference.")
return
except Exception as e:
print(f"Error loading ONNX models, please refer to our docs to install nexaai[onnx] package: https://docs.nexaai.com/getting-started/installation ")
print(f"Error running onnx inference: {e}")
print(f"Please refer to our docs to install nexaai[onnx] package: https://docs.nexaai.com/getting-started/installation ")
return

if hasattr(args, 'streamlit') and args.streamlit:
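
The hunks above extend the "local path must be a directory" rule and the Streamlit projector hand-off from Multimodal to AudioLM models. A minimal sketch of that rule in isolation (the directory layout and file names are hypothetical, not paths shipped with nexa-sdk):

    # Hedged sketch of the directory rule added above; layout and names
    # are hypothetical examples.
    import os

    def split_model_and_projector(local_path: str, run_type: str):
        """Multimodal and AudioLM local paths must be directories holding
        both a model gguf and a projector gguf."""
        if run_type not in ("Multimodal", "AudioLM"):
            return os.path.abspath(local_path), None
        if not os.path.isdir(local_path):
            raise ValueError("--local_path must be a directory for "
                             "Multimodal/AudioLM models")
        ggufs = sorted(f for f in os.listdir(local_path) if f.endswith(".gguf"))
        projector = next((f for f in ggufs if "projector" in f), None)
        model = next((f for f in ggufs if "projector" not in f), None)
        if not (model and projector):
            raise ValueError("expected a model gguf and a projector gguf")
        return os.path.join(local_path, model), os.path.join(local_path, projector)
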
28 changes: 14 additions & 14 deletions nexa/constants.py
@@ -111,12 +111,12 @@ class ModelType(Enum):

NEXA_RUN_MODEL_MAP_AUDIO_LM = {
"qwen2audio": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
"qwen2audio:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
"qwen2audio:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
"qwen2audio:q4_0": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
"Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
"Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
"Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
"omniaudio": "Octo-omni-audio:model-fp16",
"omniaudio:fp16": "Octo-omni-audio:model-fp16",
"omniaudio:q4_0": "Octo-omni-audio:model-q4_0",
"Octo-omni-audio:fp16": "Octo-omni-audio:model-fp16",
"Octo-omni-audio:q4_0": "Octo-omni-audio:model-q4_0",
}

NEXA_RUN_MODEL_MAP_VOICE = {
@@ -173,24 +173,24 @@ class ModelType(Enum):

NEXA_RUN_OMNI_VLM_PROJECTOR_MAP = {
"omnivision": "Octo-omni-vision:projector-fp16",
"omnivision:fp16": "Octo-omni-vision:projector-fp16",
"omnivision:q4_0": "Octo-omni-vision:projector-q4_0",
"Octo-omni-vision:fp16": "Octo-omni-vision:projector-fp16",
"Octo-omni-vision:q4_0": "Octo-omni-vision:projector-q4_0",
}

NEXA_RUN_OMNI_VLM_MAP = {
"omnivision": "Octo-omni-vision:model-fp16",
"omnivision:fp16": "Octo-omni-vision:model-fp16",
"omnivision:q4_0": "Octo-omni-vision:model-q4_0",
"Octo-omni-vision:fp16": "Octo-omni-vision:model-fp16",
"Octo-omni-vision:q4_0": "Octo-omni-vision:model-q4_0",
}

NEXA_RUN_AUDIO_LM_PROJECTOR_MAP = {
"qwen2audio": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
"qwen2audio:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
"qwen2audio:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
"qwen2audio:q4_0": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
"Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
"Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
"Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
"omniaudio": "Octo-omni-audio:projector-fp16",
"omniaudio:fp16": "Octo-omni-audio:projector-fp16",
"omniaudio:q4_0": "Octo-omni-audio:projector-q4_0",
"Octo-omni-audio:fp16": "Octo-omni-audio:projector-fp16",
"Octo-omni-audio:q4_0": "Octo-omni-audio:projector-q4_0",
}

NEXA_RUN_T5XXL_MAP = {
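
Net effect of these renames: the bare aliases ("omniaudio", "qwen2audio", "omnivision") keep working, while the quantization-qualified keys are now spelled with the full model name. A small lookup sketch over the maps above (resolve_audio_lm is an illustrative helper, not part of nexa.constants):

    # Illustrative resolver over the renamed maps.
    from nexa.constants import (
        NEXA_RUN_MODEL_MAP_AUDIO_LM,
        NEXA_RUN_AUDIO_LM_PROJECTOR_MAP,
    )

    def resolve_audio_lm(tag: str):
        try:
            return (NEXA_RUN_MODEL_MAP_AUDIO_LM[tag],
                    NEXA_RUN_AUDIO_LM_PROJECTOR_MAP[tag])
        except KeyError:
            raise KeyError(f"unknown audio LM tag: {tag}") from None

    # The bare alias and the fully qualified name resolve identically:
    assert resolve_audio_lm("omniaudio") == resolve_audio_lm("Octo-omni-audio:fp16")
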
4 changes: 2 additions & 2 deletions nexa/gguf/llama/audio_lm_cpp.py
@@ -109,8 +109,8 @@ def free(ctx: omni_context_p, is_qwen: bool = True):

# Configure process_full
lib.omni_process_full.argtypes = [omni_context_p, omni_context_params_p]
lib.omni_process_full.restype = None
lib.omni_process_full.restype = ctypes.c_char_p

# Configure free
lib.omni_free.argtypes = [omni_context_p]
lib.omni_free.restype = None
lib.omni_free.restype = None
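
Declaring restype as c_char_p is what lets omni_process_full's generated text cross the C boundary; with restype left as None, ctypes treats the function as void and the call yields None. A standalone demonstration of the pattern, using libc's getenv as a stand-in since omni_process_full needs a loaded model context (assumes a Unix-like libc):

    # ctypes restype demo; getenv stands in for omni_process_full.
    import ctypes, ctypes.util, os

    libc = ctypes.CDLL(ctypes.util.find_library("c"))
    os.environ["NEXA_DEMO"] = "hello"
    libc.getenv.argtypes = [ctypes.c_char_p]

    libc.getenv.restype = None             # treated as void: value is dropped
    print(libc.getenv(b"NEXA_DEMO"))       # None

    libc.getenv.restype = ctypes.c_char_p  # char* marshalled to Python bytes
    print(libc.getenv(b"NEXA_DEMO").decode("utf-8"))  # "hello"
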
2 changes: 1 addition & 1 deletion nexa/gguf/llama/omni_vlm_cpp.py
@@ -68,7 +68,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):


_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_inference.restype = None
_lib.omnivlm_inference.restype = omni_char_p


def omnivlm_free():
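
The same restype fix as in audio_lm_cpp.py, so the VLM's answer is returned instead of dropped. What calling code might look like, assuming the Python wrapper passes through the native return value and the omnivlm library has been loaded and initialized (prompt and image path are placeholders):

    # Sketch only: assumes omnivlm initialization has already happened.
    from nexa.gguf.llama.omni_vlm_cpp import omnivlm_inference

    raw = omnivlm_inference(b"Describe this image.", b"/tmp/example.jpg")
    if raw is not None:
        print(raw.decode("utf-8"))
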
153 changes: 133 additions & 20 deletions nexa/gguf/nexa_inference_audio_lm.py
@@ -1,8 +1,13 @@
import ctypes
import logging
import os
import sys
import librosa
import tempfile
import soundfile as sf
from pathlib import Path

from streamlit.web import cli as stcli
from nexa.utils import SpinningCursorAnimation, nexa_prompt
from nexa.constants import (
DEFAULT_TEXT_GEN_PARAMS,
NEXA_RUN_MODEL_MAP_AUDIO_LM,
@@ -13,7 +18,6 @@
from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
from nexa.general import pull_model


def is_qwen(model_name):
if "qwen" in model_name.lower(): # TEMPORARY SOLUTION : this hardcode can be risky
return True
@@ -60,6 +64,8 @@ def __init__(
self.projector_downloaded_path = projector_local_path
self.device = device
self.context = None
self.temp_file = None

if self.device == "auto" or self.device == "gpu":
self.n_gpu_layers = -1 if is_gpu_available() else 0
else:
@@ -114,6 +120,7 @@ def __init__(
with suppress_stdout_stderr():
self._load_model()

@SpinningCursorAnimation()
def _load_model(self):
try:
self.ctx_params.model = ctypes.c_char_p(
@@ -137,29 +144,84 @@ def _load_model(self):
raise

def run(self):
"""
Run the audio language model inference loop.
"""
try:
while True:
audio_path = self._get_valid_audio_path()
user_input = nexa_prompt("Enter text (leave empty if no prompt): ")

response = self.inference(audio_path, user_input)
print(response)

except KeyboardInterrupt:
print("\nExiting...")
except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
finally:
self.cleanup()

def _get_valid_audio_path(self) -> str:
"""
Helper method to get a valid audio file path from user
"""
while True:
try:
while True:
audio_path = input("Audio Path (required): ")
if os.path.exists(audio_path):
break
print(f"'{audio_path}' is not a valid audio path. Please try again.")
audio_path = nexa_prompt("Enter the path to your audio file (required): ")
if os.path.exists(audio_path):
# Check if it's a supported audio format
if any(audio_path.lower().endswith(ext) for ext in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']):
return audio_path
print(f"Unsupported audio format. Please use WAV, MP3, M4A, FLAC, or OGG files.")
else:
print(f"'{audio_path}' is not a valid audio path. Please try again.")

user_input = input("Enter text (leave empty if no prompt): ")
def inference(self, audio_path: str, prompt: str = "") -> str:
"""
Perform a single inference with the audio language model.
"""
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")

self.ctx_params.file = ctypes.c_char_p(audio_path.encode("utf-8"))
self.ctx_params.prompt = ctypes.c_char_p(user_input.encode("utf-8"))
try:
# Ensure audio is at 16kHz before processing
audio_path = self._ensure_16khz(audio_path)

audio_lm_cpp.process_full(
self.context, ctypes.byref(self.ctx_params), is_qwen=self.is_qwen
)
self.ctx_params.file = ctypes.c_char_p(audio_path.encode("utf-8"))
self.ctx_params.prompt = ctypes.c_char_p(prompt.encode("utf-8"))

except KeyboardInterrupt:
print("\nExiting...")
break
response = audio_lm_cpp.process_full(
self.context, ctypes.byref(self.ctx_params), is_qwen=self.is_qwen
)
return response
except Exception as e:
raise RuntimeError(f"Error during inference: {str(e)}")
finally:
if self.temp_file:
try:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.unlink(self.temp_file.name)
except:
pass
self.temp_file = None

except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
def cleanup(self):
"""
Explicitly cleanup resources
"""
if self.context:
audio_lm_cpp.free(self.context, is_qwen=self.is_qwen)
self.context = None

if self.temp_file:
try:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.unlink(self.temp_file.name)
except:
pass
self.temp_file = None

def __del__(self):
"""
@@ -168,6 +230,47 @@ def __del__(self):
if self.context:
audio_lm_cpp.free(self.context, is_qwen=self.is_qwen)

def _ensure_16khz(self, audio_path: str) -> str:
"""
Check if audio is 16kHz, resample if necessary.
Supports various audio formats (mp3, wav, m4a, etc.)
"""
try:
y, sr = librosa.load(audio_path, sr=None)

if sr == 16000:
return audio_path

# Resample to 16kHz
print(f"Resampling audio from {sr} to 16000")
y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
self.temp_file = tempfile.NamedTemporaryFile(
suffix='.wav',
delete=False
)
sf.write(
self.temp_file.name,
y_resampled,
16000,
subtype='PCM_16'
)
return self.temp_file.name

except Exception as e:
raise RuntimeError(f"Error processing audio file: {str(e)}")

def run_streamlit(self, model_path: str, is_local_path = False, hf = False, projector_local_path = None):
"""
Run the Streamlit UI.
"""
logging.info("Running Streamlit UI...")

streamlit_script_path = (
Path(os.path.abspath(__file__)).parent / "streamlit" / "streamlit_audio_lm.py"
)

sys.argv = ["streamlit", "run", str(streamlit_script_path), model_path, str(is_local_path), str(hf), str(projector_local_path)]
sys.exit(stcli.main())

if __name__ == "__main__":
import argparse
@@ -188,10 +291,20 @@ def __del__(self):
default="auto",
help="Device to use for inference (auto, cpu, or gpu)",
)
parser.add_argument(
"-st",
"--streamlit",
action="store_true",
help="Run the inference in Streamlit UI",
)

args = parser.parse_args()
kwargs = {k: v for k, v in vars(args).items() if v is not None}
model_path = kwargs.pop("model_path")
device = kwargs.pop("device", "auto")

inference = NexaAudioLMInference(model_path, device=device, **kwargs)
inference.run()
if args.streamlit:
inference.run_streamlit(model_path)
else:
inference.run()
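
With inference(), cleanup(), and automatic 16 kHz resampling factored out above, the class is now usable programmatically as well as through the interactive run() loop. A hedged usage sketch ("omniaudio" and sample.wav are placeholders; behavior depends on a successful model download and load):

    # Usage sketch for the refactored API; model tag and wav path are
    # placeholders.
    from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference

    inference = NexaAudioLMInference("omniaudio", device="auto")
    try:
        # inference() resamples non-16 kHz input to a temporary wav first
        print(inference.inference("sample.wav", "What is being said?"))
    finally:
        inference.cleanup()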