Merge pull request #224 from NexaAI/david/newfeature
add vlm_omni & audio_lm streamlit support
zhiyuan8 authored Nov 8, 2024
2 parents 0485667 + 8016aac commit 4eee461
Showing 8 changed files with 378 additions and 34 deletions.
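For orientation, a minimal sketch of how the two new Streamlit entry points are meant to be driven from Python (class and method names come from the diffs below; the model identifiers are illustrative). The same path is reached from the command line via the new -st/--streamlit flag added to both __main__ blocks:

from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference

# Illustrative model identifiers; substitute any AudioLM / omni-VLM model path.
audio = NexaAudioLMInference("omniaudio", device="auto")
audio.run_streamlit("omniaudio")  # hands control to Streamlit and does not return

# Or, for the vision-language model:
# vlm = NexaOmniVlmInference("omnivision", device="auto")
# vlm.run_streamlit("omnivision")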
2 changes: 1 addition & 1 deletion dependency/llama.cpp
6 changes: 3 additions & 3 deletions nexa/cli/entry.py
@@ -55,7 +55,7 @@ def run_ggml_inference(args):
if is_local_path:
local_path = os.path.abspath(model_path)
model_path = local_path
if run_type == "Multimodal":
if run_type == "Multimodal" or run_type == "AudioLM":
if not os.path.isdir(local_path):
print("Error: For Multimodal models with --local_path, the provided path must be a directory containing both model and projector ggufs.")
return
@@ -73,7 +73,7 @@ def run_ggml_inference(args):
else: # hf case
# TODO: remove this after adding support for Multimodal model in CLI
if run_type == "Multimodal" or run_type == "Audio" or run_type == "TTS":
print("Running multimodal model or audio model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
print("Running multimodal model or audio model or TTS model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
return
from nexa.general import pull_model
local_path, _ = pull_model(model_path, hf=True)
@@ -130,7 +130,7 @@ def run_ggml_inference(args):
return

if hasattr(args, 'streamlit') and args.streamlit:
if run_type == "Multimodal":
if run_type == "Multimodal" or run_type == "AudioLM":
inference.run_streamlit(model_path, is_local_path = is_local_path, hf = hf, projector_local_path = projector_local_path)
else:
inference.run_streamlit(model_path, is_local_path = is_local_path, hf = hf)
2 changes: 1 addition & 1 deletion nexa/gguf/llama/omni_vlm_cpp.py
@@ -68,7 +68,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):


_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_inference.restype = None
_lib.omnivlm_inference.restype = omni_char_p


def omnivlm_free():
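Because the restype moves from None to omni_char_p, callers now receive the generated text as a C string (bytes) rather than relying on the library printing it; a minimal calling sketch, mirroring the inference() method added to nexa_inference_vlm_omni.py below (the image path is illustrative, and the model is assumed to have already been loaded through the wrapper's init call):

import ctypes
from nexa.gguf.llama import omni_vlm_cpp

prompt = ctypes.c_char_p("Describe this image".encode("utf-8"))
image_path = ctypes.c_char_p("/path/to/photo.png".encode("utf-8"))  # illustrative path
response = omni_vlm_cpp.omnivlm_inference(prompt, image_path)
print(response.decode("utf-8"))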
151 changes: 131 additions & 20 deletions nexa/gguf/nexa_inference_audio_lm.py
@@ -1,7 +1,12 @@
import ctypes
import logging
import os
import sys
import librosa
import tempfile
import soundfile as sf
from pathlib import Path
from streamlit.web import cli as stcli
from nexa.utils import SpinningCursorAnimation, nexa_prompt
from nexa.constants import (
DEFAULT_TEXT_GEN_PARAMS,
@@ -13,7 +18,6 @@
from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
from nexa.general import pull_model


def is_qwen(model_name):
if "qwen" in model_name.lower(): # TEMPORARY SOLUTION : this hardcode can be risky
return True
@@ -60,6 +64,8 @@ def __init__(
self.projector_downloaded_path = projector_local_path
self.device = device
self.context = None
self.temp_file = None

if self.device == "auto" or self.device == "gpu":
self.n_gpu_layers = -1 if is_gpu_available() else 0
else:
@@ -138,30 +144,84 @@ def _load_model(self):
raise

def run(self):
"""
Run the audio language model inference loop.
"""
try:
while True:
audio_path = self._get_valid_audio_path()
user_input = nexa_prompt("Enter text (leave empty if no prompt): ")

response = self.inference(audio_path, user_input)
print(response)

except KeyboardInterrupt:
print("\nExiting...")
except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
finally:
self.cleanup()

def _get_valid_audio_path(self) -> str:
"""
Helper method to get a valid audio file path from user
"""
while True:
try:
while True:
audio_path = nexa_prompt("Enter the path to your audio file (required): ")
if os.path.exists(audio_path):
break
print(f"'{audio_path}' is not a valid audio path. Please try again.")
audio_path = nexa_prompt("Enter the path to your audio file (required): ")
if os.path.exists(audio_path):
# Check if it's a supported audio format
if any(audio_path.lower().endswith(ext) for ext in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']):
return audio_path
print(f"Unsupported audio format. Please use WAV, MP3, M4A, FLAC, or OGG files.")
else:
print(f"'{audio_path}' is not a valid audio path. Please try again.")

user_input = nexa_prompt("Enter text (leave empty if no prompt): ")
def inference(self, audio_path: str, prompt: str = "") -> str:
"""
Perform a single inference with the audio language model.
"""
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")

self.ctx_params.file = ctypes.c_char_p(audio_path.encode("utf-8"))
self.ctx_params.prompt = ctypes.c_char_p(user_input.encode("utf-8"))
try:
# Ensure audio is at 16kHz before processing
audio_path = self._ensure_16khz(audio_path)

response = audio_lm_cpp.process_full(
self.context, ctypes.byref(self.ctx_params), is_qwen=self.is_qwen
).decode("utf-8")
print(response)
self.ctx_params.file = ctypes.c_char_p(audio_path.encode("utf-8"))
self.ctx_params.prompt = ctypes.c_char_p(prompt.encode("utf-8"))

except KeyboardInterrupt:
print("\nExiting...")
break
response = audio_lm_cpp.process_full(
self.context, ctypes.byref(self.ctx_params), is_qwen=self.is_qwen
)
return response
except Exception as e:
raise RuntimeError(f"Error during inference: {str(e)}")
finally:
if self.temp_file:
try:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.unlink(self.temp_file.name)
except:
pass
self.temp_file = None

except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
def cleanup(self):
"""
Explicitly cleanup resources
"""
if self.context:
audio_lm_cpp.free(self.context, is_qwen=self.is_qwen)
self.context = None

if self.temp_file:
try:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.unlink(self.temp_file.name)
except:
pass
self.temp_file = None

def __del__(self):
"""
@@ -170,6 +230,47 @@ def __del__(self):
if self.context:
audio_lm_cpp.free(self.context, is_qwen=self.is_qwen)

def _ensure_16khz(self, audio_path: str) -> str:
"""
Check if audio is 16kHz, resample if necessary.
Supports various audio formats (mp3, wav, m4a, etc.)
"""
try:
y, sr = librosa.load(audio_path, sr=None)

if sr == 16000:
return audio_path

# Resample to 16kHz
print(f"Resampling audio from {sr} to 16000")
y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
self.temp_file = tempfile.NamedTemporaryFile(
suffix='.wav',
delete=False
)
sf.write(
self.temp_file.name,
y_resampled,
16000,
subtype='PCM_16'
)
return self.temp_file.name

except Exception as e:
raise RuntimeError(f"Error processing audio file: {str(e)}")

def run_streamlit(self, model_path: str, is_local_path = False, hf = False, projector_local_path = None):
"""
Run the Streamlit UI.
"""
logging.info("Running Streamlit UI...")

streamlit_script_path = (
Path(os.path.abspath(__file__)).parent / "streamlit" / "streamlit_audio_lm.py"
)

sys.argv = ["streamlit", "run", str(streamlit_script_path), model_path, str(is_local_path), str(hf), str(projector_local_path)]
sys.exit(stcli.main())
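The Streamlit script itself (streamlit/streamlit_audio_lm.py) is not shown in this view; below is a purely hypothetical sketch of how such a script might read the four trailing arguments that run_streamlit appends, since Streamlit forwards them to the launched script's sys.argv:

import sys

# Hypothetical receiving side; the argument order mirrors the sys.argv built above.
model_path = sys.argv[1]
is_local_path = sys.argv[2] == "True"
hf = sys.argv[3] == "True"
projector_local_path = None if sys.argv[4] == "None" else sys.argv[4]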

if __name__ == "__main__":
import argparse
@@ -190,10 +291,20 @@ def __del__(self):
default="auto",
help="Device to use for inference (auto, cpu, or gpu)",
)
parser.add_argument(
"-st",
"--streamlit",
action="store_true",
help="Run the inference in Streamlit UI",
)

args = parser.parse_args()
kwargs = {k: v for k, v in vars(args).items() if v is not None}
model_path = kwargs.pop("model_path")
device = kwargs.pop("device", "auto")

inference = NexaAudioLMInference(model_path, device=device, **kwargs)
inference.run()
if args.streamlit:
inference.run_streamlit(model_path)
else:
inference.run()
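Beyond the interactive loop, the refactor exposes a single-shot inference() call on NexaAudioLMInference; a minimal sketch of programmatic use (the model identifier and audio path are illustrative; non-16 kHz input is resampled to a temporary WAV by _ensure_16khz above):

from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference

# Illustrative model identifier and audio path.
inference = NexaAudioLMInference("omniaudio", device="auto")
response = inference.inference("/path/to/clip.mp3", "Transcribe this recording")
print(response)
inference.cleanup()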
46 changes: 40 additions & 6 deletions nexa/gguf/nexa_inference_vlm_omni.py
@@ -2,7 +2,9 @@
import ctypes
import logging
import os
import sys
from pathlib import Path
from streamlit.web import cli as stcli
from nexa.utils import nexa_prompt, SpinningCursorAnimation
from nexa.constants import (
DEFAULT_TEXT_GEN_PARAMS,
@@ -104,22 +106,44 @@ def run(self):
image_path = nexa_prompt("Image Path (required): ")
if not os.path.exists(image_path):
print(f"Image path: {image_path} not found, running omni VLM without image input.")

user_input = nexa_prompt()
image_path = ctypes.c_char_p(image_path.encode("utf-8"))
user_input = ctypes.c_char_p(user_input.encode("utf-8"))
omni_vlm_cpp.omnivlm_inference(user_input, image_path)

response = self.inference(user_input, image_path)
print(f"\nResponse: {response}")
except KeyboardInterrupt:
print("\nExiting...")
break
except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
print("\n")

def inference(self, prompt: str, image_path: str):
with suppress_stdout_stderr():
prompt = ctypes.c_char_p(prompt.encode("utf-8"))
image_path = ctypes.c_char_p(image_path.encode("utf-8"))
response = omni_vlm_cpp.omnivlm_inference(prompt, image_path)

decoded_response = response.decode('utf-8')
if '<|im_start|>assistant' in decoded_response:
decoded_response = decoded_response.replace('<|im_start|>assistant', '').strip()

return decoded_response

def __del__(self):
omni_vlm_cpp.omnivlm_free()

def run_streamlit(self, model_path: str, is_local_path = False, hf = False, projector_local_path = None):
"""
Run the Streamlit UI.
"""
logging.info("Running Streamlit UI...")

streamlit_script_path = (
Path(os.path.abspath(__file__)).parent / "streamlit" / "streamlit_vlm_omni.py"
)

sys.argv = ["streamlit", "run", str(streamlit_script_path), model_path, str(is_local_path), str(hf), str(projector_local_path)]
sys.exit(stcli.main())


if __name__ == "__main__":
import argparse
@@ -140,10 +164,20 @@ def __del__(self):
default="auto",
help="Device to use for inference (auto, cpu, or gpu)",
)
parser.add_argument(
"-st",
"--streamlit",
action="store_true",
help="Run the inference in Streamlit UI",
)

args = parser.parse_args()
kwargs = {k: v for k, v in vars(args).items() if v is not None}
model_path = kwargs.pop("model_path")
device = kwargs.pop("device", "auto")

inference = NexaOmniVlmInference(model_path, device=device, **kwargs)
inference.run()
if args.streamlit:
inference.run_streamlit(model_path)
else:
inference.run()
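As with the audio model, the new inference() method supports single-shot use outside the interactive loop; a minimal sketch (model identifier and image path are illustrative):

from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference

# Illustrative model identifier and image path.
vlm = NexaOmniVlmInference("omnivision", device="auto")
response = vlm.inference("Describe this image", "/path/to/photo.png")
print(response)  # decoded text, with any '<|im_start|>assistant' marker stripped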