Commit: merge from main

zhiyuan8 committed Nov 9, 2024
2 parents 8563977 + 62a3bd4 commit 778c6b9
Showing 13 changed files with 469 additions and 84 deletions.
4 changes: 3 additions & 1 deletion README.md
@@ -4,7 +4,9 @@

[![MacOS][MacOS-image]][release-url] [![Linux][Linux-image]][release-url] [![Windows][Windows-image]][release-url]

[![GitHub Release](https://img.shields.io/github/v/release/NexaAI/nexa-sdk)](https://github.com/NexaAI/nexa-sdk/releases/latest) [![Build workflow](https://img.shields.io/github/actions/workflow/status/NexaAI/nexa-sdk/ci.yaml?label=CI&logo=github)](https://github.com/NexaAI/nexa-sdk/actions/workflows/ci.yaml?query=branch%3Amain) ![GitHub License](https://img.shields.io/github/license/NexaAI/nexa-sdk)
[![GitHub Release](https://img.shields.io/github/v/release/NexaAI/nexa-sdk)](https://github.com/NexaAI/nexa-sdk/releases/latest) [![Build workflow](https://img.shields.io/github/actions/workflow/status/NexaAI/nexa-sdk/ci.yaml?label=CI&logo=github)](https://github.com/NexaAI/nexa-sdk/actions/workflows/ci.yaml?query=branch%3Amain) ![GitHub License](https://img.shields.io/github/license/NexaAI/nexa-sdk)

[![](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2FNexaAI%2Fnexa-sdk%2Fbadge%3Ftype%3Dswift-versions)](https://swiftpackageindex.com/NexaAI/nexa-sdk) [![](https://img.shields.io/endpoint?url=https%3A%2F%2Fswiftpackageindex.com%2Fapi%2Fpackages%2FNexaAI%2Fnexa-sdk%2Fbadge%3Ftype%3Dplatforms)](https://swiftpackageindex.com/NexaAI/nexa-sdk)

<!-- ![PyPI - Python Version](https://img.shields.io/pypi/pyversions/nexaai) ![PyPI - Downloads](https://img.shields.io/pypi/dm/nexaai?color=orange) -->

13 changes: 13 additions & 0 deletions examples/swift-test/Shared/ContentView.swift
@@ -3,6 +3,7 @@ import SwiftUI
struct ContentView: View {
@State private var viewModel = ViewModel()
@State private var prompt = ""
@FocusState private var isInputActive: Bool

var body: some View {
VStack {
@@ -21,6 +22,18 @@ struct ContentView: View {
guard !prompt.isEmpty else { return }
viewModel.run(for: prompt)
}
.focused($isInputActive)

Button(action: {
guard !prompt.isEmpty else { return }
viewModel.run(for: prompt)
isInputActive = false
}) {
Text("Send")
.frame(maxWidth: .infinity)
}
.buttonStyle(.borderedProminent)
.padding(.bottom)

ScrollView {
Text(viewModel.result)
2 changes: 1 addition & 1 deletion nexa/__init__.py
@@ -1 +1 @@
__version__ = "0.0.9.0"
__version__ = "0.0.9.1"
16 changes: 9 additions & 7 deletions nexa/cli/entry.py
@@ -46,14 +46,16 @@ def run_ggml_inference(args):
run_type = ModelType[model_type].value

local_path = None
projector_local_path = None

if is_local_path or hf:
if not model_type:
print("Error: --model_type must be provided when using --local_path or --huggingface")
return
if is_local_path:
local_path = os.path.abspath(model_path)
model_path = local_path
if run_type == "Multimodal":
if run_type == "Multimodal" or run_type == "AudioLM":
if not os.path.isdir(local_path):
print("Error: For Multimodal models with --local_path, the provided path must be a directory containing both model and projector ggufs.")
return
@@ -71,7 +73,7 @@
else: # hf case
# TODO: remove this after adding support for Multimodal model in CLI
if run_type == "Multimodal" or run_type == "Audio" or run_type == "TTS":
print("Running multimodal model or audio model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
print("Running multimodal model or audio model or TTS model from Hugging Face is currently not supported in CLI mode. Please use SDK to run Multimodal model or Audio model or TTS model.")
return
from nexa.general import pull_model
local_path, _ = pull_model(model_path, hf=True)
@@ -96,7 +98,6 @@
inference.run_txt2img()
return
elif run_type == "Multimodal":
print("debug run_type", run_type, "is_local_path", is_local_path)
if is_local_path:
if "omni" in local_path:
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference
@@ -105,7 +106,6 @@
from nexa.gguf.nexa_inference_vlm import NexaVLMInference
inference = NexaVLMInference(model_path=model_path, local_path=local_path, projector_local_path=projector_local_path, stop_words=stop_words, **kwargs)
else:
print("debug: local_path", local_path)
if "omni" in local_path:
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference
inference = NexaOmniVlmInference(model_path=model_path, local_path=local_path, **kwargs)
@@ -125,11 +125,12 @@
print(f"Unknown task: {run_type}. Skipping inference.")
return
except Exception as e:
print(f"Error {e}, please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation ")
print(f"Error running ggml inference: {e}")
print(f"Please refer to our docs to install nexaai package: https://docs.nexaai.com/getting-started/installation ")
return

if hasattr(args, 'streamlit') and args.streamlit:
if run_type == "Multimodal":
if run_type == "Multimodal" or run_type == "AudioLM":
inference.run_streamlit(model_path, is_local_path = is_local_path, hf = hf, projector_local_path = projector_local_path)
else:
inference.run_streamlit(model_path, is_local_path = is_local_path, hf = hf)
@@ -216,7 +217,8 @@ def run_onnx_inference(args):
print(f"Unknown task: {run_type}. Skipping inference.")
return
except Exception as e:
print(f"Error loading ONNX models, please refer to our docs to install nexaai[onnx] package: https://docs.nexaai.com/getting-started/installation ")
print(f"Error running onnx inference: {e}")
print(f"Please refer to our docs to install nexaai[onnx] package: https://docs.nexaai.com/getting-started/installation ")
return

if hasattr(args, 'streamlit') and args.streamlit:
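
The hunks above extend the "local path must be a directory" rule and the Streamlit projector hand-off from Multimodal to AudioLM models. A minimal sketch of that rule in isolation (the directory layout and file names are hypothetical, not paths shipped with nexa-sdk):

    # Hedged sketch of the directory rule added above; layout and names
    # are hypothetical examples.
    import os

    def split_model_and_projector(local_path: str, run_type: str):
        """Multimodal and AudioLM local paths must be directories holding
        both a model gguf and a projector gguf."""
        if run_type not in ("Multimodal", "AudioLM"):
            return os.path.abspath(local_path), None
        if not os.path.isdir(local_path):
            raise ValueError("--local_path must be a directory for "
                             "Multimodal/AudioLM models")
        ggufs = sorted(f for f in os.listdir(local_path) if f.endswith(".gguf"))
        projector = next((f for f in ggufs if "projector" in f), None)
        model = next((f for f in ggufs if "projector" not in f), None)
        if not (model and projector):
            raise ValueError("expected a model gguf and a projector gguf")
        return os.path.join(local_path, model), os.path.join(local_path, projector)
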
28 changes: 14 additions & 14 deletions nexa/constants.py
@@ -111,12 +111,12 @@ class ModelType(Enum):

NEXA_RUN_MODEL_MAP_AUDIO_LM = {
"qwen2audio": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
"qwen2audio:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
"qwen2audio:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
"qwen2audio:q4_0": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
"Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
"Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
"Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
"omniaudio": "Octo-omni-audio:model-fp16",
"omniaudio:fp16": "Octo-omni-audio:model-fp16",
"omniaudio:q4_0": "Octo-omni-audio:model-q4_0",
"Octo-omni-audio:fp16": "Octo-omni-audio:model-fp16",
"Octo-omni-audio:q4_0": "Octo-omni-audio:model-q4_0",
}

NEXA_RUN_MODEL_MAP_VOICE = {
@@ -173,24 +173,24 @@ class ModelType(Enum):

NEXA_RUN_OMNI_VLM_PROJECTOR_MAP = {
"omnivision": "Octo-omni-vision:projector-fp16",
"omnivision:fp16": "Octo-omni-vision:projector-fp16",
"omnivision:q4_0": "Octo-omni-vision:projector-q4_0",
"Octo-omni-vision:fp16": "Octo-omni-vision:projector-fp16",
"Octo-omni-vision:q4_0": "Octo-omni-vision:projector-q4_0",
}

NEXA_RUN_OMNI_VLM_MAP = {
"omnivision": "Octo-omni-vision:model-fp16",
"omnivision:fp16": "Octo-omni-vision:model-fp16",
"omnivision:q4_0": "Octo-omni-vision:model-q4_0",
"Octo-omni-vision:fp16": "Octo-omni-vision:model-fp16",
"Octo-omni-vision:q4_0": "Octo-omni-vision:model-q4_0",
}

NEXA_RUN_AUDIO_LM_PROJECTOR_MAP = {
"qwen2audio": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
"qwen2audio:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
"qwen2audio:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
"qwen2audio:q4_0": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
"Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
"Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
"Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
"omniaudio": "Octo-omni-audio:projector-fp16",
"omniaudio:fp16": "Octo-omni-audio:projector-fp16",
"omniaudio:q4_0": "Octo-omni-audio:projector-q4_0",
"Octo-omni-audio:fp16": "Octo-omni-audio:projector-fp16",
"Octo-omni-audio:q4_0": "Octo-omni-audio:projector-q4_0",
}

NEXA_RUN_T5XXL_MAP = {
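
Net effect of these renames: the bare aliases ("omniaudio", "qwen2audio", "omnivision") keep working, while the quantization-qualified keys are now spelled with the full model name. A small lookup sketch over the maps above (resolve_audio_lm is an illustrative helper, not part of nexa.constants):

    # Illustrative resolver over the renamed maps.
    from nexa.constants import (
        NEXA_RUN_MODEL_MAP_AUDIO_LM,
        NEXA_RUN_AUDIO_LM_PROJECTOR_MAP,
    )

    def resolve_audio_lm(tag: str):
        try:
            return (NEXA_RUN_MODEL_MAP_AUDIO_LM[tag],
                    NEXA_RUN_AUDIO_LM_PROJECTOR_MAP[tag])
        except KeyError:
            raise KeyError(f"unknown audio LM tag: {tag}") from None

    # The bare alias and the fully qualified name resolve identically:
    assert resolve_audio_lm("omniaudio") == resolve_audio_lm("Octo-omni-audio:fp16")
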
4 changes: 2 additions & 2 deletions nexa/gguf/llama/audio_lm_cpp.py
@@ -109,8 +109,8 @@ def free(ctx: omni_context_p, is_qwen: bool = True):

# Configure process_full
lib.omni_process_full.argtypes = [omni_context_p, omni_context_params_p]
lib.omni_process_full.restype = None
lib.omni_process_full.restype = ctypes.c_char_p

# Configure free
lib.omni_free.argtypes = [omni_context_p]
lib.omni_free.restype = None
lib.omni_free.restype = None
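
Declaring restype as c_char_p is what lets omni_process_full's generated text cross the C boundary; with restype left as None, ctypes treats the function as void and the call yields None. A standalone demonstration of the pattern, using libc's getenv as a stand-in since omni_process_full needs a loaded model context (assumes a Unix-like libc):

    # ctypes restype demo; getenv stands in for omni_process_full.
    import ctypes, ctypes.util, os

    libc = ctypes.CDLL(ctypes.util.find_library("c"))
    os.environ["NEXA_DEMO"] = "hello"
    libc.getenv.argtypes = [ctypes.c_char_p]

    libc.getenv.restype = None             # treated as void: value is dropped
    print(libc.getenv(b"NEXA_DEMO"))       # None

    libc.getenv.restype = ctypes.c_char_p  # char* marshalled to Python bytes
    print(libc.getenv(b"NEXA_DEMO").decode("utf-8"))  # "hello"
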
2 changes: 1 addition & 1 deletion nexa/gguf/llama/omni_vlm_cpp.py
@@ -68,7 +68,7 @@ def omnivlm_inference(prompt: omni_char_p, image_path: omni_char_p):


_lib.omnivlm_inference.argtypes = [omni_char_p, omni_char_p]
_lib.omnivlm_inference.restype = None
_lib.omnivlm_inference.restype = omni_char_p


def omnivlm_free():
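
The same restype fix as in audio_lm_cpp.py, so the VLM's answer is returned instead of dropped. What calling code might look like, assuming the Python wrapper passes through the native return value and the omnivlm library has been loaded and initialized (prompt and image path are placeholders):

    # Sketch only: assumes omnivlm initialization has already happened.
    from nexa.gguf.llama.omni_vlm_cpp import omnivlm_inference

    raw = omnivlm_inference(b"Describe this image.", b"/tmp/example.jpg")
    if raw is not None:
        print(raw.decode("utf-8"))
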
153 changes: 133 additions & 20 deletions nexa/gguf/nexa_inference_audio_lm.py
@@ -1,8 +1,13 @@
import ctypes
import logging
import os
import sys
import librosa
import tempfile
import soundfile as sf
from pathlib import Path

from streamlit.web import cli as stcli
from nexa.utils import SpinningCursorAnimation, nexa_prompt
from nexa.constants import (
DEFAULT_TEXT_GEN_PARAMS,
NEXA_RUN_MODEL_MAP_AUDIO_LM,
@@ -13,7 +18,6 @@
from nexa.gguf.llama._utils_transformers import suppress_stdout_stderr
from nexa.general import pull_model


def is_qwen(model_name):
if "qwen" in model_name.lower(): # TEMPORARY SOLUTION : this hardcode can be risky
return True
@@ -60,6 +64,8 @@ def __init__(
self.projector_downloaded_path = projector_local_path
self.device = device
self.context = None
self.temp_file = None

if self.device == "auto" or self.device == "gpu":
self.n_gpu_layers = -1 if is_gpu_available() else 0
else:
@@ -114,6 +120,7 @@ def __init__(
with suppress_stdout_stderr():
self._load_model()

@SpinningCursorAnimation()
def _load_model(self):
try:
self.ctx_params.model = ctypes.c_char_p(
@@ -137,29 +144,84 @@ def _load_model(self):
raise

def run(self):
"""
Run the audio language model inference loop.
"""
try:
while True:
audio_path = self._get_valid_audio_path()
user_input = nexa_prompt("Enter text (leave empty if no prompt): ")

response = self.inference(audio_path, user_input)
print(response)

except KeyboardInterrupt:
print("\nExiting...")
except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
finally:
self.cleanup()

def _get_valid_audio_path(self) -> str:
"""
Helper method to get a valid audio file path from user
"""
while True:
try:
while True:
audio_path = input("Audio Path (required): ")
if os.path.exists(audio_path):
break
print(f"'{audio_path}' is not a valid audio path. Please try again.")
audio_path = nexa_prompt("Enter the path to your audio file (required): ")
if os.path.exists(audio_path):
# Check if it's a supported audio format
if any(audio_path.lower().endswith(ext) for ext in ['.wav', '.mp3', '.m4a', '.flac', '.ogg']):
return audio_path
print(f"Unsupported audio format. Please use WAV, MP3, M4A, FLAC, or OGG files.")
else:
print(f"'{audio_path}' is not a valid audio path. Please try again.")

user_input = input("Enter text (leave empty if no prompt): ")
def inference(self, audio_path: str, prompt: str = "") -> str:
"""
Perform a single inference with the audio language model.
"""
if not os.path.exists(audio_path):
raise FileNotFoundError(f"Audio file not found: {audio_path}")

self.ctx_params.file = ctypes.c_char_p(audio_path.encode("utf-8"))
self.ctx_params.prompt = ctypes.c_char_p(user_input.encode("utf-8"))
try:
# Ensure audio is at 16kHz before processing
audio_path = self._ensure_16khz(audio_path)

audio_lm_cpp.process_full(
self.context, ctypes.byref(self.ctx_params), is_qwen=self.is_qwen
)
self.ctx_params.file = ctypes.c_char_p(audio_path.encode("utf-8"))
self.ctx_params.prompt = ctypes.c_char_p(prompt.encode("utf-8"))

except KeyboardInterrupt:
print("\nExiting...")
break
response = audio_lm_cpp.process_full(
self.context, ctypes.byref(self.ctx_params), is_qwen=self.is_qwen
)
return response
except Exception as e:
raise RuntimeError(f"Error during inference: {str(e)}")
finally:
if self.temp_file:
try:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.unlink(self.temp_file.name)
except:
pass
self.temp_file = None

except Exception as e:
logging.error(f"\nError during audio generation: {e}", exc_info=True)
def cleanup(self):
"""
Explicitly cleanup resources
"""
if self.context:
audio_lm_cpp.free(self.context, is_qwen=self.is_qwen)
self.context = None

if self.temp_file:
try:
self.temp_file.close()
if os.path.exists(self.temp_file.name):
os.unlink(self.temp_file.name)
except:
pass
self.temp_file = None

def __del__(self):
"""
@@ -168,6 +230,47 @@ def __del__(self):
if self.context:
audio_lm_cpp.free(self.context, is_qwen=self.is_qwen)

def _ensure_16khz(self, audio_path: str) -> str:
"""
Check if audio is 16kHz, resample if necessary.
Supports various audio formats (mp3, wav, m4a, etc.)
"""
try:
y, sr = librosa.load(audio_path, sr=None)

if sr == 16000:
return audio_path

# Resample to 16kHz
print(f"Resampling audio from {sr} to 16000")
y_resampled = librosa.resample(y=y, orig_sr=sr, target_sr=16000)
self.temp_file = tempfile.NamedTemporaryFile(
suffix='.wav',
delete=False
)
sf.write(
self.temp_file.name,
y_resampled,
16000,
subtype='PCM_16'
)
return self.temp_file.name

except Exception as e:
raise RuntimeError(f"Error processing audio file: {str(e)}")

def run_streamlit(self, model_path: str, is_local_path = False, hf = False, projector_local_path = None):
"""
Run the Streamlit UI.
"""
logging.info("Running Streamlit UI...")

streamlit_script_path = (
Path(os.path.abspath(__file__)).parent / "streamlit" / "streamlit_audio_lm.py"
)

sys.argv = ["streamlit", "run", str(streamlit_script_path), model_path, str(is_local_path), str(hf), str(projector_local_path)]
sys.exit(stcli.main())

if __name__ == "__main__":
import argparse
@@ -188,10 +291,20 @@ def __del__(self):
default="auto",
help="Device to use for inference (auto, cpu, or gpu)",
)
parser.add_argument(
"-st",
"--streamlit",
action="store_true",
help="Run the inference in Streamlit UI",
)

args = parser.parse_args()
kwargs = {k: v for k, v in vars(args).items() if v is not None}
model_path = kwargs.pop("model_path")
device = kwargs.pop("device", "auto")

inference = NexaAudioLMInference(model_path, device=device, **kwargs)
inference.run()
if args.streamlit:
inference.run_streamlit(model_path)
else:
inference.run()
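
With inference(), cleanup(), and automatic 16 kHz resampling factored out above, the class is now usable programmatically as well as through the interactive run() loop. A hedged usage sketch ("omniaudio" and sample.wav are placeholders; behavior depends on a successful model download and load):

    # Usage sketch for the refactored API; model tag and wav path are
    # placeholders.
    from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference

    inference = NexaAudioLMInference("omniaudio", device="auto")
    try:
        # inference() resamples non-16 kHz input to a temporary wav first
        print(inference.inference("sample.wav", "What is being said?"))
    finally:
        inference.cleanup()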