diff --git a/.github/workflows/build-wheels-vulkan-win.yaml b/.github/workflows/build-wheels-vulkan-win.yaml index 4683f8f8..ca68208d 100644 --- a/.github/workflows/build-wheels-vulkan-win.yaml +++ b/.github/workflows/build-wheels-vulkan-win.yaml @@ -88,6 +88,6 @@ jobs: uses: softprops/action-gh-release@v2 with: files: dist/* - tag_name: ${{ github.ref_name }}-vulkan${{ env.VULKAN_VERSION }} + tag_name: ${{ github.ref_name }}-vulkan env: GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} \ No newline at end of file diff --git a/.github/workflows/generate-index-from-release.yaml b/.github/workflows/generate-index-from-release.yaml index 11741cab..b993847a 100644 --- a/.github/workflows/generate-index-from-release.yaml +++ b/.github/workflows/generate-index-from-release.yaml @@ -3,7 +3,7 @@ name: Wheels Index on: # Trigger on new release workflow_run: - workflows: ["Release", "Build Wheels (CUDA)", "Build Wheels (Metal)", "Build Wheels (ROCm)", "Build Wheels (Vulkan)"] + workflows: ["Build Wheels (CPU)", "Build Wheels (CUDA)", "Build Wheels (Metal)", "Build Wheels (ROCm)", "Build Wheels (Vulkan)"] types: - completed diff --git a/CMakeLists.txt b/CMakeLists.txt index e1f28a3d..41738eb8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -188,7 +188,8 @@ if(LLAMA_BUILD) endif() # bark_cpp project -option(BARK_BUILD "Build bark.cpp" ON) +# Temporarily disabled since version v0.0.9.3 +option(BARK_BUILD "Build bark.cpp" OFF) if(BARK_BUILD) # Filter out HIPBLAS and Vulkan options for bark.cpp since it doesn't support them set(BARK_CMAKE_OPTIONS ${USER_DEFINED_OPTIONS}) diff --git a/README.md b/README.md index 924b76ed..0e6b50db 100644 --- a/README.md +++ b/README.md @@ -21,8 +21,9 @@ ## Latest News 🔥 -- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B): `nexa run omniaudio` +- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B parameters): `nexa run omniaudio` - Support audio language model: `nexa run qwen2audio`, **we are the first open-source toolkit to support audio language model with GGML tensor library.** +- Support iOS Swift binding for local inference on **iOS mobile** devices. - Support embedding model: `nexa embed ` - Support pull and run supported Computer Vision models in GGUF format from HuggingFace or ModelScope: `nexa run -hf -mt COMPUTER_VISION` or `nexa run -ms -mt COMPUTER_VISION` - Support pull and run NLP models in GGUF format from HuggingFace or ModelScope: `nexa run -hf -mt NLP` or `nexa run -ms -mt NLP` @@ -32,13 +33,13 @@ Welcome to submit your requests through [issues](https://github.com/NexaAI/nexa- ## Install Option 1: Executable Installer

- + macOS Installer

- + Windows Installer

@@ -64,12 +65,12 @@ nexa-exe ## Install Option 2: Python Package -We have released pre-built wheels for various Python versions, platforms, and backends for convenient installation on our [index page](https://nexaai.github.io/nexa-sdk/whl/). +We have released pre-built wheels for various Python versions, platforms, and backends for convenient installation on our [index page](https://github.nexa.ai/whl/).
CPU ```bash -pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cpu --extra-index-url https://pypi.org/simple --no-cache-dir +pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cpu --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -79,7 +80,7 @@ pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk For the GPU version supporting **Metal (macOS)**: ```bash -CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -92,7 +93,7 @@ wget https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge bash Miniforge3-MacOSX-arm64.sh conda create -n nexasdk python=3.10 conda activate nexasdk -CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_METAL=ON -DSD_METAL=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/metal --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -105,25 +106,25 @@ To install with CUDA support, make sure you have [CUDA Toolkit 12.0 or later](ht For **Linux**: ```bash -CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows PowerShell**: ```bash -$env:CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON"; pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +$env:CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON"; pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Command Prompt**: ```bash -set CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" & pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +set CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" & pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Git Bash**: ```bash -CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_CUDA=ON -DSD_CUBLAS=ON" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/cu124 --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -150,7 +151,7 @@ To install with ROCm support, make sure you have [ROCm 6.2.1 or later](https://r For **Linux**: ```bash -CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/rocm621 --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_HIPBLAS=on" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/rocm621 --extra-index-url https://pypi.org/simple --no-cache-dir ```
@@ -162,19 +163,19 @@ To install with Vulkan support, make sure you have [Vulkan SDK 1.3.261.1 or late For **Windows PowerShell**: ```bash -$env:CMAKE_ARGS="-DGGML_VULKAN=on"; pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir +$env:CMAKE_ARGS="-DGGML_VULKAN=on"; pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Command Prompt**: ```bash -set CMAKE_ARGS="-DGGML_VULKAN=on" & pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir +set CMAKE_ARGS="-DGGML_VULKAN=on" & pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir ``` For **Windows Git Bash**: ```bash -CMAKE_ARGS="-DGGML_VULKAN=on" pip install nexaai --prefer-binary --index-url https://nexaai.github.io/nexa-sdk/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir +CMAKE_ARGS="-DGGML_VULKAN=on" pip install nexaai --prefer-binary --index-url https://github.nexa.ai/whl/vulkan --extra-index-url https://pypi.org/simple --no-cache-dir ``` @@ -205,16 +206,18 @@ pip install -e . Below is our differentiation from other similar tools: -| **Feature** | **[Nexa SDK](https://github.com/NexaAI/nexa-sdk)** | **[ollama](https://github.com/ollama/ollama)** | **[Optimum](https://github.com/huggingface/optimum)** | **[LM Studio](https://github.com/lmstudio-ai)** | -| -------------------------- | :------------------------------------------------: | :--------------------------------------------: | :---------------------------------------------------: | :---------------------------------------------: | -| **GGML Support** | ✅ | ✅ | ❌ | ✅ | -| **ONNX Support** | ✅ | ❌ | ✅ | ❌ | -| **Text Generation** | ✅ | ✅ | ✅ | ✅ | -| **Image Generation** | ✅ | ❌ | ❌ | ❌ | -| **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ | -| **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ | -| **Server Capability** | ✅ | ✅ | ✅ | ✅ | -| **User Interface** | ✅ | ❌ | ❌ | ✅ | +| **Feature** | **[Nexa SDK](https://github.com/NexaAI/nexa-sdk)** | **[ollama](https://github.com/ollama/ollama)** | **[Optimum](https://github.com/huggingface/optimum)** | **[LM Studio](https://github.com/lmstudio-ai)** | +| --------------------------- | :------------------------------------------------: | :--------------------------------------------: | :---------------------------------------------------: | :---------------------------------------------: | +| **GGML Support** | ✅ | ✅ | ❌ | ✅ | +| **ONNX Support** | ✅ | ❌ | ✅ | ❌ | +| **Text Generation** | ✅ | ✅ | ✅ | ✅ | +| **Image Generation** | ✅ | ❌ | ❌ | ❌ | +| **Vision-Language Models** | ✅ | ✅ | ✅ | ✅ | +| **Audio-Language Models** | ✅ | ❌ | ❌ | ❌ | +| **Text-to-Speech** | ✅ | ❌ | ✅ | ❌ | +| **Server Capability** | ✅ | ✅ | ✅ | ✅ | +| **User Interface** | ✅ | ❌ | ❌ | ✅ | +| **Executable Installation** | ✅ | ✅ | ❌ | ✅ | ## Supported Models & Model Hub @@ -223,11 +226,11 @@ Our on-device model hub offers all types of quantized models (text, image, audio Supported model examples (full list at [Model Hub](https://nexa.ai/models)): | Model | Type | Format | Command | | ------------------------------------------------------------------------------------------------------- | --------------- | --------- | -------------------------------------- | -| 
[omniaudio](https://nexa.ai/NexaAI/Octo-omni-audio/gguf-q4_0/readme) | AudioLM | GGUF | `nexa run omniaudio` | +| [omniaudio](https://nexa.ai/NexaAI/omniaudio/gguf-q4_0/readme) | AudioLM | GGUF | `nexa run omniaudio` | | [qwen2audio](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | AudioLM | GGUF | `nexa run qwen2audio` | | [octopus-v2](https://www.nexaai.com/NexaAI/Octopus-v2/gguf-q4_0/readme) | Function Call | GGUF | `nexa run octopus-v2` | | [octo-net](https://www.nexaai.com/NexaAI/Octo-net/gguf-q4_0/readme) | Text | GGUF | `nexa run octo-net` | -| [omnivision](https://nexa.ai/NexaAI/Octo-omni-vision/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omnivision` | +| [omnivision](https://nexa.ai/NexaAI/omnivision/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omnivision` | | [nanollava](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | Multimodal | GGUF | `nexa run nanollava` | | [llava-phi3](https://www.nexaai.com/xtuner/llava-phi-3-mini/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-phi3` | | [llava-llama3](https://www.nexaai.com/xtuner/llava-llama-3-8b-v1.1/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-llama3` | @@ -255,25 +258,37 @@ Supported model examples (full list at [Model Hub](https://nexa.ai/models)): | [bark-small](https://nexa.ai/suno/bark-small/gguf-fp16/readme) | Text-to-Speech | GGUF | `nexa run bark-small:fp16` | ## Run Models from 🤗 HuggingFace or 🤖 ModelScope + You can pull, convert (to .gguf), quantize and run [llama.cpp supported](https://github.com/ggerganov/llama.cpp#description) text generation models from HF or MS with Nexa SDK. + ### Run .gguf File + Use `nexa run -hf ` or `nexa run -ms ` to run models with provided .gguf files: + ```bash nexa run -hf Qwen/Qwen2.5-Coder-7B-Instruct-GGUF ``` + ```bash nexa run -ms Qwen/Qwen2.5-Coder-7B-Instruct-GGUF ``` + > **Note:** You will be prompted to select a single .gguf file. If your desired quantization version has multiple split files (like fp16-00001-of-00004), please use Nexa's conversion tool (see below) to convert and quantize the model locally. + ### Convert .safetensors Files + Install [Nexa Python package](https://github.com/NexaAI/nexa-sdk?tab=readme-ov-file#install-option-2-python-package), and install Nexa conversion tool with `pip install "nexaai[convert]"`, then convert models from huggingface with `nexa convert `: + ```bash nexa convert HuggingFaceTB/SmolLM2-135M-Instruct ``` + Or you can convert models from ModelScope with `nexa convert -ms `: + ```bash nexa convert -ms Qwen/Qwen2.5-7B-Instruct ``` + > **Note:** Check our [leaderboard](https://nexa.ai/leaderboard) for performance benchmarks of different quantized versions of mainstream language models and [HuggingFace docs](https://huggingface.co/docs/optimum/en/concept_guides/quantization) to learn about quantization options. 
📋 You can view downloaded and converted models with `nexa list` diff --git a/dependency/llama.cpp b/dependency/llama.cpp index bb33473f..ed459776 160000 --- a/dependency/llama.cpp +++ b/dependency/llama.cpp @@ -1 +1 @@ -Subproject commit bb33473f08db604e1f30334366032f0904e2a722 +Subproject commit ed459776811d0928ce55a001e9e5a6bc3bf22ca4 diff --git a/docs/README.md b/docs/README.md index c6481515..252116f7 100644 --- a/docs/README.md +++ b/docs/README.md @@ -61,6 +61,7 @@ twine upload dist/* ``` git tag git tag -d +git push origin --delete git tag git push origin ``` diff --git a/nexa/__init__.py b/nexa/__init__.py index b53e36d3..488bc026 100644 --- a/nexa/__init__.py +++ b/nexa/__init__.py @@ -1 +1 @@ -__version__ = "0.0.9.2" +__version__ = "0.0.9.4" diff --git a/nexa/cli/entry.py b/nexa/cli/entry.py index b4ea222d..ece27ca7 100644 --- a/nexa/cli/entry.py +++ b/nexa/cli/entry.py @@ -120,8 +120,10 @@ def run_ggml_inference(args): from nexa.gguf.nexa_inference_voice import NexaVoiceInference inference = NexaVoiceInference(model_path=model_path, local_path=local_path, **kwargs) elif run_type == "TTS": - from nexa.gguf.nexa_inference_tts import NexaTTSInference - inference = NexaTTSInference(model_path=model_path, local_path=local_path, **kwargs) + # # Temporarily disabled since version v0.0.9.3 + raise NotImplementedError("TTS model is not supported in CLI mode.") + # from nexa.gguf.nexa_inference_tts import NexaTTSInference + # inference = NexaTTSInference(model_path=model_path, local_path=local_path, **kwargs) elif run_type == "AudioLM": from nexa.gguf.nexa_inference_audio_lm import NexaAudioLMInference inference = NexaAudioLMInference(model_path=model_path, local_path=local_path, **kwargs) @@ -330,7 +332,7 @@ def _select_quantization_type(): except ValueError: print("Please enter a valid number.") -def _store_in_nexa_list(converted_path, model_type): +def _store_in_nexa_list(converted_path, model_type, input_name, output_ftype): """Helper function to store converted model in nexa list.""" import shutil from nexa.general import add_model_to_list @@ -344,7 +346,8 @@ def _store_in_nexa_list(converted_path, model_type): shutil.copy2(converted_path, nexa_list_path) # Add the new path to the model list - add_model_to_list(os.path.basename(converted_path), nexa_list_path, "gguf", model_type) + nexa_model_name = f"{input_name}:{output_ftype}" + add_model_to_list(nexa_model_name, nexa_list_path, "gguf", model_type) def _run_converted_model(converted_path, model_type): """Helper function to run the converted model.""" @@ -391,7 +394,7 @@ def run_convert(args): try: from nexa.gguf.converter.nexa_convert import convert_hf_to_quantized_gguf - converted_path = convert_hf_to_quantized_gguf( + converted_path, input_name, output_ftype = convert_hf_to_quantized_gguf( input_path, output_file=args.output_file, ftype=ftype, @@ -404,7 +407,7 @@ def run_convert(args): # Ask if user wants to store in nexa list store_choice = input("\nWould you like to store this model in nexa list so you can run it with `nexa run ` anywhere and anytime? (y/N): ").strip().lower() if store_choice == 'y': - _store_in_nexa_list(converted_path, model_type) + _store_in_nexa_list(converted_path, model_type, input_name, output_ftype) # Ask if user wants to run the model run_choice = input("\nWould you like to run the converted model? 
(y/N): ").strip().lower() @@ -414,7 +417,8 @@ def run_convert(args): print("Exiting without running the model.") print(f"\nConverted model stored at {converted_path}") - running_command = f"nexa run {converted_path.split('/')[-1]}"\ + nexa_model_name = f"{input_name}:{output_ftype}" + running_command = f"nexa run {nexa_model_name}"\ if store_choice == 'y' else f"nexa run {converted_path} -lp -mt {model_type}" print(f"\nYou can run the converted model with command: {running_command}") else: diff --git a/nexa/general.py b/nexa/general.py index 3190c49d..aa88e710 100644 --- a/nexa/general.py +++ b/nexa/general.py @@ -594,6 +594,15 @@ def is_model_exists(model_name): with open(NEXA_MODEL_LIST_PATH, "r") as f: model_list = json.load(f) + + # For AudioLM and Multimodal models, should check the file location instead of model name + if ":" in model_name: + model_path_with_slash = model_name.replace(":", "/") + + # Check if model_prefix/model_suffix exists in any location path + for model_key, model_info in model_list.items(): + if model_path_with_slash in model_info["location"]: + return model_key return model_name in model_list @@ -606,6 +615,13 @@ def add_model_to_list(model_name, model_location, model_type, run_type): model_list = json.load(f) else: model_list = {} + + # For AudioLM and Multimodal models, should remove the "model-" prefix from the tag name + if run_type == "AudioLM" or run_type == "Multimodal": + tag_name = model_name.split(":")[1] + if tag_name.startswith("model-"): + tag_name = tag_name[6:] + model_name = f"{model_name.split(':')[0]}:{tag_name}" model_list[model_name] = { "type": model_type, @@ -624,11 +640,21 @@ def get_model_info(model_name): with open(NEXA_MODEL_LIST_PATH, "r") as f: model_list = json.load(f) + # First try direct lookup model_data = model_list.get(model_name, {}) - location = model_data.get("location") - run_type = model_data.get("run_type") + if model_data: + return model_data.get("location"), model_data.get("run_type") + + # If not found and model_name contains ":", try path-based lookup + if ":" in model_name: + model_path_with_slash = model_name.replace(":", "/") + + # Check if model_prefix/model_suffix exists in any location path + for model_key, model_info in model_list.items(): + if model_path_with_slash in model_info["location"]: + return model_info["location"], model_info["run_type"] - return location, run_type + return None, None def list_models(): @@ -642,7 +668,7 @@ def list_models(): filtered_list = { model_name: model_info for model_name, model_info in model_list.items() - if not model_name.split(':')[1].startswith('projector') + if ':' not in model_name or not model_name.split(':')[1].startswith('projector') } table = [ diff --git a/nexa/gguf/__init__.py b/nexa/gguf/__init__.py index 9ba70de9..d2afff06 100644 --- a/nexa/gguf/__init__.py +++ b/nexa/gguf/__init__.py @@ -2,13 +2,15 @@ from .nexa_inference_text import NexaTextInference from .nexa_inference_vlm import NexaVLMInference from .nexa_inference_voice import NexaVoiceInference -from .nexa_inference_tts import NexaTTSInference + +# Temporarily disabled since version v0.0.9.3 +# from .nexa_inference_tts import NexaTTSInference __all__ = [ "NexaImageInference", "NexaTextInference", "NexaVLMInference", "NexaVoiceInference", - "NexaTTSInference", + #"NexaTTSInference", "NexaAudioLMInference" ] \ No newline at end of file diff --git a/nexa/gguf/converter/nexa_convert.py b/nexa/gguf/converter/nexa_convert.py index 5e13c16a..7c24771b 100644 --- a/nexa/gguf/converter/nexa_convert.py +++ 
b/nexa/gguf/converter/nexa_convert.py @@ -110,7 +110,7 @@ def convert_hf_to_quantized_gguf( ftype: str = "q4_0", convert_type: str = "f16", **kwargs -) -> Optional[str]: +) -> Optional[tuple[str, str, str]]: """ Convert a model in safetensors format to a quantized GGUF file. @@ -118,14 +118,14 @@ def convert_hf_to_quantized_gguf( It can process both directories containing .safetensors files and existing .gguf files. Args: - input_path (str): Path to the input Hugging Face model directory or GGUF file. + input_path (str): Path in the local file system to the input Hugging Face model directory or GGUF file. output_file (str, optional): Path to the output quantized GGUF file. If None, a default path will be used. ftype (str, optional): Quantization type (default: "q4_0"). convert_type (str, optional): Conversion type for safetensors to GGUF (default: "f16"). **kwargs: Additional keyword arguments for the conversion and quantization process. Returns: - Optional[str]: Path to the output quantized GGUF file if successful, None otherwise. + Optional[tuple[str, str, str]]: Tuple of (output_file_path, input_name, ftype) if successful, None otherwise. Raises: FileNotFoundError: If the input directory or file does not exist. @@ -139,11 +139,13 @@ def convert_hf_to_quantized_gguf( # Convert input path to absolute path input_path = os.path.abspath(input_path) + # Get input name early + input_name = os.path.basename(input_path) + if input_path.endswith('.gguf'): + input_name = os.path.splitext(input_name)[0] # Remove .gguf extension + # Set default output file if not provided if not output_file: - input_name = os.path.basename(input_path) - if input_path.endswith('.gguf'): - input_name = os.path.splitext(input_name)[0] # Remove .gguf extension output_file = os.path.abspath(f"./{input_name}-{ftype}.gguf") else: output_file = os.path.abspath(output_file) @@ -168,7 +170,7 @@ def convert_hf_to_quantized_gguf( # Quantize GGUF model quantize_model(str(tmp_file_path.absolute()), output_file, ftype, **kwargs) - return output_file + return output_file, input_name, ftype finally: # Delete the temporary file if tmp_file_path.exists(): @@ -179,7 +181,7 @@ def convert_hf_to_quantized_gguf( elif input_path.endswith('.gguf'): # Directly call quantize_model with input_path quantize_model(input_file=input_path, output_file=output_file, ftype=ftype, **kwargs) - return output_file + return output_file, input_name, ftype else: logger.error(f"Invalid input path: {input_path}. Must be a directory with .safetensors files or a .gguf file.") return None diff --git a/nexa/gguf/llama/kv_cache.py b/nexa/gguf/llama/kv_cache.py new file mode 100644 index 00000000..aa01630d --- /dev/null +++ b/nexa/gguf/llama/kv_cache.py @@ -0,0 +1,86 @@ +from nexa.gguf.llama.llama_cache import LlamaDiskCache +from typing import Any, Dict + +def run_inference_with_disk_cache( + model: Any, + cache_prompt: str, + total_prompt: str, + use_cache: bool = True, + cache_dir: str = "llama.cache", + **kwargs: Dict[str, Any] +) -> Any: + """ + Runs inference using a disk cache to store and retrieve model states. + + Parameters: + - model: The model object that supports caching and inference. + - cache_prompt: The prompt used to generate a cache key. + - total_prompt: The full prompt for generating output. + - use_cache: Flag to determine if caching should be used. + - cache_dir: Directory where cache files are stored. + - kwargs: Additional parameters for model inference. + + Returns: + - The output generated by the model. 
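+
+    Example (illustrative sketch only; assumes `llama_model` is an already-loaded
+    Llama-style model object from this package and that the prompt variables are
+    plain strings):
+
+        output = run_inference_with_disk_cache(
+            model=llama_model,
+            cache_prompt=system_prompt,
+            total_prompt=system_prompt + user_prompt,
+            use_cache=True,
+            cache_dir="llama.cache",
+            temperature=0.7,
+            max_tokens=512,
+        )
+        # Streaming output: each chunk is a completion-style dict
+        for chunk in output:
+            print(chunk["choices"][0]["text"], end="", flush=True)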
+ """ + temperature = kwargs.get('temperature', 0.7) + max_tokens = kwargs.get('max_tokens', 2048) + top_p = kwargs.get('top_p', 0.8) + top_k = kwargs.get('top_k', 50) + repeat_penalty = kwargs.get('repeat_penalty', 1.0) + + if use_cache: + # Initialize disk cache with specified directory + cache_context = LlamaDiskCache(cache_dir=cache_dir) + model.set_cache(cache_context) + # Convert prompt to tokens for cache key + prompt_tokens = model.tokenize(cache_prompt.encode("utf-8")) + + try: + # Try to load existing cache + cached_state = cache_context[prompt_tokens] + model.load_state(cached_state) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + stream=True, + ) + except KeyError: + # If cache doesn't exist, create it + model.reset() + # Run initial inference to populate cache + _ = model( + cache_prompt, + max_tokens=1, # Minimal tokens for cache creation + temperature=temperature, + echo=False, + ) + # Save the state to cache + cache_context[prompt_tokens] = model.save_state() + + # Generate output after creating cache + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + else: + model.reset() + model.set_cache(None) + + output = model( + total_prompt, + max_tokens=max_tokens, + temperature=temperature, + top_p=top_p, + top_k=top_k, + repeat_penalty=repeat_penalty, + stream=True, + ) + return output \ No newline at end of file diff --git a/nexa/gguf/llama/llama.py b/nexa/gguf/llama/llama.py index d7b241e7..0007b515 100644 --- a/nexa/gguf/llama/llama.py +++ b/nexa/gguf/llama/llama.py @@ -31,6 +31,7 @@ from nexa.gguf.llama.llama_types import * from nexa.gguf.llama.llama_grammar import LlamaGrammar +from nexa.gguf.llama.llama_cache import BaseLlamaCache from nexa.gguf.llama.llama_tokenizer import BaseLlamaTokenizer, LlamaTokenizer import nexa.gguf.llama.llama_cpp as llama_cpp import nexa.gguf.llama.llama_chat_format as llama_chat_format @@ -350,6 +351,8 @@ def __init__( # Sampling Params self.last_n_tokens_size = last_n_tokens_size + self.cache: Optional[BaseLlamaCache] = None + self.lora_base = lora_base self.lora_scale = lora_scale self.lora_path = lora_path @@ -596,6 +599,14 @@ def detokenize( The detokenized string. """ return self.tokenizer_.detokenize(tokens, prev_tokens=prev_tokens, special=special) + + def set_cache(self, cache: Optional[BaseLlamaCache]): + """Set the cache. + + Args: + cache: The cache to set. + """ + self.cache = cache def set_seed(self, seed: int): """Set the random seed. 
@@ -1211,6 +1222,23 @@ def logit_bias_processor( raise ValueError( "logprobs is not supported for models created with logits_all=False" ) + + if self.cache: + try: + cache_item = self.cache[prompt_tokens] + cache_prefix_len = Llama.longest_token_prefix( + cache_item.input_ids.tolist(), prompt_tokens + ) + eval_prefix_len = Llama.longest_token_prefix( + self._input_ids.tolist(), prompt_tokens + ) + if cache_prefix_len > eval_prefix_len: + self.load_state(cache_item) + if self.verbose: + print("Llama._create_completion: cache hit", file=sys.stderr) + except KeyError: + if self.verbose: + print("Llama._create_completion: cache miss", file=sys.stderr) if seed is not None: self._ctx.set_rng_seed(seed) @@ -1552,8 +1580,19 @@ def logit_bias_processor( } ], } + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + if self.verbose: + print("Llama._create_completion: cache saved", file=sys.stderr) return + if self.cache: + if self.verbose: + print("Llama._create_completion: cache save", file=sys.stderr) + self.cache[prompt_tokens + completion_tokens] = self.save_state() + text_str = text.decode("utf-8", errors="ignore") if echo: diff --git a/nexa/transformers/README.md b/nexa/transformers/README.md deleted file mode 100644 index c539b454..00000000 --- a/nexa/transformers/README.md +++ /dev/null @@ -1,8 +0,0 @@ -# transformers support for Nexa AI models - -``` -python run_omnivision.py -``` - -## Acknowledgements -We thank the [Hugging Face Transformers](https://github.com/huggingface/transformers) for their amazing work on the Transformers library. diff --git a/nexa/transformers/__init__.py b/nexa/transformers/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nexa/transformers/omnivision/__init__.py b/nexa/transformers/omnivision/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/nexa/transformers/omnivision/configuration.py b/nexa/transformers/omnivision/configuration.py deleted file mode 100644 index d356a315..00000000 --- a/nexa/transformers/omnivision/configuration.py +++ /dev/null @@ -1,130 +0,0 @@ -# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -""" Qwen2 model configuration""" - -from transformers.configuration_utils import PretrainedConfig -from transformers.utils import logging -from typing import Union -from transformers import PretrainedConfig -import os -from transformers.models.auto import CONFIG_MAPPING - -logger = logging.get_logger(__name__) - - -class SigLipVisionConfig(PretrainedConfig): - model_type = "siglip_vision_model" - def __init__( - self, - hidden_size=1152, - image_mean=(0.5, 0.5, 0.5), - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - **kwargs, - ): - super().__init__(**kwargs) - self.hidden_size = hidden_size - self.intermediate_size = intermediate_size - self.num_hidden_layers = num_hidden_layers - self.num_attention_heads = num_attention_heads - self.num_channels = num_channels - self.patch_size = patch_size - self.image_size = image_size - self.attention_dropout = attention_dropout - self.layer_norm_eps = layer_norm_eps - self.hidden_act = hidden_act - self.image_mean = image_mean - - @classmethod - def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs) -> "PretrainedConfig": - cls._set_token_in_kwargs(kwargs) - - config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs) - - # get the vision config dict if we are loading from SigLipConfig - if config_dict.get("model_type") == "siglip": - config_dict = config_dict["vision_config"] - - if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type: - logger.warning( - f"You are using a model of type {config_dict['model_type']} to instantiate a model of type " - f"{cls.model_type}. This is not supported for all configurations of models and can yield errors." 
- ) - return cls.from_dict(config_dict, **kwargs) - - -""" Nexa AI model configuration""" -class OminiVLMConfig(PretrainedConfig): - model_type = "nano-omini-vlm" - - model_type = "omini_vlm" - keys_to_ignore_at_inference = ["past_key_values"] - - def __init__( - self, - vision_config=None, - text_config=None, - hidden_size=4096, - mm_hidden_size=1152, - mm_projector_lr=None, - mm_projector_type="mlp2x_gelu", - image_token_index=151655, - initializer_range=0.02, - **kwargs, - ): - self.hidden_size = hidden_size - self.mm_hidden_size = mm_hidden_size - self.mm_projector_lr = mm_projector_lr - self.mm_projector_type = mm_projector_type - self.image_token_index = image_token_index - self.initializer_range = initializer_range - if isinstance(vision_config, dict): - vision_config = SigLipVisionConfig(**vision_config) - elif vision_config is None: - vision_config = SigLipVisionConfig( - hidden_size=1152, - image_mean=(0.5, 0.5, 0.5), - intermediate_size=4304, - num_hidden_layers=27, - num_attention_heads=16, - num_channels=3, - image_size=384, - patch_size=14, - hidden_act="gelu_pytorch_tanh", - layer_norm_eps=1e-6, - attention_dropout=0.0, - ) - self.vision_config = vision_config - - if isinstance(text_config, dict): - text_config["model_type"] = ( - text_config["model_type"] if "model_type" in text_config else "qwen2" - ) - text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config) - elif text_config is None: - text_config = CONFIG_MAPPING["qwen2"]() - - self.text_config = text_config - - super().__init__(**kwargs) - \ No newline at end of file diff --git a/nexa/transformers/omnivision/modeling.py b/nexa/transformers/omnivision/modeling.py deleted file mode 100644 index 94bd67b1..00000000 --- a/nexa/transformers/omnivision/modeling.py +++ /dev/null @@ -1,709 +0,0 @@ -# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc. -# All rights reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import math -from dataclasses import dataclass -from typing import Any, Dict, List, Optional, Tuple, Union -from dataclasses import dataclass -from transformers.activations import ACT2FN -import torch.utils.checkpoint -from torch import nn -from transformers.modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling -from transformers.modeling_utils import PreTrainedModel -from transformers.utils import ModelOutput -from transformers import Qwen2ForCausalLM -from .configuration import SigLipVisionConfig, OminiVLMConfig - -# ======================================================================================== # -# vision tower # -# ======================================================================================== # -@dataclass -class SigLipVisionModelOutput(ModelOutput): - """ - Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states. 
- - Args: - image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`): - The image embeddings obtained by applying the projection layer to the pooler_output. - last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Sequence of hidden-states at the output of the last layer of the model. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - """ - image_embeds: Optional[torch.FloatTensor] = None - last_hidden_state: torch.FloatTensor = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - - -@dataclass -class OminiVLMOutputWithPast(ModelOutput): - """ - Base class for Gemma2Audio causal language model (or autoregressive) outputs. - - Args: - loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided): - Language modeling loss (for next-token prediction). - logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`): - Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax). - past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`): - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape - `(batch_size, num_heads, sequence_length, embed_size_per_head)`) - - Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see - `past_key_values` input) to speed up sequential decoding. - hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`): - Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, + - one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`. - - Hidden-states of the model at the output of each layer plus the optional initial embedding outputs. - attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`): - Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length, - sequence_length)`. - - Attentions weights after the attention softmax, used to compute the weighted average in the self-attention - heads. - attention_mask (`torch.FloatTensor`, *optional*): - Attentions mask, used to update attention mask and position_ids. 
- """ - loss: Optional[torch.FloatTensor] = None - logits: torch.FloatTensor = None - past_key_values: Optional[List[torch.FloatTensor]] = None - hidden_states: Optional[Tuple[torch.FloatTensor]] = None - attentions: Optional[Tuple[torch.FloatTensor]] = None - attention_mask: Optional[torch.FloatTensor] = None - - -class SigLipVisionEmbeddings(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.image_size = config.image_size - self.patch_size = config.patch_size - - self.patch_embedding = nn.Conv2d( - in_channels=config.num_channels, - out_channels=self.embed_dim, - kernel_size=self.patch_size, - stride=self.patch_size, - padding="valid", - ) - - self.num_patches = (self.image_size // self.patch_size) ** 2 - self.num_positions = self.num_patches - self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim) - self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False) - - def forward(self, pixel_values: torch.FloatTensor) -> torch.Tensor: - patch_embeds = self.patch_embedding(pixel_values) # shape = [*, width, grid, grid] - embeddings = patch_embeds.flatten(2).transpose(1, 2) - - embeddings = embeddings + self.position_embedding(self.position_ids) - return embeddings - - -class SigLipAttention(nn.Module): - """Multi-headed attention from 'Attention Is All You Need' paper""" - - def __init__(self, config): - super().__init__() - self.config = config - self.embed_dim = config.hidden_size - self.num_heads = config.num_attention_heads - self.head_dim = self.embed_dim // self.num_heads - if self.head_dim * self.num_heads != self.embed_dim: - raise ValueError( - f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:" - f" {self.num_heads})." 
- ) - self.scale = self.head_dim ** -0.5 - self.dropout = config.attention_dropout - - self.k_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.v_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.q_proj = nn.Linear(self.embed_dim, self.embed_dim) - self.out_proj = nn.Linear(self.embed_dim, self.embed_dim) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]: - """Input shape: Batch x Time x Channel""" - - batch_size, q_len, _ = hidden_states.size() - - query_states = self.q_proj(hidden_states) - key_states = self.k_proj(hidden_states) - value_states = self.v_proj(hidden_states) - - query_states = query_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - key_states = key_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - value_states = value_states.view(batch_size, q_len, self.num_heads, self.head_dim).transpose(1, 2) - - k_v_seq_len = key_states.shape[-2] - attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) * self.scale - - if attn_weights.size() != (batch_size, self.num_heads, q_len, k_v_seq_len): - raise ValueError( - f"Attention weights should be of size {(batch_size, self.num_heads, q_len, k_v_seq_len)}, but is" - f" {attn_weights.size()}" - ) - - if attention_mask is not None: - if attention_mask.size() != (batch_size, 1, q_len, k_v_seq_len): - raise ValueError( - f"Attention mask should be of size {(batch_size, 1, q_len, k_v_seq_len)}, but is {attention_mask.size()}" - ) - attn_weights = attn_weights + attention_mask - - # upcast attention to fp32 - attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype) - attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training) - attn_output = torch.matmul(attn_weights, value_states) - - if attn_output.size() != (batch_size, self.num_heads, q_len, self.head_dim): - raise ValueError( - f"`attn_output` should be of size {(batch_size, self.num_heads, q_len, self.head_dim)}, but is" - f" {attn_output.size()}" - ) - - attn_output = attn_output.transpose(1, 2).contiguous() - attn_output = attn_output.reshape(batch_size, q_len, self.embed_dim) - - attn_output = self.out_proj(attn_output) - - return attn_output, attn_weights - - -class SigLipMLP(nn.Module): - def __init__(self, config): - super().__init__() - self.config = config - self.activation_fn = ACT2FN[config.hidden_act] - self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size) - self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size) - - def forward(self, hidden_states: torch.Tensor) -> torch.Tensor: - hidden_states = self.fc1(hidden_states) - hidden_states = self.activation_fn(hidden_states) - hidden_states = self.fc2(hidden_states) - return hidden_states - - -class SigLipEncoderLayer(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.embed_dim = config.hidden_size - self.self_attn = SigLipAttention(config) - self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - self.mlp = SigLipMLP(config) - self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps) - - def forward( - self, - hidden_states: torch.Tensor, - attention_mask: torch.Tensor, - output_attentions: Optional[bool] = False, - ) -> Tuple[torch.FloatTensor]: - """ - Args: - hidden_states 
(`torch.FloatTensor`): - Input to the layer of shape `(batch, seq_len, embed_dim)`. - attention_mask (`torch.FloatTensor`): - Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values. - output_attentions (`bool`, *optional*, defaults to `False`): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - """ - residual = hidden_states - - hidden_states = self.layer_norm1(hidden_states) - hidden_states, attn_weights = self.self_attn( - hidden_states=hidden_states, - attention_mask=attention_mask, - output_attentions=output_attentions, - ) - hidden_states = residual + hidden_states - - residual = hidden_states - hidden_states = self.layer_norm2(hidden_states) - hidden_states = self.mlp(hidden_states) - hidden_states = residual + hidden_states - - outputs = (hidden_states,) - - if output_attentions: - outputs += (attn_weights,) - - return outputs - - -class SigLipPreTrainedModel(PreTrainedModel): - """ - An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained - models. - """ - - config_class = SigLipVisionConfig - base_model_prefix = "siglip" - supports_gradient_checkpointing = True - - def _init_weights(self, module): - """Initialize the weights""" - pass - - -class SigLipEncoder(nn.Module): - """ - Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a - [`SigLipEncoderLayer`]. - - Args: - config: SigLipVisionConfig - """ - - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - self.layers = nn.ModuleList([SigLipEncoderLayer(config) for _ in range(config.num_hidden_layers)]) - self.gradient_checkpointing = False - - def forward( - self, - inputs_embeds, - attention_mask: Optional[torch.Tensor] = None, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutput]: - r""" - Args: - inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`): - Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. - This is useful if you want more control over how to convert `input_ids` indices into associated vectors - than the model's internal embedding lookup matrix. - attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*): - Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`: - - - 1 for tokens that are **not masked**, - - 0 for tokens that are **masked**. - - [What are attention masks?](../glossary#attention-mask) - output_attentions (`bool`, *optional*): - Whether or not to return the attentions tensors of all attention layers. See `attentions` under - returned tensors for more detail. - output_hidden_states (`bool`, *optional*): - Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors - for more detail. - return_dict (`bool`, *optional*): - Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. 
- """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - encoder_states = () if output_hidden_states else None - all_attentions = () if output_attentions else None - - hidden_states = inputs_embeds - for encoder_layer in self.layers: - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - if self.gradient_checkpointing and self.training: - layer_outputs = self._gradient_checkpointing_func( - encoder_layer.__call__, - hidden_states, - attention_mask, - output_attentions, - ) - else: - layer_outputs = encoder_layer( - hidden_states, - attention_mask, - output_attentions=output_attentions, - ) - - hidden_states = layer_outputs[0] - - if output_attentions: - all_attentions = all_attentions + (layer_outputs[1],) - - if output_hidden_states: - encoder_states = encoder_states + (hidden_states,) - - if not return_dict: - return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None) - return BaseModelOutput( - last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions - ) - - -class SigLipMultiheadAttentionPoolingHead(nn.Module): - """Multihead Attention Pooling.""" - - def __init__(self, config: SigLipVisionConfig): - super().__init__() - - self.probe = nn.Parameter(torch.randn(1, 1, config.hidden_size)) - self.attention = torch.nn.MultiheadAttention(config.hidden_size, config.num_attention_heads, batch_first=True) - self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) - self.mlp = SigLipMLP(config) - - def forward(self, hidden_state): - batch_size = hidden_state.shape[0] - probe = self.probe.repeat(batch_size, 1, 1) - - hidden_state = self.attention(probe, hidden_state, hidden_state)[0] - - residual = hidden_state - hidden_state = self.layernorm(hidden_state) - hidden_state = residual + self.mlp(hidden_state) - - return hidden_state[:, 0] - - -class SigLipVisionTransformer(nn.Module): - def __init__(self, config: SigLipVisionConfig): - super().__init__() - self.config = config - embed_dim = config.hidden_size - - self.embeddings = SigLipVisionEmbeddings(config) - self.encoder = SigLipEncoder(config) - self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps) - self.head = SigLipMultiheadAttentionPoolingHead(config) - - def get_dtype(self) -> torch.dtype: - return self.encoder.layers[0].mlp.fc2.weight.dtype - - def forward( - self, - pixel_values, - output_attentions: Optional[bool] = None, - output_hidden_states: Optional[bool] = None, - return_dict: Optional[bool] = None, - ) -> Union[Tuple, BaseModelOutputWithPooling]: - r""" - Returns: - - """ - output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions - output_hidden_states = ( - output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states - ) - return_dict = return_dict if return_dict is not None else self.config.use_return_dict - - hidden_states = self.embeddings(pixel_values) - - encoder_outputs = self.encoder( - inputs_embeds=hidden_states, - output_attentions=output_attentions, - output_hidden_states=output_hidden_states, - return_dict=return_dict, - ) - - last_hidden_state = encoder_outputs[0] - last_hidden_state = self.post_layernorm(last_hidden_state) - - 
        pooled_output = self.head(last_hidden_state)
-
-        if not return_dict:
-            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
-
-        return BaseModelOutputWithPooling(
-            last_hidden_state=last_hidden_state,
-            pooler_output=pooled_output,
-            hidden_states=encoder_outputs.hidden_states,
-            attentions=encoder_outputs.attentions,
-        )
-
-
-class SigLipVisionModel(SigLipPreTrainedModel):
-    config_class = SigLipVisionConfig
-    main_input_name = "pixel_values"
-    _no_split_modules = ["SigLipEncoderLayer"]
-
-    def __init__(self, config: SigLipVisionConfig):
-        super().__init__(config)
-        self.vision_model = SigLipVisionTransformer(config)
-
-        # Initialize weights and apply final processing
-        self.post_init()
-
-    def get_input_embeddings(self) -> nn.Module:
-        return self.vision_model.embeddings.patch_embedding
-
-    def forward(
-        self,
-        pixel_values,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-    ) -> Union[Tuple, BaseModelOutputWithPooling]:
-        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
-
-        return self.vision_model(
-            pixel_values=pixel_values,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-
-# ======================================================================================== #
-#                                        Projector                                         #
-# ======================================================================================== #
-
-import re
-def build_vision_projector(config, delay_load=False, **kwargs):
-    projector_type = getattr(config, 'mm_projector_type', 'mlp2x_gelu')
-    mlp_gelu_match = re.match(r'^mlp(\d+)x_gelu$', projector_type)
-    if mlp_gelu_match:
-        mlp_depth = int(mlp_gelu_match.group(1))
-        modules = [nn.Linear(config.mm_hidden_size*9, config.hidden_size)]
-        for _ in range(1, mlp_depth):
-            modules.append(nn.GELU())
-            modules.append(nn.Linear(config.hidden_size, config.text_config.hidden_size))
-        return nn.Sequential(*modules)
-
-
-# ======================================================================================== #
-#                                            LLM                                           #
-# ======================================================================================== #
-class OminiVLMPreTrainedModel(PreTrainedModel):
-    config_class = OminiVLMConfig
-    base_model_prefix = "model"
-    supports_gradient_checkpointing = True
-    _no_split_modules = ["Qwen2DecoderLayer", "SigLipEncoderLayer"]
-    _skip_keys_device_placement = "past_key_values"
-    _supports_flash_attn_2 = True
-    _supports_sdpa = True
-    _supports_cache_class = True
-    _supports_static_cache = True
-
-    def _init_weights(self, module):
-        std = self.config.initializer_range
-        if isinstance(module, (nn.Linear, nn.Conv3d)):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.bias is not None:
-                module.bias.data.zero_()
-        elif isinstance(module, nn.Embedding):
-            module.weight.data.normal_(mean=0.0, std=std)
-            if module.padding_idx is not None:
-                module.weight.data[module.padding_idx].zero_()
-
-
-class OminiVLMForConditionalGeneration(OminiVLMPreTrainedModel):
-    def __init__(self, config: OminiVLMConfig):
-        super().__init__(config)
-        if isinstance(config.vision_config, dict):
-            vision_config = SigLipVisionConfig(**config.vision_config)
-        else:
-            vision_config = config.vision_config
-        self.vision_tower = SigLipVisionModel(vision_config)
-        self.multi_modal_projector = build_vision_projector(config)
-        self.vocab_size = config.text_config.vocab_size
-        self.language_model = Qwen2ForCausalLM(
-            config.text_config,
-        )
-        self.pad_token_id = (
-            self.config.pad_token_id if self.config.pad_token_id is not None else -1
-        )
-        self._padding_side = "right"  # set it to left by default, user can use setter to change padding_sides
-        self.post_init()
-
-    @property
-    def padding_side(self):
-        return self._padding_side
-
-    @padding_side.setter
-    def padding_side(self, padding_side: str):
-        if padding_side not in ["left", "right"]:
-            raise ValueError(f"{padding_side} is not `left` or `right`.")
-        self._padding_side = padding_side
-
-    def get_input_embeddings(self):
-        return self.language_model.get_input_embeddings()
-
-    def set_input_embeddings(self, value):
-        self.language_model.set_input_embeddings(value)
-
-    def get_output_embeddings(self):
-        return self.language_model.get_output_embeddings()
-
-    def set_output_embeddings(self, new_embeddings):
-        self.language_model.set_output_embeddings(new_embeddings)
-
-    def set_decoder(self, decoder):
-        self.language_model.set_decoder(decoder)
-
-    def get_decoder(self):
-        return self.language_model.get_decoder()
-
-    def tie_weights(self):
-        return self.language_model.tie_weights()
-
-    def resize_token_embeddings(
-        self, new_num_tokens: Optional[int] = None, pad_to_multiple_of=None
-    ) -> nn.Embedding:
-        model_embeds = self.language_model.resize_token_embeddings(
-            new_num_tokens, pad_to_multiple_of
-        )
-        # update vocab size
-        self.config.text_config.vocab_size = model_embeds.num_embeddings
-        self.vocab_size = model_embeds.num_embeddings
-        return model_embeds
-
-    def _update_model_kwargs_for_generation(
-        self,
-        outputs: ModelOutput,
-        model_kwargs: Dict[str, Any],
-        is_encoder_decoder: bool = False,
-        num_new_tokens: int = 1,
-    ) -> Dict[str, Any]:
-        model_kwargs = super()._update_model_kwargs_for_generation(
-            outputs=outputs,
-            model_kwargs=model_kwargs,
-            is_encoder_decoder=is_encoder_decoder,
-            num_new_tokens=num_new_tokens,
-        )
-        return model_kwargs
-
-    def forward(
-        self,
-        input_ids: torch.LongTensor = None,
-        attention_mask: Optional[torch.Tensor] = None,
-        position_ids: Optional[torch.LongTensor] = None,
-        past_key_values: Optional[List[torch.FloatTensor]] = None,
-        inputs_embeds: Optional[torch.FloatTensor] = None,
-        labels: Optional[torch.LongTensor] = None,
-        use_cache: Optional[bool] = None,
-        output_attentions: Optional[bool] = None,
-        output_hidden_states: Optional[bool] = None,
-        return_dict: Optional[bool] = None,
-        pixel_values: Optional[torch.Tensor] = None,
-    ) -> Union[Tuple, OminiVLMOutputWithPast]:
-        r"""
-        Args:
-            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
-                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
-                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
-                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
-        Returns:
-        ```"""
-        output_attentions = (
-            output_attentions
-            if output_attentions is not None
-            else self.config.output_attentions
-        )
-        output_hidden_states = (
-            output_hidden_states
-            if output_hidden_states is not None
-            else self.config.output_hidden_states
-        )
-        return_dict = (
-            return_dict if return_dict is not None else self.config.use_return_dict
-        )
-
-        target_device = self.vision_tower.device
-
-        if pixel_values is not None:
-            pixel_values = pixel_values.to(target_device)
-
-        if inputs_embeds is None:
-            # 1. Extract the input embeddings
-            inputs_embeds = self.get_input_embeddings()(input_ids)
-
-            # 2. Merge text and vision features
-            if pixel_values is not None:
-                pixel_values = pixel_values.type(self.vision_tower.vision_model.get_dtype())
-                image_embeds = self.vision_tower(pixel_values).last_hidden_state.to(pixel_values.dtype)
-                image_embeds = image_embeds.view(image_embeds.shape[0], 81, -1)
-                image_embeds = self.multi_modal_projector(image_embeds)
-                image_mask = (
-                    (input_ids == self.config.image_token_index)
-                    .unsqueeze(-1)
-                    .expand_as(inputs_embeds)
-                    .to(inputs_embeds.device)
-                )
-                image_embeds = image_embeds.to(inputs_embeds.device, inputs_embeds.dtype)
-
-                ## This is to intelligently replace the image tokens with the image features
-                inputs_embeds = inputs_embeds.masked_scatter(image_mask, image_embeds)
-
-            if attention_mask is not None:
-                attention_mask = attention_mask.to(inputs_embeds.device)
-
-        outputs = self.language_model(
-            attention_mask=attention_mask,
-            position_ids=position_ids,
-            past_key_values=past_key_values,
-            inputs_embeds=inputs_embeds,
-            use_cache=use_cache,
-            output_attentions=output_attentions,
-            output_hidden_states=output_hidden_states,
-            return_dict=return_dict,
-        )
-
-        logits = outputs[0]
-
-        loss = None
-        if labels is not None:
-            # Shift so that tokens < n predict n
-            if attention_mask is not None:
-                shift_attention_mask = attention_mask[..., 1:]
-                shift_logits = logits[..., :-1, :][
-                    shift_attention_mask.to(logits.device) != 0
-                ].contiguous()
-                shift_labels = labels[..., 1:][
-                    shift_attention_mask.to(labels.device) != 0
-                ].contiguous()
-            else:
-                shift_logits = logits[..., :-1, :].contiguous()
-                shift_labels = labels[..., 1:].contiguous()
-            # Flatten the tokens
-            loss_fct = nn.CrossEntropyLoss()
-            loss = loss_fct(
-                shift_logits.view(-1, shift_logits.size(-1)),
-                shift_labels.view(-1).to(shift_logits.device),
-            )
-
-        if not return_dict:
-            output = (logits,) + outputs[1:]
-            return (loss,) + output if loss is not None else output
-
-        return OminiVLMOutputWithPast(
-            loss=loss,
-            logits=logits,
-            past_key_values=outputs.past_key_values,
-            hidden_states=outputs.hidden_states,
-            attentions=outputs.attentions,
-            attention_mask=attention_mask,
-        )
\ No newline at end of file
diff --git a/nexa/transformers/omnivision/processing.py b/nexa/transformers/omnivision/processing.py
deleted file mode 100644
index 2bc3f008..00000000
--- a/nexa/transformers/omnivision/processing.py
+++ /dev/null
@@ -1,201 +0,0 @@
-# Copyright (c) 2024 Nexa AI Inc., Alibaba Group (Qwen team), and HuggingFace Inc.
-# All rights reserved.
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from typing import List, Union
-
-try:
-    from typing import Unpack
-except ImportError:
-    from typing_extensions import Unpack
-
-from transformers.feature_extraction_utils import BatchFeature
-from transformers.image_utils import ImageInput, VideoInput
-from transformers.processing_utils import (
-    ProcessingKwargs,
-    ProcessorMixin,
-)
-from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
-from transformers.utils import logging
-
-
-logger = logging.get_logger(__name__)
-NUM_IMAGE_TOKENS = 81
-
-class NanoVLMProcessorKwargs(ProcessingKwargs, total=False):
-    _defaults = {
-        "text_kwargs": {
-            "padding": False,
-        },
-    }
-
-
-class NanoVLMProcessor(ProcessorMixin):
-    attributes = ["image_processor", "tokenizer"]
-    valid_kwargs = ["chat_template"]
-    image_processor_class = "SiglipImageProcessor"
-    tokenizer_class = ("Qwen2Tokenizer", "Qwen2TokenizerFast")
-
-    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
-        if chat_template is None:
-            chat_template = self.default_chat_template
-        super().__init__(image_processor, tokenizer, chat_template=chat_template)
-
-    def __call__(
-        self,
-        images: ImageInput = None,
-        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
-        **kwargs: Unpack[NanoVLMProcessorKwargs],
-    ) -> BatchFeature:
-        """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to Gemma2TokenizerFast's [`~Gemma2TokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the vision inputs, this method forwards the `vision_infos` and `kwrags` arguments to
-        Gemma2VLImageProcessor's [`~Gemma2VLImageProcessor.__call__`] if `vision_infos` is not `None`.
-
-        Args:
-            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
-                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
-                tensor. Both channels-first and channels-last formats are supported.
-            text (`str`, `List[str]`, `List[List[str]]`):
-                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
-                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
-                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
-            return_tensors (`str` or [`~utils.TensorType`], *optional*):
-                If set, will return tensors of a particular framework. Acceptable values are:
-                - `'tf'`: Return TensorFlow `tf.constant` objects.
-                - `'pt'`: Return PyTorch `torch.Tensor` objects.
-                - `'np'`: Return NumPy `np.ndarray` objects.
-                - `'jax'`: Return JAX `jnp.ndarray` objects.
-
-        Returns:
-            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
-
-            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
-            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
-              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
-              `None`).
-            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
- """ - output_kwargs = self._merge_kwargs( - NanoVLMProcessorKwargs, - tokenizer_init_kwargs=self.tokenizer.init_kwargs, - **kwargs, - ) - - # check the number of images is equal to the number of all image_pad tokens - assert len(images) == sum([t.count("<|image_pad|>") for t in text]), "The number of images must be equal to the number of all image_pad tokens in the text." - - if images is not None: - image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"]) - else: - image_inputs = {} - - if not isinstance(text, list): - text = [text] - - if image_inputs is not None: - index = 0 - for i in range(len(text)): - while "<|image_pad|>" in text[i]: - text[i] = text[i].replace( - "<|image_pad|>", "<|placeholder|>" * NUM_IMAGE_TOKENS, 1 - ) - index += 1 - text[i] = text[i].replace("<|placeholder|>", "<|image_pad|>") - - _ = output_kwargs["text_kwargs"].pop("padding_side", None) - text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - return BatchFeature(data={**text_inputs, **image_inputs}) - - def batch_decode(self, *args, **kwargs): - """ - This method forwards all its arguments to Gemma2TokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please - refer to the docstring of this method for more information. - """ - return self.tokenizer.batch_decode(*args, **kwargs) - - def decode(self, *args, **kwargs): - """ - This method forwards all its arguments to Gemma2TokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to - the docstring of this method for more information. - """ - return self.tokenizer.decode(*args, **kwargs) - - @property - def model_input_names(self): - tokenizer_input_names = self.tokenizer.model_input_names - image_processor_input_names = self.image_processor.model_input_names - return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names)) - - - @property - def default_chat_template(self): - return ( - "{%- if tools %}" - "{{- '<|im_start|>system\n' }}" - "{%- if messages[0]['role'] == 'system' %}" - "{{- messages[0]['content'] }}" - "{%- else %}" - "{{- 'You are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.' }}" - "{%- endif %}" - "{{- \"\n\n# Tools\n\nYou may call one or more functions to assist with the user query.\n\nYou are provided with function signatures within XML tags:\n\" }}" - "{%- for tool in tools %}" - "{{- \"\n\" }}" - "{{- tool | tojson }}" - "{%- endfor %}" - "{{- \"\n\n\nFor each function call, return a json object with function name and arguments within XML tags:\n\n{\\\"name\\\": , \\\"arguments\\\": }\n<|im_end|>\n\" }}" - "{%- else %}" - "{%- if messages[0]['role'] == 'system' %}" - "{{- '<|im_start|>system\n' + messages[0]['content'] + '<|im_end|>\n' }}" - "{%- else %}" - "{{- '<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- for message in messages %}" - "{%- if (message.role == \"user\") or (message.role == \"system\" and not loop.first) or (message.role == \"assistant\" and not message.tool_calls) %}" - "{{- '<|im_start|>' + message.role + '\n' + message.content + '<|im_end|>' + '\n' }}" - "{%- elif message.role == \"assistant\" %}" - "{{- '<|im_start|>' + message.role }}" - "{%- if message.content %}" - "{{- '\n' + message.content }}" - "{%- endif %}" - "{%- for tool_call in message.tool_calls %}" - "{%- if tool_call.function is defined %}" - "{%- set tool_call = tool_call.function %}" - "{%- endif %}" - "{{- '\n\n{\"name\": \"' }}" - "{{- tool_call.name }}" - "{{- '\", \"arguments\": ' }}" - "{{- tool_call.arguments | tojson }}" - "{{- '}\n' }}" - "{%- endfor %}" - "{{- '<|im_end|>\n' }}" - "{%- elif message.role == \"tool\" %}" - "{%- if (loop.index0 == 0) or (messages[loop.index0 - 1].role != \"tool\") %}" - "{{- '<|im_start|>user' }}" - "{%- endif %}" - "{{- '\n\n' }}" - "{{- message.content }}" - "{{- '\n' }}" - "{%- if loop.last or (messages[loop.index0 + 1].role != \"tool\") %}" - "{{- '<|im_end|>\n' }}" - "{%- endif %}" - "{%- endif %}" - "{%- endfor %}" - "{%- if add_generation_prompt %}" - "{{- '<|im_start|>assistant\n' }}" - "{%- endif %}" - ) \ No newline at end of file diff --git a/nexa/transformers/run_omnivision.py b/nexa/transformers/run_omnivision.py deleted file mode 100644 index f81d1efe..00000000 --- a/nexa/transformers/run_omnivision.py +++ /dev/null @@ -1,92 +0,0 @@ -from nexa.transformers.omnivision.processing import NanoVLMProcessor -from nexa.transformers.omnivision.modeling import OminiVLMForConditionalGeneration -import argparse -import torch - - -model_name = "NexaAIDev/omnivlm-dpo" -image_url = "https://public-storage.nexa4ai.com/public-images/cat.png" - - -def get_device(): - if torch.cuda.is_available(): - return "cuda" - elif torch.backends.mps.is_available(): - return "mps" - return "cpu" - - -def load_model_and_processor(model_path): - device = get_device() - proc_path = "nexa-collaboration/nano-vlm-processor" - processor = NanoVLMProcessor.from_pretrained(proc_path) - processor.tokenizer.pad_token = processor.tokenizer.eos_token - processor.tokenizer.padding_side = "right" - - model_kwargs = {} - # Adjust dtype based on device - dtype = torch.bfloat16 if device == "cuda" else torch.float32 - local_model = OminiVLMForConditionalGeneration.from_pretrained( - model_path, - torch_dtype=dtype, - **model_kwargs - ) - local_model = local_model.to(device) - return local_model, processor - - -def process_single_image(processor, image_path, input_prompt=None): - text = f"<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. 
You are a helpful assistant.<|im_end|>\n<|im_start|>user\n{input_prompt}\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>" - # Changed from Image.open() to handle URLs - if image_path.startswith('http'): - from PIL import Image - import requests - from io import BytesIO - response = requests.get(image_path) - image = Image.open(BytesIO(response.content)).convert('RGB') - else: - image = Image.open(image_path).convert('RGB') - inputs = processor( - text=[text], - images=[image], - padding=True, - return_tensors="pt", - ) - return inputs.to(get_device()) - - -def generate_output(model, processor, inputs, max_tokens): - cur_ids = inputs['input_ids'] - cur_attention_mask = inputs['attention_mask'] - input_token_length = cur_ids.shape[-1] - for _ in range(max_tokens): - out = model( - cur_ids, - attention_mask=cur_attention_mask, - pixel_values=inputs['pixel_values'], - use_cache=False - ) - next_token = out.logits[:, -1].argmax() - next_word = processor.decode(next_token) - cur_ids = torch.cat([cur_ids, next_token.unsqueeze(0).unsqueeze(0)], dim=-1) - cur_attention_mask = torch.cat([cur_attention_mask, torch.ones_like(next_token).unsqueeze(0).unsqueeze(0)], dim=-1) - if next_word in ("<|im_end|>"): - break - return processor.batch_decode(cur_ids[:, input_token_length:])[0] - -def main(args): - model, processor = load_model_and_processor(args.model_path) - inputs = process_single_image(processor, args.image_path, args.input_prompt) - output = generate_output(model, processor, inputs, args.max_tokens) - print("=== Inference Result ===\n", output) - -if __name__ == "__main__": - parser = argparse.ArgumentParser(description="Inference script for Nano-Omni-VLM") - parser.add_argument("--model_path", default=model_name, help="Path to the model checkpoint") - # Add image_path argument - parser.add_argument("--image_path", default=image_url, help="Path to input image or image URL") - parser.add_argument("--input_prompt", type=str, default="Describe this image for me", help="Input prompt for instruct task") - parser.add_argument("--max_tokens", type=int, default=512, help="Maximum number of tokens to generate") - - args = parser.parse_args() - main(args) \ No newline at end of file diff --git a/pyproject.toml b/pyproject.toml index 7e9c6478..5e3b1b1a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -80,12 +80,6 @@ convert = [ "nexa-gguf", ] -transformers = [ - "transformers", - "torch", - "pillow" -] - [project.urls] Homepage = "https://github.com/NexaAI/nexa-sdk" Issues = "https://github.com/NexaAI/nexa-sdk/issues" diff --git a/tests/test_tts_generation.py b/tests/test_tts_generation.py index 5d55ed4d..2dc9c526 100644 --- a/tests/test_tts_generation.py +++ b/tests/test_tts_generation.py @@ -1,22 +1,24 @@ -from nexa.gguf import NexaTTSInference +# Temporarily disabled since version v0.0.9.3 -def test_tts_generation(): - tts = NexaTTSInference( - model_path="bark-small", - local_path=None, - n_threads=4, - seed=42, - sampling_rate=24000, - verbosity=2 - ) +# from nexa.gguf import NexaTTSInference + +# def test_tts_generation(): +# tts = NexaTTSInference( +# model_path="bark-small", +# local_path=None, +# n_threads=4, +# seed=42, +# sampling_rate=24000, +# verbosity=2 +# ) - # Generate audio from prompt - prompt = "Hello, this is a test of the Bark text to speech system." - audio_data = tts.audio_generation(prompt) +# # Generate audio from prompt +# prompt = "Hello, this is a test of the Bark text to speech system." 
+# audio_data = tts.audio_generation(prompt) - # Save the generated audio - tts._save_audio(audio_data, tts.sampling_rate, "tts_output") - print("TTS generation test completed successfully!") +# # Save the generated audio +# tts._save_audio(audio_data, tts.sampling_rate, "tts_output") +# print("TTS generation test completed successfully!") -if __name__ == "__main__": - test_tts_generation() \ No newline at end of file +# if __name__ == "__main__": +# test_tts_generation() \ No newline at end of file