Merge pull request #303 from NexaAI/qi-sdkk
omnivision → omniVLM
zhiyuan8 authored Dec 6, 2024
2 parents 26f5a5a + 608c928 commit 3eb45b7
Showing 5 changed files with 17 additions and 11 deletions.
4 changes: 2 additions & 2 deletions README.md
@@ -21,7 +21,7 @@

## Latest News 🔥

- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omnivision` and audio language model (2.9B parameters): `nexa run omniaudio`
- Support Nexa AI's own vision language model (0.9B parameters): `nexa run omniVLM` and audio language model (2.9B parameters): `nexa run omniaudio`
- Support audio language model: `nexa run qwen2audio`, **we are the first open-source toolkit to support audio language model with GGML tensor library.**
- Support iOS Swift binding for local inference on **iOS mobile** devices.
- Support embedding model: `nexa embed <model_path> <prompt>`
@@ -228,7 +228,7 @@ Supported model examples (full list at [Model Hub](https://nexa.ai/models)):
| [qwen2audio](https://nexa.ai/Qwen/Qwen2-Audio-7.8B-Instruct/gguf-q4_K_M/readme) | AudioLM | GGUF | `nexa run qwen2audio` |
| [octopus-v2](https://www.nexaai.com/NexaAI/Octopus-v2/gguf-q4_0/readme) | Function Call | GGUF | `nexa run octopus-v2` |
| [octo-net](https://www.nexaai.com/NexaAI/Octo-net/gguf-q4_0/readme) | Text | GGUF | `nexa run octo-net` |
| [omnivision](https://nexa.ai/NexaAI/omnivision/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omnivision` |
| [omniVLM](https://nexa.ai/NexaAI/omniVLM/gguf-fp16/readme) | Multimodal | GGUF | `nexa run omniVLM` |
| [nanollava](https://www.nexaai.com/qnguyen3/nanoLLaVA/gguf-fp16/readme) | Multimodal | GGUF | `nexa run nanollava` |
| [llava-phi3](https://www.nexaai.com/xtuner/llava-phi-3-mini/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-phi3` |
| [llava-llama3](https://www.nexaai.com/xtuner/llava-llama-3-8b-v1.1/gguf-q4_0/readme) | Multimodal | GGUF | `nexa run llava-llama3` |
10 changes: 8 additions & 2 deletions docs/README.md
@@ -28,12 +28,16 @@ pip install nexaai[onnx] # if you need ONNX support
```

### Build from source

To build the C++ components only:

```bash
cmake -B build -S .
cmake --build build --config Release -j32
```

To build the C++ components and install the Python package from source, run the following commands:

```bash
git clone --recursive https://github.com/NexaAI/nexa-sdk.git
cd nexa-sdk
@@ -75,7 +79,7 @@ python -m nexa.gguf.nexa_inference_text gemma
python -m nexa.gguf.nexa_inference_text octopusv2 --stop_words "<nexa_end>"
wget https://assets-c4akfrf5b4d3f4b7.z01.azurefd.net/assets/2024/04/BMDataViz_661fb89f3845e.png -O test.png
python -m nexa.gguf.nexa_inference_vlm nanollava
python -m nexa.gguf.nexa_inference_vlm_omni omnivision
python -m nexa.gguf.nexa_inference_vlm_omni omniVLM
python -m nexa.gguf.nexa_inference_image sd1-4
python -m nexa.gguf.nexa_inference_image sd1-4 --img2img
wget -O control_normal-fp16.safetensors https://huggingface.co/webui/ControlNet-modules-safetensors/resolve/main/control_normal-fp16.safetensors
@@ -235,7 +239,9 @@ dumpbin /dependents your_executable_or_dll.dll # in Developer PowerShell for Vi
```

### Debug dynamic lib

According to this [issue](https://github.com/abetlen/llama-cpp-python/issues/1346), the following command checks the exported symbols on Linux:

```bash
readelf -Ws --dyn-syms libllama.so
```
10 changes: 5 additions & 5 deletions nexa/constants.py
@@ -188,8 +188,8 @@ class ModelType(Enum):
"omnivision-preview": "omnivision-preview:projector-fp16",
"omnivision-preview:fp16": "omnivision-preview:projector-fp16",
"omnivision-preview:q4_0": "omnivision-preview:projector-q4_0",
"omnivision": "omnivision:projector-fp16",
"omnivision:fp16": "omnivision:projector-fp16",
"omniVLM": "omniVLM:projector-fp16",
"omniVLM:fp16": "omniVLM:projector-fp16",
"omnivision-ocr": "omnivision-ocr:projector-fp16",
"omnivision-ocr:fp16": "omnivision-ocr:projector-fp16",
}
@@ -198,8 +198,8 @@ class ModelType(Enum):
"omnivision-preview": "omnivision-preview:model-fp16",
"omnivision-preview:fp16": "omnivision-preview:model-fp16",
"omnivision-preview:q4_0": "omnivision-preview:model-q4_0",
"omnivision": "omnivision:model-fp16",
"omnivision:fp16": "omnivision:model-fp16",
"omniVLM": "omniVLM:model-fp16",
"omniVLM:fp16": "omniVLM:model-fp16",
"omnivision-ocr": "omnivision-ocr:model-fp16",
"omnivision-ocr:fp16": "omnivision-ocr:model-fp16",
}
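The renamed entries above map a user-facing model tag to its projector and model file tags. A minimal sketch of the lookup these dictionaries imply (the `resolve` helper and the trimmed-down dictionaries are illustrative, not the SDK's actual API):

```python
# Illustrative excerpt of the renamed mappings; the real dictionaries in
# nexa/constants.py contain many more entries.
PROJECTOR_MAP = {
    "omniVLM": "omniVLM:projector-fp16",
    "omniVLM:fp16": "omniVLM:projector-fp16",
}
MODEL_MAP = {
    "omniVLM": "omniVLM:model-fp16",
    "omniVLM:fp16": "omniVLM:model-fp16",
}

def resolve(tag: str) -> tuple[str, str]:
    """Hypothetical helper: return the (model, projector) file tags for a tag."""
    return MODEL_MAP[tag], PROJECTOR_MAP[tag]

print(resolve("omniVLM"))  # ('omniVLM:model-fp16', 'omniVLM:projector-fp16')
```

Both the bare tag and its `:fp16` alias resolve to the same fp16 files, which is why the rename has to touch every alias consistently.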
@@ -461,7 +461,7 @@ class ModelType(Enum):
"FLUX.1-schnell": ModelType.COMPUTER_VISION,
"Phi-3-vision-128k-instruct": ModelType.MULTIMODAL,
"omnivision-preview": ModelType.MULTIMODAL,
"omnivision": ModelType.MULTIMODAL,
"omniVLM": ModelType.MULTIMODAL,
"omnivision-ocr": ModelType.MULTIMODAL,
"nanoLLaVA": ModelType.MULTIMODAL,
"llava-v1.6-mistral-7b": ModelType.MULTIMODAL,
2 changes: 1 addition & 1 deletion nexa/gguf/nexa_inference_vlm_omni.py
@@ -40,7 +40,7 @@ def __init__(
else:
self.n_gpu_layers = 0

# Handle direct model file paths (e.g., omnivision:model-fp16)
# Handle direct model file paths (e.g., omniVLM:model-fp16)
if model_path and ':model-' in model_path:
base_name = model_path.split(':')[0]
model_type = model_path.split('model-')[1]
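The split logic above derives the base name and precision suffix from a direct model file path. A standalone sketch of that parsing (a hypothetical helper mirroring the two `split` calls shown in the diff):

```python
def split_model_path(model_path: str) -> tuple[str, str]:
    """Split a direct model file path like 'omniVLM:model-fp16'
    into its base name and precision suffix."""
    base_name = model_path.split(':')[0]        # everything before the colon
    model_type = model_path.split('model-')[1]  # everything after 'model-'
    return base_name, model_type

print(split_model_path("omniVLM:model-fp16"))  # ('omniVLM', 'fp16')
```

Because the parsing keys on the literal `:model-` pattern rather than the model name, the rename to `omniVLM` only affects the comment and example, not the behavior.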
