Merge pull request #229 from NexaAI/david/newfeature
David/newfeature: add omnivision Python interface
zhiyuan8 authored Nov 11, 2024
2 parents 84dd847 + 0ce19fd commit 4aa23d7
Showing 3 changed files with 97 additions and 47 deletions.
80 changes: 59 additions & 21 deletions nexa/constants.py
@@ -58,7 +58,7 @@ class ModelType(Enum):
     "mistral": "Mistral-7B-Instruct-v0.3:q4_0",
     "codegemma": "codegemma-2b:q4_0",
     "codellama": "CodeLlama-7b-Instruct:q4_0",
-    "codeqwen": "Qwen2.5-Coder-1.5B-Instruct:q4_0",
+    "codeqwen": "Qwen2.5-Coder-3B-Instruct:q4_0",
     "mathqwen": "Qwen2.5-Math-1.5B-Instruct:q4_0",
     "deepseek-coder": "deepseek-coder-1.3b-instruct:q4_0",
     "dolphin-mistral": "dolphin-2.8-mistral-7b:q4_0",
@@ -110,13 +110,25 @@ class ModelType(Enum):
 }
 
 NEXA_RUN_MODEL_MAP_AUDIO_LM = {
-    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
-    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
-    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
+    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:model-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q2_K": "Qwen2-Audio-7.8B-Instruct:model-q2_K",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_L": "Qwen2-Audio-7.8B-Instruct:model-q3_K_L",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_M": "Qwen2-Audio-7.8B-Instruct:model-q3_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_S": "Qwen2-Audio-7.8B-Instruct:model-q3_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
-    "omniaudio": "Octo-omni-audio:model-fp16",
-    "Octo-omni-audio:fp16": "Octo-omni-audio:model-fp16",
-    "Octo-omni-audio:q4_0": "Octo-omni-audio:model-q4_0",
+    "Qwen2-Audio-7.8B-Instruct:q4_1": "Qwen2-Audio-7.8B-Instruct:model-q4_1",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_M": "Qwen2-Audio-7.8B-Instruct:model-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_S": "Qwen2-Audio-7.8B-Instruct:model-q4_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q5_0": "Qwen2-Audio-7.8B-Instruct:model-q5_0",
+    "Qwen2-Audio-7.8B-Instruct:q5_1": "Qwen2-Audio-7.8B-Instruct:model-q5_1",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_M": "Qwen2-Audio-7.8B-Instruct:model-q5_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_S": "Qwen2-Audio-7.8B-Instruct:model-q5_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q6_K": "Qwen2-Audio-7.8B-Instruct:model-q6_K",
+    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
+    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
+    "omniaudio": "omniaudio:model-fp16",
+    "omniaudio:fp16": "omniaudio:model-fp16",
+    "omniaudio:q4_0": "omniaudio:model-q4_0",
 }
 
 NEXA_RUN_MODEL_MAP_VOICE = {
@@ -172,25 +184,45 @@ class ModelType(Enum):
 }
 
 NEXA_RUN_OMNI_VLM_PROJECTOR_MAP = {
-    "omnivision": "Octo-omni-vision:projector-fp16",
-    "Octo-omni-vision:fp16": "Octo-omni-vision:projector-fp16",
-    "Octo-omni-vision:q4_0": "Octo-omni-vision:projector-q4_0",
+    "omnivision-preview": "omnivision-preview:projector-fp16",
+    "omnivision-preview:fp16": "omnivision-preview:projector-fp16",
+    "omnivision-preview:q4_0": "omnivision-preview:projector-q4_0",
+    "omnivision": "omnivision:projector-fp16",
+    "omnivision:fp16": "omnivision:projector-fp16",
+    "omnivision-ocr": "omnivision-ocr:projector-fp16",
+    "omnivision-ocr:fp16": "omnivision-ocr:projector-fp16",
 }
 
 NEXA_RUN_OMNI_VLM_MAP = {
-    "omnivision": "Octo-omni-vision:model-fp16",
-    "Octo-omni-vision:fp16": "Octo-omni-vision:model-fp16",
-    "Octo-omni-vision:q4_0": "Octo-omni-vision:model-q4_0",
+    "omnivision-preview": "omnivision-preview:model-fp16",
+    "omnivision-preview:fp16": "omnivision-preview:model-fp16",
+    "omnivision-preview:q4_0": "omnivision-preview:model-q4_0",
+    "omnivision": "omnivision:model-fp16",
+    "omnivision:fp16": "omnivision:model-fp16",
+    "omnivision-ocr": "omnivision-ocr:model-fp16",
+    "omnivision-ocr:fp16": "omnivision-ocr:model-fp16",
 }
 
 NEXA_RUN_AUDIO_LM_PROJECTOR_MAP = {
-    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
-    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
-    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
+    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:projector-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q2_K": "Qwen2-Audio-7.8B-Instruct:projector-q2_K",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_L": "Qwen2-Audio-7.8B-Instruct:projector-q3_K_L",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_M": "Qwen2-Audio-7.8B-Instruct:projector-q3_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_S": "Qwen2-Audio-7.8B-Instruct:projector-q3_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
-    "omniaudio": "Octo-omni-audio:projector-fp16",
-    "Octo-omni-audio:fp16": "Octo-omni-audio:projector-fp16",
-    "Octo-omni-audio:q4_0": "Octo-omni-audio:projector-q4_0",
+    "Qwen2-Audio-7.8B-Instruct:q4_1": "Qwen2-Audio-7.8B-Instruct:projector-q4_1",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_M": "Qwen2-Audio-7.8B-Instruct:projector-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_S": "Qwen2-Audio-7.8B-Instruct:projector-q4_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q5_0": "Qwen2-Audio-7.8B-Instruct:projector-q5_0",
+    "Qwen2-Audio-7.8B-Instruct:q5_1": "Qwen2-Audio-7.8B-Instruct:projector-q5_1",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_M": "Qwen2-Audio-7.8B-Instruct:projector-q5_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_S": "Qwen2-Audio-7.8B-Instruct:projector-q5_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q6_K": "Qwen2-Audio-7.8B-Instruct:projector-q6_K",
+    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
+    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
+    "omniaudio": "omniaudio:projector-fp16",
+    "omniaudio:fp16": "omniaudio:projector-fp16",
+    "omniaudio:q4_0": "omniaudio:projector-q4_0",
 }
 
 NEXA_RUN_T5XXL_MAP = {
@@ -279,7 +311,9 @@ class ModelType(Enum):
     "qwen2.5-1.5b-instruct": "qwen",
     "qwen2.5-3b-instruct": "qwen",
     "qwen2.5-7b-instruct": "qwen",
+    "qwen2.5-coder-0.5b-instruct": "qwen",
     "qwen2.5-coder-1.5b-instruct": "qwen",
+    "qwen2.5-coder-3b-instruct": "qwen",
     "qwen2.5-coder-7b-instruct": "qwen",
     "qwen2.5-math-1.5b-instruct": "qwen",
     "qwen2.5-math-7b-instruct": "qwen",
@@ -385,7 +419,9 @@ class ModelType(Enum):
     "Qwen2.5-0.5B-Instruct": ModelType.NLP,
     "Qwen2.5-1.5B-Instruct": ModelType.NLP,
     "Qwen2.5-3B-Instruct": ModelType.NLP,
+    "Qwen2.5-Coder-0.5B-Instruct": ModelType.NLP,
     "Qwen2.5-Coder-1.5B-Instruct": ModelType.NLP,
+    "Qwen2.5-Coder-3B-Instruct": ModelType.NLP,
     "Qwen2.5-Coder-7B-Instruct": ModelType.NLP,
     "Qwen2.5-Math-1.5B-Instruct": ModelType.NLP,
     "Qwen2.5-Math-7B-Instruct": ModelType.NLP,
@@ -423,13 +459,15 @@ class ModelType(Enum):
     "anything-v30-LCM": ModelType.COMPUTER_VISION,
     "FLUX.1-schnell": ModelType.COMPUTER_VISION,
     "Phi-3-vision-128k-instruct": ModelType.MULTIMODAL,
-    "Octo-omni-vision": ModelType.MULTIMODAL,
+    "omnivision-preview": ModelType.MULTIMODAL,
+    "omnivision": ModelType.MULTIMODAL,
+    "omnivision-ocr": ModelType.MULTIMODAL,
     "nanoLLaVA": ModelType.MULTIMODAL,
     "llava-v1.6-mistral-7b": ModelType.MULTIMODAL,
     "llava-v1.6-vicuna-7b": ModelType.MULTIMODAL,
     "llava-phi-3-mini": ModelType.MULTIMODAL,
     "llava-llama-3-8b-v1.1": ModelType.MULTIMODAL,
-    "Octo-omni-audio": ModelType.AUDIOLM,
+    "omniaudio": ModelType.AUDIOLM,
     "Qwen2-Audio-7.8B-Instruct": ModelType.AUDIOLM,
     "faster-whisper-tiny.en": ModelType.AUDIO,
     "faster-whisper-tiny": ModelType.AUDIO,
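As context for the renaming above, here is a minimal sketch of how these alias tables are typically consumed. The `resolve_omnivlm` helper is hypothetical (not part of this commit); the two maps are the ones defined in `nexa/constants.py` above:

```python
from nexa.constants import (
    NEXA_RUN_OMNI_VLM_MAP,
    NEXA_RUN_OMNI_VLM_PROJECTOR_MAP,
)

def resolve_omnivlm(tag: str):
    """Resolve a user-facing tag such as 'omnivision' or 'omnivision-ocr:fp16'
    into (model repo path, projector repo path), falling back to the raw tag
    for names not present in the alias maps."""
    model = NEXA_RUN_OMNI_VLM_MAP.get(tag, tag)
    projector = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP.get(tag, tag)
    return model, projector

print(resolve_omnivlm("omnivision"))
# ('omnivision:model-fp16', 'omnivision:projector-fp16')
print(resolve_omnivlm("omnivision-ocr:fp16"))
# ('omnivision-ocr:model-fp16', 'omnivision-ocr:projector-fp16')
```

Note that the old `Octo-omni-vision`/`Octo-omni-audio` aliases are removed outright, so callers pinned to those names must migrate to the `omnivision`/`omniaudio` tags.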
6 changes: 3 additions & 3 deletions nexa/gguf/llama/omni_vlm_cpp.py
@@ -55,11 +55,11 @@ def _get_lib():
 omni_char_p = ctypes.c_char_p
 
 
-def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
-    return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
+def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p):
+    return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version)
 
 
-_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
+_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p]
 _lib.omnivlm_init.restype = None
 
 
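The binding change above means `omnivlm_init` now requires a third `c_char_p` carrying the VLM version. A caller-side sketch (the paths are placeholders; the version strings are the three accepted by the new `--omni_vlm_version` flag further down):

```python
import ctypes

from nexa.gguf.llama import omni_vlm_cpp

# Placeholder paths; in practice these come from pull_model() or local files.
model = ctypes.c_char_p(b"/path/to/model-fp16.gguf")
mmproj = ctypes.c_char_p(b"/path/to/projector-fp16.gguf")
# One of: "vlm-81-instruct", "vlm-81-ocr", "nano-vlm-instruct".
version = ctypes.c_char_p(b"vlm-81-instruct")

omni_vlm_cpp.omnivlm_init(model, mmproj, version)
```

Calling the wrapper with only two arguments now raises a `TypeError`, so any external callers of this module must be updated together with this commit.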
58 changes: 35 additions & 23 deletions nexa/gguf/nexa_inference_vlm_omni.py
@@ -1,4 +1,3 @@
-
 import ctypes
 import logging
 import os
@@ -7,7 +6,6 @@
 from streamlit.web import cli as stcli
 from nexa.utils import nexa_prompt, SpinningCursorAnimation
 from nexa.constants import (
-    DEFAULT_TEXT_GEN_PARAMS,
     NEXA_RUN_OMNI_VLM_PROJECTOR_MAP,
     NEXA_RUN_OMNI_VLM_MAP
 )
@@ -32,39 +30,33 @@ def __init__(
         if model_path is None and local_path is None:
             raise ValueError("Either model_path or local_path must be provided.")
 
-        self.params = DEFAULT_TEXT_GEN_PARAMS.copy()
-        self.params.update(kwargs)
         self.model = None
         self.projector = None
         self.projector_path = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP.get(model_path, None)
         self.downloaded_path = local_path
         self.projector_downloaded_path = projector_local_path
         self.device = device
         self.context = None
+        self.omni_vlm_version = "vlm-81-instruct"
         if self.device == "auto" or self.device == "gpu":
             self.n_gpu_layers = -1 if is_gpu_available() else 0
         else:
             self.n_gpu_layers = 0
 
-        if (
-            self.downloaded_path is not None
-            and self.projector_downloaded_path is not None
-        ):
+        if self.downloaded_path is not None and self.projector_downloaded_path is not None:
             # when running from local, both path should be provided
-            pass
+            self.omni_vlm_version = self._determine_vlm_version(str(self.downloaded_path))
         elif self.downloaded_path is not None:
             if model_path in NEXA_RUN_OMNI_VLM_MAP:
                 self.projector_path = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP[model_path]
-                self.projector_downloaded_path, _ = pull_model(
-                    self.projector_path, **kwargs
-                )
+                self.projector_downloaded_path, _ = pull_model(self.projector_path, **kwargs)
+                self.omni_vlm_version = self._determine_vlm_version(model_path)
         elif model_path in NEXA_RUN_OMNI_VLM_MAP:
             self.model_path = NEXA_RUN_OMNI_VLM_MAP[model_path]
             self.projector_path = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP[model_path]
             self.downloaded_path, _ = pull_model(self.model_path, **kwargs)
-            self.projector_downloaded_path, _ = pull_model(
-                self.projector_path, **kwargs
-            )
+            self.projector_downloaded_path, _ = pull_model(self.projector_path, **kwargs)
+            self.omni_vlm_version = self._determine_vlm_version(model_path)
         elif Path(model_path).parent.exists():
             local_dir = Path(model_path).parent
             model_name = Path(model_path).name
@@ -80,22 +72,34 @@ def __init__(
                     "Make sure to name them as 'model-<tag>.gguf' and 'projector-<tag>.gguf'."
                 )
                 exit(1)
+            self.omni_vlm_version = self._determine_vlm_version(model_path)
         else:
             logging.error("VLM user model from hub is not supported yet.")
             exit(1)
+
+        # Override version if specified in kwargs
+        if 'omni_vlm_version' in kwargs:
+            self.omni_vlm_version = kwargs.get('omni_vlm_version')
+            print(f"Using omni-vlm-version: {self.omni_vlm_version}")
+
         with suppress_stdout_stderr():
             self._load_model()
 
+    def _determine_vlm_version(self, path_str: str) -> str:
+        """Helper function to determine VLM version from path string."""
+        if 'ocr' in path_str:
+            return "vlm-81-ocr"
+        elif 'preview' in path_str:
+            return "nano-vlm-instruct"
+        return "vlm-81-instruct"
+
     @SpinningCursorAnimation()
     def _load_model(self):
         try:
-            self.ctx_params_model = ctypes.c_char_p(
-                self.downloaded_path.encode("utf-8")
-            )
-            self.ctx_params_mmproj = ctypes.c_char_p(
-                self.projector_downloaded_path.encode("utf-8")
-            )
-            omni_vlm_cpp.omnivlm_init(self.ctx_params_model, self.ctx_params_mmproj)
+            self.ctx_params_model = ctypes.c_char_p(self.downloaded_path.encode("utf-8"))
+            self.ctx_params_mmproj = ctypes.c_char_p(self.projector_downloaded_path.encode("utf-8"))
+            self.ctx_params_omni_vlm_version = ctypes.c_char_p(self.omni_vlm_version.encode("utf-8"))
+            omni_vlm_cpp.omnivlm_init(self.ctx_params_model, self.ctx_params_mmproj, self.ctx_params_omni_vlm_version)
         except Exception as e:
             logging.error(f"Error loading model: {e}")
             raise
Expand All @@ -106,7 +110,8 @@ def run(self):
image_path = nexa_prompt("Image Path (required): ")
if not os.path.exists(image_path):
print(f"Image path: {image_path} not found, running omni VLM without image input.")
user_input = nexa_prompt()
# Skip user input for OCR version
user_input = "" if self.omni_vlm_version == "vlm-81-ocr" else nexa_prompt()
response = self.inference(user_input, image_path)
print(f"\nResponse: {response}")
except KeyboardInterrupt:
@@ -170,6 +175,13 @@ def run_streamlit(self, model_path: str, is_local_path = False, hf = False, proj
         action="store_true",
         help="Run the inference in Streamlit UI",
     )
+    parser.add_argument(
+        "--omni_vlm_version",
+        type=str,
+        choices=["vlm-81-ocr", "vlm-81-instruct", "nano-vlm-instruct"],
+        default="vlm-81-instruct",
+        help="omni-vlm-version to use",
+    )
 
     args = parser.parse_args()
     kwargs = {k: v for k, v in vars(args).items() if v is not None}
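Taken together, a usage sketch of the new Python interface. Two assumptions beyond what the diff shows: the class defined in `nexa_inference_vlm_omni.py` is named `NexaOmniVlmInference` (its name sits outside the visible hunks), and the image path is a placeholder:

```python
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference

# The version is normally inferred from the model path by
# _determine_vlm_version(): 'ocr' -> "vlm-81-ocr", 'preview' ->
# "nano-vlm-instruct", otherwise "vlm-81-instruct". Passing
# omni_vlm_version as a kwarg overrides the inferred value.
vlm = NexaOmniVlmInference(model_path="omnivision-ocr")

# The OCR version skips the text prompt, so pass an empty string.
text = vlm.inference("", "/path/to/receipt.png")
print(text)
```

From the command line, the same override is exposed through the new flag, e.g. `--omni_vlm_version vlm-81-ocr`.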
