Merge pull request #229 from NexaAI/david/newfeature
David/newfeature: add omnivision Python interface
zhiyuan8 authored Nov 11, 2024
2 parents 84dd847 + 0ce19fd commit 4aa23d7
Showing 3 changed files with 97 additions and 47 deletions.
80 changes: 59 additions & 21 deletions nexa/constants.py
@@ -58,7 +58,7 @@ class ModelType(Enum):
     "mistral": "Mistral-7B-Instruct-v0.3:q4_0",
     "codegemma": "codegemma-2b:q4_0",
     "codellama": "CodeLlama-7b-Instruct:q4_0",
-    "codeqwen": "Qwen2.5-Coder-1.5B-Instruct:q4_0",
+    "codeqwen": "Qwen2.5-Coder-3B-Instruct:q4_0",
     "mathqwen": "Qwen2.5-Math-1.5B-Instruct:q4_0",
     "deepseek-coder": "deepseek-coder-1.3b-instruct:q4_0",
     "dolphin-mistral": "dolphin-2.8-mistral-7b:q4_0",
@@ -110,13 +110,25 @@ class ModelType(Enum):
 }
 
 NEXA_RUN_MODEL_MAP_AUDIO_LM = {
-    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
-    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
-    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
+    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:model-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q2_K": "Qwen2-Audio-7.8B-Instruct:model-q2_K",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_L": "Qwen2-Audio-7.8B-Instruct:model-q3_K_L",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_M": "Qwen2-Audio-7.8B-Instruct:model-q3_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_S": "Qwen2-Audio-7.8B-Instruct:model-q3_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:model-q4_0",
-    "omniaudio": "Octo-omni-audio:model-fp16",
-    "Octo-omni-audio:fp16": "Octo-omni-audio:model-fp16",
-    "Octo-omni-audio:q4_0": "Octo-omni-audio:model-q4_0",
+    "Qwen2-Audio-7.8B-Instruct:q4_1": "Qwen2-Audio-7.8B-Instruct:model-q4_1",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_M": "Qwen2-Audio-7.8B-Instruct:model-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_S": "Qwen2-Audio-7.8B-Instruct:model-q4_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q5_0": "Qwen2-Audio-7.8B-Instruct:model-q5_0",
+    "Qwen2-Audio-7.8B-Instruct:q5_1": "Qwen2-Audio-7.8B-Instruct:model-q5_1",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_M": "Qwen2-Audio-7.8B-Instruct:model-q5_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_S": "Qwen2-Audio-7.8B-Instruct:model-q5_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q6_K": "Qwen2-Audio-7.8B-Instruct:model-q6_K",
+    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:model-q8_0",
+    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:model-fp16",
+    "omniaudio": "omniaudio:model-fp16",
+    "omniaudio:fp16": "omniaudio:model-fp16",
+    "omniaudio:q4_0": "omniaudio:model-q4_0",
 }
 
 NEXA_RUN_MODEL_MAP_VOICE = {
@@ -172,25 +184,45 @@ class ModelType(Enum):
 }
 
 NEXA_RUN_OMNI_VLM_PROJECTOR_MAP = {
-    "omnivision": "Octo-omni-vision:projector-fp16",
-    "Octo-omni-vision:fp16": "Octo-omni-vision:projector-fp16",
-    "Octo-omni-vision:q4_0": "Octo-omni-vision:projector-q4_0",
+    "omnivision-preview": "omnivision-preview:projector-fp16",
+    "omnivision-preview:fp16": "omnivision-preview:projector-fp16",
+    "omnivision-preview:q4_0": "omnivision-preview:projector-q4_0",
+    "omnivision": "omnivision:projector-fp16",
+    "omnivision:fp16": "omnivision:projector-fp16",
+    "omnivision-ocr": "omnivision-ocr:projector-fp16",
+    "omnivision-ocr:fp16": "omnivision-ocr:projector-fp16",
 }
 
 NEXA_RUN_OMNI_VLM_MAP = {
-    "omnivision": "Octo-omni-vision:model-fp16",
-    "Octo-omni-vision:fp16": "Octo-omni-vision:model-fp16",
-    "Octo-omni-vision:q4_0": "Octo-omni-vision:model-q4_0",
+    "omnivision-preview": "omnivision-preview:model-fp16",
+    "omnivision-preview:fp16": "omnivision-preview:model-fp16",
+    "omnivision-preview:q4_0": "omnivision-preview:model-q4_0",
+    "omnivision": "omnivision:model-fp16",
+    "omnivision:fp16": "omnivision:model-fp16",
+    "omnivision-ocr": "omnivision-ocr:model-fp16",
+    "omnivision-ocr:fp16": "omnivision-ocr:model-fp16",
 }
 
 NEXA_RUN_AUDIO_LM_PROJECTOR_MAP = {
-    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
-    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
-    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
+    "qwen2audio": "Qwen2-Audio-7.8B-Instruct:projector-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q2_K": "Qwen2-Audio-7.8B-Instruct:projector-q2_K",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_L": "Qwen2-Audio-7.8B-Instruct:projector-q3_K_L",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_M": "Qwen2-Audio-7.8B-Instruct:projector-q3_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q3_K_S": "Qwen2-Audio-7.8B-Instruct:projector-q3_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q4_0": "Qwen2-Audio-7.8B-Instruct:projector-q4_0",
-    "omniaudio": "Octo-omni-audio:projector-fp16",
-    "Octo-omni-audio:fp16": "Octo-omni-audio:projector-fp16",
-    "Octo-omni-audio:q4_0": "Octo-omni-audio:projector-q4_0",
+    "Qwen2-Audio-7.8B-Instruct:q4_1": "Qwen2-Audio-7.8B-Instruct:projector-q4_1",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_M": "Qwen2-Audio-7.8B-Instruct:projector-q4_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q4_K_S": "Qwen2-Audio-7.8B-Instruct:projector-q4_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q5_0": "Qwen2-Audio-7.8B-Instruct:projector-q5_0",
+    "Qwen2-Audio-7.8B-Instruct:q5_1": "Qwen2-Audio-7.8B-Instruct:projector-q5_1",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_M": "Qwen2-Audio-7.8B-Instruct:projector-q5_K_M",
+    "Qwen2-Audio-7.8B-Instruct:q5_K_S": "Qwen2-Audio-7.8B-Instruct:projector-q5_K_S",
+    "Qwen2-Audio-7.8B-Instruct:q6_K": "Qwen2-Audio-7.8B-Instruct:projector-q6_K",
+    "Qwen2-Audio-7.8B-Instruct:q8_0": "Qwen2-Audio-7.8B-Instruct:projector-q8_0",
+    "Qwen2-Audio-7.8B-Instruct:fp16": "Qwen2-Audio-7.8B-Instruct:projector-fp16",
+    "omniaudio": "omniaudio:projector-fp16",
+    "omniaudio:fp16": "omniaudio:projector-fp16",
+    "omniaudio:q4_0": "omniaudio:projector-q4_0",
 }
 
 NEXA_RUN_T5XXL_MAP = {
@@ -279,7 +311,9 @@ class ModelType(Enum):
     "qwen2.5-1.5b-instruct": "qwen",
     "qwen2.5-3b-instruct": "qwen",
     "qwen2.5-7b-instruct": "qwen",
+    "qwen2.5-coder-0.5b-instruct": "qwen",
     "qwen2.5-coder-1.5b-instruct": "qwen",
+    "qwen2.5-coder-3b-instruct": "qwen",
     "qwen2.5-coder-7b-instruct": "qwen",
     "qwen2.5-math-1.5b-instruct": "qwen",
     "qwen2.5-math-7b-instruct": "qwen",
@@ -385,7 +419,9 @@ class ModelType(Enum):
     "Qwen2.5-0.5B-Instruct": ModelType.NLP,
     "Qwen2.5-1.5B-Instruct": ModelType.NLP,
     "Qwen2.5-3B-Instruct": ModelType.NLP,
+    "Qwen2.5-Coder-0.5B-Instruct": ModelType.NLP,
     "Qwen2.5-Coder-1.5B-Instruct": ModelType.NLP,
+    "Qwen2.5-Coder-3B-Instruct": ModelType.NLP,
     "Qwen2.5-Coder-7B-Instruct": ModelType.NLP,
     "Qwen2.5-Math-1.5B-Instruct": ModelType.NLP,
     "Qwen2.5-Math-7B-Instruct": ModelType.NLP,
@@ -423,13 +459,15 @@ class ModelType(Enum):
     "anything-v30-LCM": ModelType.COMPUTER_VISION,
     "FLUX.1-schnell": ModelType.COMPUTER_VISION,
     "Phi-3-vision-128k-instruct": ModelType.MULTIMODAL,
-    "Octo-omni-vision": ModelType.MULTIMODAL,
+    "omnivision-preview": ModelType.MULTIMODAL,
+    "omnivision": ModelType.MULTIMODAL,
+    "omnivision-ocr": ModelType.MULTIMODAL,
     "nanoLLaVA": ModelType.MULTIMODAL,
     "llava-v1.6-mistral-7b": ModelType.MULTIMODAL,
     "llava-v1.6-vicuna-7b": ModelType.MULTIMODAL,
     "llava-phi-3-mini": ModelType.MULTIMODAL,
     "llava-llama-3-8b-v1.1": ModelType.MULTIMODAL,
-    "Octo-omni-audio": ModelType.AUDIOLM,
+    "omniaudio": ModelType.AUDIOLM,
     "Qwen2-Audio-7.8B-Instruct": ModelType.AUDIOLM,
     "faster-whisper-tiny.en": ModelType.AUDIO,
     "faster-whisper-tiny": ModelType.AUDIO,
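As context for the renaming above, here is a minimal sketch of how these alias tables are typically consumed. The `resolve_omnivlm` helper is hypothetical (not part of this commit); the two maps are the ones defined in `nexa/constants.py` above:

```python
from nexa.constants import (
    NEXA_RUN_OMNI_VLM_MAP,
    NEXA_RUN_OMNI_VLM_PROJECTOR_MAP,
)

def resolve_omnivlm(tag: str):
    """Resolve a user-facing tag such as 'omnivision' or 'omnivision-ocr:fp16'
    into (model repo path, projector repo path), falling back to the raw tag
    for names not present in the alias maps."""
    model = NEXA_RUN_OMNI_VLM_MAP.get(tag, tag)
    projector = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP.get(tag, tag)
    return model, projector

print(resolve_omnivlm("omnivision"))
# ('omnivision:model-fp16', 'omnivision:projector-fp16')
print(resolve_omnivlm("omnivision-ocr:fp16"))
# ('omnivision-ocr:model-fp16', 'omnivision-ocr:projector-fp16')
```

Note that the old `Octo-omni-vision`/`Octo-omni-audio` aliases are removed outright, so callers pinned to those names must migrate to the `omnivision`/`omniaudio` tags.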
6 changes: 3 additions & 3 deletions nexa/gguf/llama/omni_vlm_cpp.py
@@ -55,11 +55,11 @@ def _get_lib():
 omni_char_p = ctypes.c_char_p
 
 
-def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p):
-    return _lib.omnivlm_init(llm_model_path, mmproj_model_path)
+def omnivlm_init(llm_model_path: omni_char_p, mmproj_model_path: omni_char_p, vlm_version: omni_char_p):
+    return _lib.omnivlm_init(llm_model_path, mmproj_model_path, vlm_version)
 
 
-_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p]
+_lib.omnivlm_init.argtypes = [omni_char_p, omni_char_p, omni_char_p]
 _lib.omnivlm_init.restype = None
 
 
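The binding change above means `omnivlm_init` now requires a third `c_char_p` carrying the VLM version. A caller-side sketch (the paths are placeholders; the version strings are the three accepted by the new `--omni_vlm_version` flag further down):

```python
import ctypes

from nexa.gguf.llama import omni_vlm_cpp

# Placeholder paths; in practice these come from pull_model() or local files.
model = ctypes.c_char_p(b"/path/to/model-fp16.gguf")
mmproj = ctypes.c_char_p(b"/path/to/projector-fp16.gguf")
# One of: "vlm-81-instruct", "vlm-81-ocr", "nano-vlm-instruct".
version = ctypes.c_char_p(b"vlm-81-instruct")

omni_vlm_cpp.omnivlm_init(model, mmproj, version)
```

Calling the wrapper with only two arguments now raises a `TypeError`, so any external callers of this module must be updated together with this commit.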
58 changes: 35 additions & 23 deletions nexa/gguf/nexa_inference_vlm_omni.py
@@ -1,4 +1,3 @@
-
 import ctypes
 import logging
 import os
@@ -7,7 +6,6 @@
 from streamlit.web import cli as stcli
 from nexa.utils import nexa_prompt, SpinningCursorAnimation
 from nexa.constants import (
-    DEFAULT_TEXT_GEN_PARAMS,
     NEXA_RUN_OMNI_VLM_PROJECTOR_MAP,
     NEXA_RUN_OMNI_VLM_MAP
 )
@@ -32,39 +30,33 @@ def __init__(
         if model_path is None and local_path is None:
             raise ValueError("Either model_path or local_path must be provided.")
 
-        self.params = DEFAULT_TEXT_GEN_PARAMS.copy()
-        self.params.update(kwargs)
         self.model = None
         self.projector = None
         self.projector_path = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP.get(model_path, None)
         self.downloaded_path = local_path
         self.projector_downloaded_path = projector_local_path
         self.device = device
         self.context = None
+        self.omni_vlm_version = "vlm-81-instruct"
         if self.device == "auto" or self.device == "gpu":
             self.n_gpu_layers = -1 if is_gpu_available() else 0
         else:
             self.n_gpu_layers = 0
 
-        if (
-            self.downloaded_path is not None
-            and self.projector_downloaded_path is not None
-        ):
+        if self.downloaded_path is not None and self.projector_downloaded_path is not None:
             # when running from local, both path should be provided
-            pass
+            self.omni_vlm_version = self._determine_vlm_version(str(self.downloaded_path))
         elif self.downloaded_path is not None:
             if model_path in NEXA_RUN_OMNI_VLM_MAP:
                 self.projector_path = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP[model_path]
-                self.projector_downloaded_path, _ = pull_model(
-                    self.projector_path, **kwargs
-                )
+                self.projector_downloaded_path, _ = pull_model(self.projector_path, **kwargs)
+                self.omni_vlm_version = self._determine_vlm_version(model_path)
         elif model_path in NEXA_RUN_OMNI_VLM_MAP:
             self.model_path = NEXA_RUN_OMNI_VLM_MAP[model_path]
             self.projector_path = NEXA_RUN_OMNI_VLM_PROJECTOR_MAP[model_path]
             self.downloaded_path, _ = pull_model(self.model_path, **kwargs)
-            self.projector_downloaded_path, _ = pull_model(
-                self.projector_path, **kwargs
-            )
+            self.projector_downloaded_path, _ = pull_model(self.projector_path, **kwargs)
+            self.omni_vlm_version = self._determine_vlm_version(model_path)
         elif Path(model_path).parent.exists():
             local_dir = Path(model_path).parent
             model_name = Path(model_path).name
@@ -80,22 +72,34 @@ def __init__(
                     "Make sure to name them as 'model-<tag>.gguf' and 'projector-<tag>.gguf'."
                 )
                 exit(1)
+            self.omni_vlm_version = self._determine_vlm_version(model_path)
         else:
             logging.error("VLM user model from hub is not supported yet.")
             exit(1)
+
+        # Override version if specified in kwargs
+        if 'omni_vlm_version' in kwargs:
+            self.omni_vlm_version = kwargs.get('omni_vlm_version')
+            print(f"Using omni-vlm-version: {self.omni_vlm_version}")
+
         with suppress_stdout_stderr():
             self._load_model()
 
+    def _determine_vlm_version(self, path_str: str) -> str:
+        """Helper function to determine VLM version from path string."""
+        if 'ocr' in path_str:
+            return "vlm-81-ocr"
+        elif 'preview' in path_str:
+            return "nano-vlm-instruct"
+        return "vlm-81-instruct"
+
     @SpinningCursorAnimation()
     def _load_model(self):
         try:
-            self.ctx_params_model = ctypes.c_char_p(
-                self.downloaded_path.encode("utf-8")
-            )
-            self.ctx_params_mmproj = ctypes.c_char_p(
-                self.projector_downloaded_path.encode("utf-8")
-            )
-            omni_vlm_cpp.omnivlm_init(self.ctx_params_model, self.ctx_params_mmproj)
+            self.ctx_params_model = ctypes.c_char_p(self.downloaded_path.encode("utf-8"))
+            self.ctx_params_mmproj = ctypes.c_char_p(self.projector_downloaded_path.encode("utf-8"))
+            self.ctx_params_omni_vlm_version = ctypes.c_char_p(self.omni_vlm_version.encode("utf-8"))
+            omni_vlm_cpp.omnivlm_init(self.ctx_params_model, self.ctx_params_mmproj, self.ctx_params_omni_vlm_version)
         except Exception as e:
             logging.error(f"Error loading model: {e}")
             raise
Expand All @@ -106,7 +110,8 @@ def run(self):
image_path = nexa_prompt("Image Path (required): ")
if not os.path.exists(image_path):
print(f"Image path: {image_path} not found, running omni VLM without image input.")
user_input = nexa_prompt()
# Skip user input for OCR version
user_input = "" if self.omni_vlm_version == "vlm-81-ocr" else nexa_prompt()
response = self.inference(user_input, image_path)
print(f"\nResponse: {response}")
except KeyboardInterrupt:
@@ -170,6 +175,13 @@ def run_streamlit(self, model_path: str, is_local_path = False, hf = False, proj
         action="store_true",
         help="Run the inference in Streamlit UI",
     )
+    parser.add_argument(
+        "--omni_vlm_version",
+        type=str,
+        choices=["vlm-81-ocr", "vlm-81-instruct", "nano-vlm-instruct"],
+        default="vlm-81-instruct",
+        help="omni-vlm-version to use",
+    )
 
     args = parser.parse_args()
     kwargs = {k: v for k, v in vars(args).items() if v is not None}
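Taken together, a usage sketch of the new Python interface. Two assumptions beyond what the diff shows: the class defined in `nexa_inference_vlm_omni.py` is named `NexaOmniVlmInference` (its name sits outside the visible hunks), and the image path is a placeholder:

```python
from nexa.gguf.nexa_inference_vlm_omni import NexaOmniVlmInference

# The version is normally inferred from the model path by
# _determine_vlm_version(): 'ocr' -> "vlm-81-ocr", 'preview' ->
# "nano-vlm-instruct", otherwise "vlm-81-instruct". Passing
# omni_vlm_version as a kwarg overrides the inferred value.
vlm = NexaOmniVlmInference(model_path="omnivision-ocr")

# The OCR version skips the text prompt, so pass an empty string.
text = vlm.inference("", "/path/to/receipt.png")
print(text)
```

From the command line, the same override is exposed through the new flag, e.g. `--omni_vlm_version vlm-81-ocr`.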
