refactor device arguments for power benchmark in gguf python interface
Davidqian123 committed Oct 16, 2024
1 parent 304be41 commit 97b0fd3
Showing 4 changed files with 18 additions and 11 deletions.
nexa/eval/nexa_eval.py (2 changes: 1 addition & 1 deletion)
@@ -203,7 +203,7 @@ def run_perf_eval(self, device: str, new_tokens: int):
             memory=True,
             energy=True,
             input_shapes={
-                "batch_size": 1,
+                "batch_size": 1, # TODO: make it dynamic, hardcoded to 1 for now
                 "sequence_length": 256,
                 "vocab_size": 32000,
             },
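The TODO above flags the hardcoded batch size. Below is a minimal sketch of making it caller-supplied instead; make_input_shapes is a hypothetical helper, and only the dict keys and values come from the diff.

# Hedged sketch of the TODO above: build input_shapes with a caller-supplied
# batch size instead of the hardcoded 1. make_input_shapes is hypothetical.
def make_input_shapes(batch_size: int = 1) -> dict:
    return {
        "batch_size": batch_size,   # no longer hardcoded to 1
        "sequence_length": 256,     # prompt length used for the perf run
        "vocab_size": 32000,        # llama-family vocabulary size
    }

print(make_input_shapes(4))  # {'batch_size': 4, 'sequence_length': 256, 'vocab_size': 32000}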
nexa/eval/nexa_perf/nexa_backend.py (9 changes: 8 additions & 1 deletion)
@@ -122,7 +122,14 @@ def load_model(self) -> None:
         """
         Load the model from the given model path (normally GGUF, GGML)
         """
-        nexa_model = NexaTextInference(model_path=self.config.model)
+        # TODO: add mps (Apple Metal) support; currently can't benchmark the mps device accurately for energy
+        if self.config.device == "cuda" or self.config.device == "mps":
+            nexa_model = NexaTextInference(model_path=self.config.model, device="gpu", **self.config.model_kwargs)
+        elif self.config.device == "cpu":
+            nexa_model = NexaTextInference(model_path=self.config.model, device="cpu", **self.config.model_kwargs)
+        else:
+            raise ValueError(f"Invalid device: {self.config.device}")
+
         self.pretrained_model = nexa_model.model

     def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
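The new branching collapses the benchmark's device names onto the two values the GGUF interface accepts: "cuda" and "mps" both map to "gpu", "cpu" passes through, and anything else is rejected. A standalone sketch of that mapping, with resolve_inference_device as a hypothetical name:

# Standalone sketch of the mapping applied in load_model above;
# resolve_inference_device is a hypothetical name.
def resolve_inference_device(benchmark_device: str) -> str:
    if benchmark_device in ("cuda", "mps"):
        return "gpu"  # CUDA and Apple Metal both route to the generic GPU backend
    if benchmark_device == "cpu":
        return "cpu"
    raise ValueError(f"Invalid device: {benchmark_device}")

assert resolve_inference_device("mps") == "gpu"
assert resolve_inference_device("cpu") == "cpu"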
nexa/gguf/nexa_inference_text.py (8 changes: 4 additions & 4 deletions)
@@ -44,14 +44,14 @@ class NexaTextInference:
         top_k (int): Top-k sampling parameter.
         top_p (float): Top-p sampling parameter
     """
-    def __init__(self, model_path=None, local_path=None, stop_words=None, **kwargs):
+    def __init__(self, model_path=None, local_path=None, stop_words=None, device="auto", **kwargs):
         if model_path is None and local_path is None:
             raise ValueError("Either model_path or local_path must be provided.")

         self.params = DEFAULT_TEXT_GEN_PARAMS
         self.params.update(kwargs)
         self.model = None
-        self.device = None
+        self.device = device

         self.model_path = model_path
         self.downloaded_path = local_path
@@ -108,7 +108,6 @@ def _load_model(self):
         with suppress_stdout_stderr():
             from nexa.gguf.llama.llama import Llama
             try:
-                self.device = self.params.get("device", "auto")
                 if self.device == "auto" or self.device == "gpu":
                     n_gpu_layers = -1 if is_gpu_available() else 0
                 elif self.device == "cpu":
@@ -377,8 +376,9 @@ def run_streamlit(self, model_path: str, is_local_path = False, hf = False):
     kwargs = {k: v for k, v in vars(args).items() if v is not None}
     model_path = kwargs.pop("model_path")
     stop_words = kwargs.pop("stop_words", [])
+    device = kwargs.pop("device", "auto")

-    inference = NexaTextInference(model_path, stop_words=stop_words, **kwargs)
+    inference = NexaTextInference(model_path, stop_words=stop_words, device=device, **kwargs)
     if args.streamlit:
         inference.run_streamlit(model_path)
     else:
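With device now an explicit constructor argument rather than a value fished out of self.params inside _load_model, callers can pin the device directly. A usage sketch, where the model path is a placeholder:

from nexa.gguf.nexa_inference_text import NexaTextInference

# "some-model.gguf" is a placeholder; device accepts "auto", "gpu", or "cpu"
# per the branching in _load_model. "auto" and "gpu" offload all layers when a
# GPU is available (n_gpu_layers=-1) and fall back to CPU otherwise.
inference = NexaTextInference(model_path="some-model.gguf", device="cpu")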
nexa/gguf/nexa_inference_vlm.py (10 changes: 5 additions & 5 deletions)
@@ -89,7 +89,7 @@ class NexaVLMInference:
         top_k (int): Top-k sampling parameter.
         top_p (float): Top-p sampling parameter
     """
-    def __init__(self, model_path=None, local_path=None, projector_local_path=None, stop_words=None, **kwargs):
+    def __init__(self, model_path=None, local_path=None, projector_local_path=None, stop_words=None, device="auto", **kwargs):
         if model_path is None and local_path is None:
             raise ValueError("Either model_path or local_path must be provided.")

@@ -100,7 +100,7 @@ def __init__(self, model_path=None, local_path=None, projector_local_path=None,
         self.projector_path = NEXA_RUN_PROJECTOR_MAP.get(model_path, None)
         self.downloaded_path = local_path
         self.projector_downloaded_path = projector_local_path
-        self.device = None
+        self.device = device

         if self.downloaded_path is not None and self.projector_downloaded_path is not None:
             # when running from local, both path should be provided
@@ -167,8 +167,6 @@ def _load_model(self):
             )
             try:
                 from nexa.gguf.llama.llama import Llama
-
-                self.device = self.params.get("device", "auto")
                 if self.device == "auto" or self.device == "gpu":
                     n_gpu_layers = -1 if is_gpu_available() else 0
                 elif self.device == "cpu":
@@ -417,7 +415,9 @@ def run_streamlit(self, model_path: str, is_local_path = False, hf = False, proj
     kwargs = {k: v for k, v in vars(args).items() if v is not None}
     model_path = kwargs.pop("model_path")
     stop_words = kwargs.pop("stop_words", [])
-    inference = NexaVLMInference(model_path, stop_words=stop_words, **kwargs)
+    device = kwargs.pop("device", "auto")
+
+    inference = NexaVLMInference(model_path, stop_words=stop_words, device=device, **kwargs)
     if args.streamlit:
         inference.run_streamlit(model_path)
     else:
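In both CLI entry points, device is popped out of the parsed arguments before the remaining kwargs are splatted into the constructor. The pop matters: if "device" stayed in kwargs, passing it explicitly and again via **kwargs would raise a TypeError. A minimal demonstration, with a plain dict standing in for vars(args) and a hypothetical ctor:

# Why device is popped before the **kwargs splat (names hypothetical).
def ctor(model_path, stop_words=None, device="auto", **kwargs):
    return device, kwargs

args = {"model_path": "m.gguf", "device": "gpu", "temperature": 0.7}  # stand-in for vars(args)
model_path = args.pop("model_path")
device = args.pop("device", "auto")  # defaults to "auto" when the CLI did not set one

print(ctor(model_path, device=device, **args))  # ('gpu', {'temperature': 0.7})
# Without the pop, the same call would raise:
# TypeError: ctor() got multiple values for keyword argument 'device'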
