refactor device arguments for power benchmark in gguf python interface
Davidqian123 committed Oct 16, 2024
1 parent 304be41 commit 97b0fd3
Showing 4 changed files with 18 additions and 11 deletions.
nexa/eval/nexa_eval.py (2 changes: 1 addition & 1 deletion)
@@ -203,7 +203,7 @@ def run_perf_eval(self, device: str, new_tokens: int):
             memory=True,
             energy=True,
             input_shapes={
-                "batch_size": 1,
+                "batch_size": 1, # TODO: make it dynamic, hardcoded to 1 for now
                 "sequence_length": 256,
                 "vocab_size": 32000,
             },
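The TODO above flags the hardcoded batch size. Below is a minimal sketch of making it caller-supplied instead; make_input_shapes is a hypothetical helper, and only the dict keys and values come from the diff.

# Hedged sketch of the TODO above: build input_shapes with a caller-supplied
# batch size instead of the hardcoded 1. make_input_shapes is hypothetical.
def make_input_shapes(batch_size: int = 1) -> dict:
    return {
        "batch_size": batch_size,   # no longer hardcoded to 1
        "sequence_length": 256,     # prompt length used for the perf run
        "vocab_size": 32000,        # llama-family vocabulary size
    }

print(make_input_shapes(4))  # {'batch_size': 4, 'sequence_length': 256, 'vocab_size': 32000}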
nexa/eval/nexa_perf/nexa_backend.py (9 changes: 8 additions & 1 deletion)
@@ -122,7 +122,14 @@ def load_model(self) -> None:
         """
         Load the model from the given model path (normally GGUF, GGML)
         """
-        nexa_model = NexaTextInference(model_path=self.config.model)
+        # TODO: add mps (Apple Metal) support; currently can't benchmark the mps device accurately for energy
+        if self.config.device == "cuda" or self.config.device == "mps":
+            nexa_model = NexaTextInference(model_path=self.config.model, device="gpu", **self.config.model_kwargs)
+        elif self.config.device == "cpu":
+            nexa_model = NexaTextInference(model_path=self.config.model, device="cpu", **self.config.model_kwargs)
+        else:
+            raise ValueError(f"Invalid device: {self.config.device}")
+
         self.pretrained_model = nexa_model.model

     def prefill(self, inputs: Dict[str, Any], kwargs: Dict[str, Any]) -> list[int]:
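The new branching collapses the benchmark's device names onto the two values the GGUF interface accepts: "cuda" and "mps" both map to "gpu", "cpu" passes through, and anything else is rejected. A standalone sketch of that mapping, with resolve_inference_device as a hypothetical name:

# Standalone sketch of the mapping applied in load_model above;
# resolve_inference_device is a hypothetical name.
def resolve_inference_device(benchmark_device: str) -> str:
    if benchmark_device in ("cuda", "mps"):
        return "gpu"  # CUDA and Apple Metal both route to the generic GPU backend
    if benchmark_device == "cpu":
        return "cpu"
    raise ValueError(f"Invalid device: {benchmark_device}")

assert resolve_inference_device("mps") == "gpu"
assert resolve_inference_device("cpu") == "cpu"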
nexa/gguf/nexa_inference_text.py (8 changes: 4 additions & 4 deletions)
@@ -44,14 +44,14 @@ class NexaTextInference:
         top_k (int): Top-k sampling parameter.
         top_p (float): Top-p sampling parameter
     """
-    def __init__(self, model_path=None, local_path=None, stop_words=None, **kwargs):
+    def __init__(self, model_path=None, local_path=None, stop_words=None, device="auto", **kwargs):
         if model_path is None and local_path is None:
             raise ValueError("Either model_path or local_path must be provided.")

         self.params = DEFAULT_TEXT_GEN_PARAMS
         self.params.update(kwargs)
         self.model = None
-        self.device = None
+        self.device = device

         self.model_path = model_path
         self.downloaded_path = local_path
@@ -108,7 +108,6 @@ def _load_model(self):
         with suppress_stdout_stderr():
             from nexa.gguf.llama.llama import Llama
             try:
-                self.device = self.params.get("device", "auto")
                 if self.device == "auto" or self.device == "gpu":
                     n_gpu_layers = -1 if is_gpu_available() else 0
                 elif self.device == "cpu":
@@ -377,8 +376,9 @@ def run_streamlit(self, model_path: str, is_local_path = False, hf = False):
     kwargs = {k: v for k, v in vars(args).items() if v is not None}
     model_path = kwargs.pop("model_path")
     stop_words = kwargs.pop("stop_words", [])
+    device = kwargs.pop("device", "auto")

-    inference = NexaTextInference(model_path, stop_words=stop_words, **kwargs)
+    inference = NexaTextInference(model_path, stop_words=stop_words, device=device, **kwargs)
     if args.streamlit:
         inference.run_streamlit(model_path)
     else:
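With device now an explicit constructor argument rather than a value fished out of self.params inside _load_model, callers can pin the device directly. A usage sketch, where the model path is a placeholder:

from nexa.gguf.nexa_inference_text import NexaTextInference

# "some-model.gguf" is a placeholder; device accepts "auto", "gpu", or "cpu"
# per the branching in _load_model. "auto" and "gpu" offload all layers when a
# GPU is available (n_gpu_layers=-1) and fall back to CPU otherwise.
inference = NexaTextInference(model_path="some-model.gguf", device="cpu")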
nexa/gguf/nexa_inference_vlm.py (10 changes: 5 additions & 5 deletions)
@@ -89,7 +89,7 @@ class NexaVLMInference:
         top_k (int): Top-k sampling parameter.
         top_p (float): Top-p sampling parameter
     """
-    def __init__(self, model_path=None, local_path=None, projector_local_path=None, stop_words=None, **kwargs):
+    def __init__(self, model_path=None, local_path=None, projector_local_path=None, stop_words=None, device="auto", **kwargs):
         if model_path is None and local_path is None:
             raise ValueError("Either model_path or local_path must be provided.")

@@ -100,7 +100,7 @@ def __init__(self, model_path=None, local_path=None, projector_local_path=None,
         self.projector_path = NEXA_RUN_PROJECTOR_MAP.get(model_path, None)
         self.downloaded_path = local_path
         self.projector_downloaded_path = projector_local_path
-        self.device = None
+        self.device = device

         if self.downloaded_path is not None and self.projector_downloaded_path is not None:
             # when running from local, both path should be provided
@@ -167,8 +167,6 @@ def _load_model(self):
             )
             try:
                 from nexa.gguf.llama.llama import Llama
-
-                self.device = self.params.get("device", "auto")
                 if self.device == "auto" or self.device == "gpu":
                     n_gpu_layers = -1 if is_gpu_available() else 0
                 elif self.device == "cpu":
@@ -417,7 +415,9 @@ def run_streamlit(self, model_path: str, is_local_path = False, hf = False, proj
     kwargs = {k: v for k, v in vars(args).items() if v is not None}
     model_path = kwargs.pop("model_path")
     stop_words = kwargs.pop("stop_words", [])
-    inference = NexaVLMInference(model_path, stop_words=stop_words, **kwargs)
+    device = kwargs.pop("device", "auto")
+
+    inference = NexaVLMInference(model_path, stop_words=stop_words, device=device, **kwargs)
     if args.streamlit:
         inference.run_streamlit(model_path)
     else:
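In both CLI entry points, device is popped out of the parsed arguments before the remaining kwargs are splatted into the constructor. The pop matters: if "device" stayed in kwargs, passing it explicitly and again via **kwargs would raise a TypeError. A minimal demonstration, with a plain dict standing in for vars(args) and a hypothetical ctor:

# Why device is popped before the **kwargs splat (names hypothetical).
def ctor(model_path, stop_words=None, device="auto", **kwargs):
    return device, kwargs

args = {"model_path": "m.gguf", "device": "gpu", "temperature": 0.7}  # stand-in for vars(args)
model_path = args.pop("model_path")
device = args.pop("device", "auto")  # defaults to "auto" when the CLI did not set one

print(ctor(model_path, device=device, **args))  # ('gpu', {'temperature': 0.7})
# Without the pop, the same call would raise:
# TypeError: ctor() got multiple values for keyword argument 'device'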
