fix github issue and printing logging #63

Merged: 1 commit, Aug 29, 2024
11 changes: 11 additions & 0 deletions nexa/constants.py
@@ -103,6 +103,17 @@

 NEXA_RUN_MODEL_MAP_FUNCTION_CALLING = {
     "llama2-function-calling": "Llama2-7b-function-calling:q4_K_M",
+    "Llama2-7b-function-calling:fp16": "Llama2-7b-function-calling:fp16",
+    "Llama2-7b-function-calling:q2_K": "Llama2-7b-function-calling:q2_K",
+    "Llama2-7b-function-calling:q3_K_L": "Llama2-7b-function-calling:q3_K_L",
+    "Llama2-7b-function-calling:q3_K_M": "Llama2-7b-function-calling:q3_K_M",
+    "Llama2-7b-function-calling:q3_K_S": "Llama2-7b-function-calling:q3_K_S",
+    "Llama2-7b-function-calling:q4_K_M": "Llama2-7b-function-calling:q4_K_M",
+    "Llama2-7b-function-calling:q4_K_S": "Llama2-7b-function-calling:q4_K_S",
+    "Llama2-7b-function-calling:q5_K_M": "Llama2-7b-function-calling:q5_K_M",
+    "Llama2-7b-function-calling:q5_K_S": "Llama2-7b-function-calling:q5_K_S",
+    "Llama2-7b-function-calling:q6_K": "Llama2-7b-function-calling:q6_K",
+    "Llama2-7b-function-calling:q8_0": "Llama2-7b-function-calling:q8_0",
 }


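Context for the constants.py change: the PR adds an explicit entry for every quantization of the function-calling model, so a fully qualified tag is recognized just like the short alias. Below is a minimal, standalone sketch of the effect; the map is trimmed and `select_chat_format` is an illustrative helper, not a function from the nexa codebase.

```python
# Standalone sketch; the map is trimmed and select_chat_format() is an
# illustrative helper, not a function from the nexa codebase.
NEXA_RUN_MODEL_MAP_FUNCTION_CALLING = {
    "llama2-function-calling": "Llama2-7b-function-calling:q4_K_M",
    "Llama2-7b-function-calling:q4_K_M": "Llama2-7b-function-calling:q4_K_M",
    "Llama2-7b-function-calling:q8_0": "Llama2-7b-function-calling:q8_0",
    # ...the remaining quantization tags added by this PR follow the same pattern
}

def select_chat_format(model_path):
    # Mirrors the membership check in load_model(): any function-calling tag
    # switches the server to the chatml-function-calling chat template.
    if model_path in NEXA_RUN_MODEL_MAP_FUNCTION_CALLING:
        return "chatml-function-calling"
    return None

print(select_chat_format("Llama2-7b-function-calling:q8_0"))  # chatml-function-calling
print(select_chat_format("some-other-model:q4_0"))            # None
```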
2 changes: 1 addition & 1 deletion nexa/gguf/llama/_utils_transformers.py
@@ -17,7 +17,7 @@ class suppress_stdout_stderr(object):
     sys = sys
     os = os
 
-    def __init__(self, disable: bool = True):
+    def __init__(self, disable: bool = False):
         self.disable = disable
 
     # Oddly enough this works better than the contextlib version
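Context for the _utils_transformers.py change: with the default flipped to `disable=False`, the suppression context manager is active unless a caller opts out, which matches the PR's goal of quieting stray prints and logging. The sketch below re-implements the idea with `contextlib` only to show what the default controls; it is not the class from the diff, which redirects output at a lower level.

```python
# Standalone re-implementation of the idea behind suppress_stdout_stderr, only
# to illustrate the new default; the real class in _utils_transformers.py works
# at a lower level ("better than the contextlib version" per its own comment).
import os
import sys
from contextlib import contextmanager

@contextmanager
def suppress_stdout_stderr(disable: bool = False):
    if disable:            # disable=True: leave output untouched
        yield
        return
    with open(os.devnull, "w") as devnull:
        saved_out, saved_err = sys.stdout, sys.stderr
        sys.stdout, sys.stderr = devnull, devnull
        try:
            yield          # disable=False (the new default): output is swallowed
        finally:
            sys.stdout, sys.stderr = saved_out, saved_err

with suppress_stdout_stderr():
    print("hidden under the new default")
with suppress_stdout_stderr(disable=True):
    print("still printed when suppression is explicitly disabled")
```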
106 changes: 51 additions & 55 deletions nexa/gguf/server/nexa_service.py
@@ -62,68 +62,64 @@ class GenerationRequest(BaseModel):

 async def load_model():
     global model, chat_format, completion_template, model_path
-    if model_path in NEXA_RUN_MODEL_MAP_TEXT:
-        chat_format = NEXA_RUN_CHAT_TEMPLATE_MAP.get(model_path, None)
-        completion_template = NEXA_RUN_COMPLETION_TEMPLATE_MAP.get(model_path, None)
-        model_path = NEXA_RUN_MODEL_MAP_TEXT.get(model_path)
-        downloaded_path, run_type = pull_model(model_path)
-        with suppress_stdout_stderr():
-            try:
-                model = Llama(
-                    model_path=downloaded_path,
-                    verbose=False,
-                    chat_format=chat_format,
-                    n_gpu_layers=-1 if is_gpu_available() else 0,
-                )
-            except Exception as e:
-                logging.error(
-                    f"Failed to load model: {e}. Falling back to CPU.", exc_info=True
-                )
-                model = Llama(
-                    model_path=downloaded_path,
-                    verbose=False,
-                    chat_format=chat_format,
-                    n_gpu_layers=0,  # hardcode to use CPU
-                )
-        logging.info(f"model loaded as {model}")
-    elif model_path in NEXA_RUN_MODEL_MAP_FUNCTION_CALLING:
-        chat_format = "chatml-function-calling"
-        model_path = NEXA_RUN_MODEL_MAP_FUNCTION_CALLING.get(model_path)
-        downloaded_path, run_type = pull_model(model_path)
-        with suppress_stdout_stderr():
-            try:
-                model = Llama(
-                    model_path=downloaded_path,
-                    verbose=False,
-                    chat_format=chat_format,
-                    n_gpu_layers=-1 if is_gpu_available() else 0,
-                )
-            except Exception as e:
-                logging.error(
-                    f"Failed to load model: {e}. Falling back to CPU.", exc_info=True
-                )
-                model = Llama(
-                    model_path=downloaded_path,
-                    verbose=False,
-                    chat_format=chat_format,
-                    n_gpu_layers=0,  # hardcode to use CPU
-                )
-
-        logging.info(f"model loaded as {model}")
-    elif model_path in NEXA_RUN_MODEL_MAP_IMAGE:
-        downloaded_path, run_type = pull_model(model_path)
+    downloaded_path, run_type = pull_model(model_path)
+
+    if run_type == "NLP":
+        if model_path in NEXA_RUN_MODEL_MAP_FUNCTION_CALLING:
+            chat_format = "chatml-function-calling"
+            with suppress_stdout_stderr():
+                try:
+                    model = Llama(
+                        model_path=downloaded_path,
+                        verbose=False,
+                        chat_format=chat_format,
+                        n_gpu_layers=-1 if is_gpu_available() else 0,
+                    )
+                except Exception as e:
+                    logging.error(
+                        f"Failed to load model: {e}. Falling back to CPU.", exc_info=True
+                    )
+                    model = Llama(
+                        model_path=downloaded_path,
+                        verbose=False,
+                        chat_format=chat_format,
+                        n_gpu_layers=0,  # hardcode to use CPU
+                    )
+
+            logging.info(f"model loaded as {model}")
+        else:
+            chat_format = NEXA_RUN_CHAT_TEMPLATE_MAP.get(model_path, None)
+            completion_template = NEXA_RUN_COMPLETION_TEMPLATE_MAP.get(model_path, None)
+            with suppress_stdout_stderr():
+                try:
+                    model = Llama(
+                        model_path=downloaded_path,
+                        verbose=False,
+                        chat_format=chat_format,
+                        n_gpu_layers=-1 if is_gpu_available() else 0,
+                    )
+                except Exception as e:
+                    logging.error(
+                        f"Failed to load model: {e}. Falling back to CPU.", exc_info=True
+                    )
+                    model = Llama(
+                        model_path=downloaded_path,
+                        verbose=False,
+                        chat_format=chat_format,
+                        n_gpu_layers=0,  # hardcode to use CPU
+                    )
+            logging.info(f"model loaded as {model}")
+    elif run_type == "Computer Vision":
         with suppress_stdout_stderr():
             model = StableDiffusion(
                 model_path=downloaded_path,
                 wtype=NEXA_RUN_MODEL_PRECISION_MAP.get(
-                    model_path, "default"
+                    model_path, "f32"
                 ),  # Weight type (options: default, f32, f16, q4_0, q4_1, q5_0, q5_1, q8_0)
                 n_threads=multiprocessing.cpu_count(),
             )
         logging.info(f"model loaded as {model}")
-    elif model_path in NEXA_RUN_MODEL_MAP_VOICE:
-        model_path = NEXA_RUN_MODEL_MAP_VOICE.get(model_path)
-        downloaded_path, run_type = pull_model(model_path)
+    elif run_type == "Audio":
         with suppress_stdout_stderr():
             model = WhisperModel(
                 downloaded_path,
@@ -132,7 +128,7 @@ async def load_model():
             )
         logging.info(f"model loaded as {model}")
     else:
-        raise ValueError(f"Model {model_path} not found in NEXA_RUN_MODEL_MAP")
+        raise ValueError(f"Model {model_path} not found in Model Hub")
 
 
 @app.on_event("startup")
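Summary of the nexa_service.py refactor: instead of checking the model path against several NEXA_RUN_MODEL_MAP_* dictionaries and pulling the model inside each branch, load_model() now calls pull_model() once and branches on the returned run_type ("NLP", "Computer Vision", "Audio"). The sketch below shows only that control flow; pull_model_stub(), load_model_sketch(), and the /models/ path are placeholders invented for illustration, not names from the codebase.

```python
# Standalone sketch of the new dispatch in load_model(); pull_model_stub() and
# the returned strings stand in for pull_model(), Llama, StableDiffusion, and
# WhisperModel.
def pull_model_stub(model_path):
    # The real pull_model() downloads the model and reports its run type.
    return f"/models/{model_path}", "NLP"

def load_model_sketch(model_path):
    downloaded_path, run_type = pull_model_stub(model_path)
    if run_type == "NLP":
        # Text models; function-calling tags additionally get the
        # chatml-function-calling chat format (see the diff above).
        return f"text model loaded from {downloaded_path}"
    elif run_type == "Computer Vision":
        return f"image model loaded from {downloaded_path}"
    elif run_type == "Audio":
        return f"speech model loaded from {downloaded_path}"
    raise ValueError(f"Model {model_path} not found in Model Hub")

print(load_model_sketch("Llama2-7b-function-calling:q4_K_M"))
```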
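Each NLP branch in the diff keeps the same GPU-to-CPU fallback: attempt the load with all layers offloaded, and on failure log the error and retry with n_gpu_layers=0. A minimal illustration of that retry pattern follows; FakeLlama and load_with_cpu_fallback are stand-ins invented for the example, not the real llama_cpp Llama class.

```python
# Illustrative sketch of the GPU-to-CPU retry pattern; FakeLlama is a stand-in
# so the example runs without llama_cpp installed.
import logging

class FakeLlama:
    def __init__(self, model_path, n_gpu_layers=0):
        if n_gpu_layers != 0:
            # Simulate a failed GPU load so the fallback path is exercised.
            raise RuntimeError("no GPU backend available")
        self.model_path = model_path
        self.n_gpu_layers = n_gpu_layers

def load_with_cpu_fallback(path, gpu_available):
    try:
        # First attempt: offload all layers when a GPU is believed to be present.
        return FakeLlama(path, n_gpu_layers=-1 if gpu_available else 0)
    except Exception as e:
        logging.error(f"Failed to load model: {e}. Falling back to CPU.", exc_info=True)
        # Second attempt mirrors the diff: hardcode CPU-only loading.
        return FakeLlama(path, n_gpu_layers=0)

model = load_with_cpu_fallback("/models/llama2.gguf", gpu_available=True)
print(model.n_gpu_layers)  # 0 -> the CPU fallback was used
```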