diff --git a/bpm_ai_inference/daemon.py b/bpm_ai_inference/daemon.py
index 2afe312..142c509 100644
--- a/bpm_ai_inference/daemon.py
+++ b/bpm_ai_inference/daemon.py
@@ -38,7 +38,7 @@
     host=os.getenv('DAEMON_HOST', '0.0.0.0'),
     port=int(os.getenv('DAEMON_PORT', 6666)),
     instance_strategy=os.getenv('INSTANCE_STRATEGY', 'memory_limit'),
-    max_memory=int(os.getenv('SOFT_MEMORY_LIMIT', 8_589_934_592))
+    max_memory=int(os.getenv('SOFT_MEMORY_LIMIT', 16_000_000_000))
 )
 
 for c in remote_classes:
diff --git a/bpm_ai_inference/llm/llama_cpp/llama_chat.py b/bpm_ai_inference/llm/llama_cpp/llama_chat.py
index 148ab0a..1090105 100644
--- a/bpm_ai_inference/llm/llama_cpp/llama_chat.py
+++ b/bpm_ai_inference/llm/llama_cpp/llama_chat.py
@@ -46,7 +46,7 @@ def __init__(
         filename: str = DEFAULT_QUANT_BALANCED,
         temperature: float = DEFAULT_TEMPERATURE,
         max_retries: int = DEFAULT_MAX_RETRIES,
-        force_offline: bool = os.getenv(FORCE_OFFLINE_FLAG, False)
+        force_offline: bool = (os.getenv(FORCE_OFFLINE_FLAG, "false").lower() == "true")
     ):
         if not has_llama_cpp_python:
             raise ImportError('llama-cpp-python is not installed')
diff --git a/bpm_ai_inference/util/optimum.py b/bpm_ai_inference/util/optimum.py
index 7e2bfe1..3feb90b 100644
--- a/bpm_ai_inference/util/optimum.py
+++ b/bpm_ai_inference/util/optimum.py
@@ -39,6 +39,8 @@ def _holisticon_onnx_repository_id(model_name: str) -> str:
 
 
 def get_optimized_model(model: str, task: str, optimization_level: int = None, push_to_hub: bool = False):
+    logger.info(f"Loading model {model}...")
+
     model_name = model
     model_dir = hf_home() + "/onnx/" + model.replace("/", "--")
     tokenizer = AutoTokenizer.from_pretrained(model)
@@ -123,6 +125,7 @@ def _export_to_onnx(repository_id: str, model_dir, task):
 
 
 @timer
 def _optimize(repository_id: str, model_dir, task, push_to_hub=False):
+    logger.info(f"Optimizing model {repository_id}...")
     model_class = _task_to_model(task)
     # try to load from hub or cache
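
Note on the llama_chat.py change, as a minimal sketch rather than code from this repository: os.getenv returns a string (or the default), so the previous default of os.getenv(FORCE_OFFLINE_FLAG, False) produced a truthy value for any non-empty setting, including "false"; the new default compares the lowercased string against "true". The env_flag helper and the "BPM_AI_FORCE_OFFLINE" variable name below are illustrative assumptions, not part of the patch:

import os

def env_flag(name: str, default: bool = False) -> bool:
    # Parse an environment variable as a boolean flag.
    # Accept common truthy spellings; anything else is False.
    raw = os.getenv(name)
    if raw is None:
        return default
    return raw.strip().lower() in ("1", "true", "yes", "on")

# Example: with BPM_AI_FORCE_OFFLINE=false this returns False,
# whereas a plain os.getenv(..., False) would have returned the truthy string "false".
force_offline = env_flag("BPM_AI_FORCE_OFFLINE")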