0.3.5 - add more logging and increase default memory limit
Benjoyo committed on May 6, 2024
1 parent 9cb513e · commit 082c77b
Showing 3 changed files with 5 additions and 2 deletions.
2 changes: 1 addition & 1 deletion bpm_ai_inference/daemon.py
@@ -38,7 +38,7 @@
     host=os.getenv('DAEMON_HOST', '0.0.0.0'),
     port=int(os.getenv('DAEMON_PORT', 6666)),
     instance_strategy=os.getenv('INSTANCE_STRATEGY', 'memory_limit'),
-    max_memory=int(os.getenv('SOFT_MEMORY_LIMIT', 8_589_934_592))
+    max_memory=int(os.getenv('SOFT_MEMORY_LIMIT', 16_000_000_000))
 )

 for c in remote_classes:
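The new default roughly doubles the soft limit: 8_589_934_592 bytes is exactly 8 GiB, while 16_000_000_000 decimal bytes is about 14.9 GiB. A minimal sketch of how the override resolves, assuming the daemon reads the variable as a plain byte count (the surrounding serve() call is not shown in the diff):

    import os

    # Override the soft memory limit back to 8 GiB before starting the daemon.
    # Left unset, the new default of 16_000_000_000 bytes (~14.9 GiB) applies.
    os.environ["SOFT_MEMORY_LIMIT"] = "8589934592"

    max_memory = int(os.getenv("SOFT_MEMORY_LIMIT", 16_000_000_000))
    print(max_memory)  # 8589934592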
2 changes: 1 addition & 1 deletion bpm_ai_inference/llm/llama_cpp/llama_chat.py
@@ -46,7 +46,7 @@ def __init__(
     filename: str = DEFAULT_QUANT_BALANCED,
     temperature: float = DEFAULT_TEMPERATURE,
     max_retries: int = DEFAULT_MAX_RETRIES,
-    force_offline: bool = os.getenv(FORCE_OFFLINE_FLAG, False)
+    force_offline: bool = (os.getenv(FORCE_OFFLINE_FLAG, "false").lower() == "true")
 ):
     if not has_llama_cpp_python:
         raise ImportError('llama-cpp-python is not installed')
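This change fixes a classic env-flag bug: os.getenv returns strings, so any non-empty value, including the string "false", was truthy when bound to force_offline. A minimal demo of the old vs. new behavior ("FORCE_OFFLINE" stands in for whatever name the FORCE_OFFLINE_FLAG constant holds):

    import os

    os.environ["FORCE_OFFLINE"] = "false"

    buggy = os.getenv("FORCE_OFFLINE", False)                      # returns the string "false"
    fixed = os.getenv("FORCE_OFFLINE", "false").lower() == "true"  # parses it properly

    print(bool(buggy), fixed)  # True False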
3 changes: 3 additions & 0 deletions bpm_ai_inference/util/optimum.py
@@ -39,6 +39,8 @@ def _holisticon_onnx_repository_id(model_name: str) -> str:


 def get_optimized_model(model: str, task: str, optimization_level: int = None, push_to_hub: bool = False):
+    logger.info(f"Loading model {model}...")
+
     model_name = model
     model_dir = hf_home() + "/onnx/" + model.replace("/", "--")
     tokenizer = AutoTokenizer.from_pretrained(model)
@@ -123,6 +125,7 @@ def _export_to_onnx(repository_id: str, model_dir, task):

 @timer
 def _optimize(repository_id: str, model_dir, task, push_to_hub=False):
+    logger.info(f"Optimizing model {repository_id}...")
     model_class = _task_to_model(task)

     # try to load from hub or cache
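Both new messages go through the module-level logger. Assuming it is a standard library logging.Logger (its definition is outside the diff), enabling INFO output is enough to see them:

    import logging

    # INFO level surfaces the new "Loading model ..." and
    # "Optimizing model ..." messages added in this commit.
    logging.basicConfig(level=logging.INFO)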
